From 460d9e8dd6ea86e11a00f01e7e09c6ad73bcd3dd Mon Sep 17 00:00:00 2001
From: Virgil
Date: Wed, 1 Apr 2026 23:08:59 +0000
Subject: [PATCH] fix(reversal): include contraction auxiliaries in fallback signals

Co-Authored-By: Virgil
---
Review notes (this section is ignored by git am and is not part of the commit):

* The fallback branch stores each entry verbatim (t.verbAux[w] = true), while
  the branch above it applies core.Lower(w). Entries returned by
  defaultVerbAuxiliaries() must therefore stay lowercase.
* Compared with the inline list it replaces, the new list adds "am" and
  fifteen negative contractions. "must", "might" and "shall" are present but
  their contractions ("mustn't", "mightn't", "shan't") are not; confirm this
  is intentional.
* The new test only asserts inside `if token.Lower == "run" && ...`. If
  Tokenise returns no token whose Lower is "run", nothing is checked and the
  test passes. Consider failing explicitly when "run" is not found.
* The test does not call setup(t), unlike the neighbouring
  TestTokeniser_WithSignals_Breakdown; presumably NewTokeniserForLang("zz")
  needs no loaded locale data - confirm.

 reversal/tokeniser.go      | 19 ++++++++++++++-----
 reversal/tokeniser_test.go | 12 ++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index 7ed8189..730e64b 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -546,11 +546,7 @@ func (t *Tokeniser) buildSignalIndex() {
 			t.verbAux[core.Lower(w)] = true
 		}
 	} else {
-		for _, w := range []string{
-			"is", "are", "was", "were", "has", "had", "have",
-			"do", "does", "did", "will", "would", "could", "should",
-			"can", "may", "might", "shall", "must",
-		} {
+		for _, w := range defaultVerbAuxiliaries() {
 			t.verbAux[w] = true
 		}
 	}
@@ -564,6 +560,19 @@ func (t *Tokeniser) buildSignalIndex() {
 	}
 }
 
+func defaultVerbAuxiliaries() []string {
+	return []string{
+		"am", "is", "are", "was", "were",
+		"has", "had", "have",
+		"do", "does", "did",
+		"will", "would", "could", "should",
+		"can", "may", "might", "shall", "must",
+		"don't", "can't", "won't", "shouldn't", "couldn't", "wouldn't",
+		"doesn't", "didn't", "isn't", "aren't", "wasn't", "weren't",
+		"hasn't", "hadn't", "haven't",
+	}
+}
+
 func defaultWeights() map[string]float64 {
 	return map[string]float64{
 		"noun_determiner": 0.35,
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 762041d..44fae39 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -694,6 +694,18 @@ func TestTokeniser_Disambiguate_ContractionAux(t *testing.T) {
 	}
 }
 
+func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) {
+	tok := NewTokeniserForLang("zz")
+	tokens := tok.Tokenise("don't run the tests")
+	// The hardcoded fallback auxiliaries should still recognise contractions
+	// even when no locale grammar data is loaded.
+	for _, token := range tokens {
+		if token.Lower == "run" && token.Type != TokenVerb {
+			t.Errorf("'run' after \"don't\": Type = %v, want TokenVerb", token.Type)
+		}
+	}
+}
+
 func TestTokeniser_WithSignals_Breakdown(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser(WithSignals())