feat(reversal): add negation disambiguation signal

Co-Authored-By: Virgil <virgil@lethean.io>
2026-04-02 00:53:16 +00:00 · 2026-04-02 00:53:16 +00:00 · 1e3b86ffdf
commit 1e3b86ffdf
parent 7c502f3da0
7 changed files with 104 additions and 13 deletions
--- a/go.sum
+++ b/go.sum
@ -15,14 +15,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
-github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
-golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
-golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
 golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
-golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
--- a/grammar.go
+++ b/grammar.go
@ -106,6 +106,9 @@ func mergeSignalData(dst *SignalData, src SignalData) {
 	if len(src.VerbInfinitive) > 0 {
 		dst.VerbInfinitive = append(dst.VerbInfinitive, src.VerbInfinitive...)
 	}
+	if len(src.VerbNegation) > 0 {
+		dst.VerbNegation = append(dst.VerbNegation, src.VerbNegation...)
+	}
 	if len(src.Priors) == 0 {
 		return
 	}
@ -139,6 +142,7 @@ func grammarDataHasContent(data *GrammarData) bool {
 	if len(data.Signals.NounDeterminers) > 0 ||
 		len(data.Signals.VerbAuxiliaries) > 0 ||
 		len(data.Signals.VerbInfinitive) > 0 ||
+		len(data.Signals.VerbNegation) > 0 ||
 		len(data.Signals.Priors) > 0 {
 		return true
 	}
--- a/loader.go
+++ b/loader.go
@ -192,6 +192,15 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
 						}
 					}
 				}
+				if vn, ok := v["verb_negation"]; ok {
+					if arr, ok := vn.([]any); ok {
+						for _, item := range arr {
+							if s, ok := item.(string); ok {
+								grammar.Signals.VerbNegation = append(grammar.Signals.VerbNegation, core.Lower(s))
+							}
+						}
+					}
+				}
 				if priors, ok := v["prior"].(map[string]any); ok {
 					loadSignalPriors(grammar, priors)
 				}
--- a/loader_test.go
+++ b/loader_test.go
@ -169,6 +169,7 @@ func TestFlattenWithGrammar(t *testing.T) {
 						"noun": 0.75,
 					},
 				},
+				"verb_negation": []any{"not", "never"},
 			},
 			"article": map[string]any{
 				"indefinite": map[string]any{
@ -243,6 +244,9 @@ func TestFlattenWithGrammar(t *testing.T) {
 	if grammar.Number.ThousandsSep != "," {
 		t.Errorf("number.thousands = %q, want ','", grammar.Number.ThousandsSep)
 	}
+	if len(grammar.Signals.VerbNegation) != 2 || grammar.Signals.VerbNegation[0] != "not" || grammar.Signals.VerbNegation[1] != "never" {
+		t.Errorf("verb negation not extracted: %+v", grammar.Signals.VerbNegation)
+	}

 	// Articles extracted
 	if grammar.Articles.IndefiniteDefault != "a" {
@ -291,6 +295,7 @@ func TestMergeGrammarData(t *testing.T) {
 			NounDeterminers: []string{"the"},
 			VerbAuxiliaries: []string{"will"},
 			VerbInfinitive:  []string{"to"},
+			VerbNegation:    []string{"not"},
 			Priors: map[string]map[string]float64{
 				"run": {
 					"verb": 0.7,
@ -326,6 +331,7 @@ func TestMergeGrammarData(t *testing.T) {
 			NounDeterminers: []string{"a"},
 			VerbAuxiliaries: []string{"can"},
 			VerbInfinitive:  []string{"go"},
+			VerbNegation:    []string{"never"},
 			Priors: map[string]map[string]float64{
 				"run": {
 					"noun": 0.3,
@ -365,7 +371,7 @@ func TestMergeGrammarData(t *testing.T) {
 	if data.Punct.LabelSuffix != " !" || data.Punct.ProgressSuffix != "..." {
 		t.Errorf("punctuation not merged correctly: %+v", data.Punct)
 	}
-	if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 {
+	if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 || len(data.Signals.VerbNegation) != 2 {
 		t.Errorf("signal slices not merged correctly: %+v", data.Signals)
 	}
 	if got := data.Signals.Priors["run"]["verb"]; got != 0.7 {
@ -374,6 +380,9 @@ func TestMergeGrammarData(t *testing.T) {
 	if got := data.Signals.Priors["run"]["noun"]; got != 0.3 {
 		t.Errorf("signal priors missing merged value: got %v", got)
 	}
+	if data.Signals.VerbNegation[0] != "not" || data.Signals.VerbNegation[1] != "never" {
+		t.Errorf("signal negation not merged correctly: %+v", data.Signals.VerbNegation)
+	}
 	if data.Number.ThousandsSep != "." || data.Number.DecimalSep != "." || data.Number.PercentFmt != "%s%%" {
 		t.Errorf("number format not merged correctly: %+v", data.Number)
 	}
@ -393,7 +402,8 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) {
 					"signal": {
 						"noun_determiner": ["el"],
 						"verb_auxiliary": ["va"],
-						"verb_infinitive": ["a"]
+						"verb_infinitive": ["a"],
+						"verb_negation": ["no", "nunca"]
 					},
 					"number": { "thousands": ".", "decimal": ",", "percent": "%s %%"}
 				}
@ -419,6 +429,9 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) {
 	if len(data.Signals.NounDeterminers) != 1 || data.Signals.NounDeterminers[0] != "el" {
 		t.Errorf("signals not loaded: %+v", data.Signals)
 	}
+	if len(data.Signals.VerbNegation) != 2 || data.Signals.VerbNegation[0] != "no" || data.Signals.VerbNegation[1] != "nunca" {
+		t.Errorf("negation signal not loaded: %+v", data.Signals.VerbNegation)
+	}
 	if data.Number.DecimalSep != "," || data.Number.ThousandsSep != "." {
 		t.Errorf("number format not loaded: %+v", data.Number)
 	}
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@ -99,6 +99,7 @@ type Tokeniser struct {
 	nounDet     map[string]bool    // signal: noun determiners
 	verbAux     map[string]bool    // signal: verb auxiliaries
 	verbInf     map[string]bool    // signal: infinitive markers
+	verbNeg     map[string]bool    // signal: negation cues
 	withSignals bool               // allocate SignalBreakdown on ambiguous tokens
 	weights     map[string]float64 // signal weights (F3: configurable)
 }
@ -112,7 +113,7 @@ func WithSignals() TokeniserOption {
 }

 // WithWeights overrides the default signal weights for disambiguation.
-// All 7 signal keys must be present; omitted keys silently disable those signals.
+// w replaces the weight map outright, so any signal whose key is omitted is silently disabled.
 func WithWeights(w map[string]float64) TokeniserOption {
 	return func(t *Tokeniser) { t.weights = w }
 }
@ -521,6 +522,7 @@ func (t *Tokeniser) buildSignalIndex() {
 	t.nounDet = make(map[string]bool)
 	t.verbAux = make(map[string]bool)
 	t.verbInf = make(map[string]bool)
+	t.verbNeg = make(map[string]bool)

 	data := i18n.GetGrammarData(t.lang)

@ -558,6 +560,18 @@ func (t *Tokeniser) buildSignalIndex() {
 	} else {
 		t.verbInf["to"] = true
 	}
+
+	if data != nil && len(data.Signals.VerbNegation) > 0 {
+		for _, w := range data.Signals.VerbNegation {
+			t.verbNeg[core.Lower(w)] = true
+		}
+	} else {
+		// Keep the fallback conservative: these are weak cues, not hard
+		// negation parsing.
+		for _, w := range []string{"not", "never"} {
+			t.verbNeg[w] = true
+		}
+	}
 }

 func defaultVerbAuxiliaries() []string {
@ -577,6 +591,7 @@ func defaultWeights() map[string]float64 {
 	return map[string]float64{
 		"noun_determiner":   0.35,
 		"verb_auxiliary":    0.25,
+		"verb_negation":     0.05,
 		"following_class":   0.15,
 		"sentence_position": 0.10,
 		"verb_saturation":   0.10,
@ -976,7 +991,7 @@ func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
 	}
 }

-// scoreAmbiguous evaluates 7 weighted signals to determine whether an
+// scoreAmbiguous evaluates 8 weighted signals to determine whether an
 // ambiguous token should be classified as verb or noun.
 func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, []SignalComponent) {
 	var verbScore, nounScore float64
@ -1010,7 +1025,25 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}

-	// 3. following_class: next token's class informs this token's role
+	// 3. verb_negation: preceding negation weakly signals a verb
+	if w, ok := t.weights["verb_negation"]; ok && idx > 0 {
+		prev := tokens[idx-1]
+		if t.verbNeg[prev.Lower] || t.hasNoLongerBefore(tokens, idx) {
+			verbScore += w * 1.0
+			if t.withSignals {
+				reason := "preceded by '" + prev.Lower + "'"
+				if t.hasNoLongerBefore(tokens, idx) {
+					reason = "preceded by 'no longer'"
+				}
+				components = append(components, SignalComponent{
+					Name: "verb_negation", Weight: w, Value: 1.0, Contrib: w,
+					Reason: reason,
+				})
+			}
+		}
+	}
+
+	// 4. following_class: next token's class informs this token's role
 	if w, ok := t.weights["following_class"]; ok && idx+1 < len(tokens) {
 		next := tokens[idx+1]
 		if next.Type != tokenAmbiguous {
@ -1036,7 +1069,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}

-	// 4. sentence_position: first token in sentence → verb signal (imperative)
+	// 5. sentence_position: first token in sentence → verb signal (imperative)
 	if w, ok := t.weights["sentence_position"]; ok && idx == 0 {
 		verbScore += w * 1.0
 		if t.withSignals {
@ -1047,7 +1080,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}

-	// 5. verb_saturation: if a confident verb already exists in the same clause
+	// 6. verb_saturation: if a confident verb already exists in the same clause
 	if w, ok := t.weights["verb_saturation"]; ok {
 		if t.hasConfidentVerbInClause(tokens, idx) {
 			nounScore += w * 1.0
@ -1060,7 +1093,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}

-	// 6. inflection_echo: another token shares the same base in inflected form
+	// 7. inflection_echo: another token shares the same base in inflected form
 	if w, ok := t.weights["inflection_echo"]; ok {
 		echoVerb, echoNoun := t.checkInflectionEcho(tokens, idx)
 		if echoNoun {
@ -1085,7 +1118,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}

-	// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
+	// 8. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
 	if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
 		verbScore += priorVerb
 		nounScore += priorNoun
@ -1114,6 +1147,13 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 	return verbScore, nounScore, components
 }

+// hasNoLongerBefore reports whether tokens[idx] directly follows the pair
+// "no" "longer" (matched on Token.Lower). NOTE(review): the cue is a fixed
+// English literal and t.lang is not consulted here; confirm that is intended.
+func (t *Tokeniser) hasNoLongerBefore(tokens []Token, idx int) bool {
+	return idx >= 2 && tokens[idx-2].Lower == "no" && tokens[idx-1].Lower == "longer"
+}
+
 func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
 	data := i18n.GetGrammarData(t.lang)
 	if data == nil || len(data.Signals.Priors) == 0 {
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@ -764,6 +764,34 @@ func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) {
 	}
 }

+func TestTokeniser_Disambiguate_NegationSignal(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser(WithSignals())
+
+	tokens := tok.Tokenise("no longer commit the changes")
+	if len(tokens) < 3 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 3", "no longer commit the changes", len(tokens))
+	}
+
+	commitTok := tokens[2]
+	if commitTok.Type != TokenVerb {
+		t.Fatalf("'commit' after 'no longer': Type = %v, want TokenVerb", commitTok.Type)
+	}
+	if commitTok.Signals == nil {
+		t.Fatal("'commit' after 'no longer' should have signal breakdown")
+	}
+	foundNegation := false
+	for _, component := range commitTok.Signals.Components {
+		if component.Name == "verb_negation" {
+			foundNegation = true
+			break
+		}
+	}
+	if !foundNegation {
+		t.Error("verb_negation signal should have fired for 'no longer commit'")
+	}
+}
+
 func TestTokeniser_WithSignals_Breakdown(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser(WithSignals())
--- a/types.go
+++ b/types.go
@ -228,6 +228,7 @@ type SignalData struct {
 	NounDeterminers []string                      // Words that precede nouns: "the", "a", "this", "my", ...
 	VerbAuxiliaries []string                      // Auxiliaries/modals before verbs: "is", "was", "will", ...
 	VerbInfinitive  []string                      // Infinitive markers: "to"
+	VerbNegation    []string                      // Negation cues that weakly signal a verb: "not", "never", ...
 	Priors          map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
 }