feat(reversal): add negation disambiguation signal
All checks were successful
Security Scan / security (push) Successful in 14s
Test / test (push) Successful in 1m30s

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Virgil 2026-04-02 00:53:16 +00:00
parent 7c502f3da0
commit 1e3b86ffdf
7 changed files with 104 additions and 13 deletions

4
go.sum
View file

@ -15,14 +15,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=

View file

@ -106,6 +106,9 @@ func mergeSignalData(dst *SignalData, src SignalData) {
if len(src.VerbInfinitive) > 0 {
dst.VerbInfinitive = append(dst.VerbInfinitive, src.VerbInfinitive...)
}
if len(src.VerbNegation) > 0 {
dst.VerbNegation = append(dst.VerbNegation, src.VerbNegation...)
}
if len(src.Priors) == 0 {
return
}
@ -139,6 +142,7 @@ func grammarDataHasContent(data *GrammarData) bool {
if len(data.Signals.NounDeterminers) > 0 ||
len(data.Signals.VerbAuxiliaries) > 0 ||
len(data.Signals.VerbInfinitive) > 0 ||
len(data.Signals.VerbNegation) > 0 ||
len(data.Signals.Priors) > 0 {
return true
}

View file

@ -192,6 +192,15 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
}
}
}
if vn, ok := v["verb_negation"]; ok {
if arr, ok := vn.([]any); ok {
for _, item := range arr {
if s, ok := item.(string); ok {
grammar.Signals.VerbNegation = append(grammar.Signals.VerbNegation, core.Lower(s))
}
}
}
}
if priors, ok := v["prior"].(map[string]any); ok {
loadSignalPriors(grammar, priors)
}

View file

@ -169,6 +169,7 @@ func TestFlattenWithGrammar(t *testing.T) {
"noun": 0.75,
},
},
"verb_negation": []any{"not", "never"},
},
"article": map[string]any{
"indefinite": map[string]any{
@ -243,6 +244,9 @@ func TestFlattenWithGrammar(t *testing.T) {
if grammar.Number.ThousandsSep != "," {
t.Errorf("number.thousands = %q, want ','", grammar.Number.ThousandsSep)
}
if len(grammar.Signals.VerbNegation) != 2 || grammar.Signals.VerbNegation[0] != "not" || grammar.Signals.VerbNegation[1] != "never" {
t.Errorf("verb negation not extracted: %+v", grammar.Signals.VerbNegation)
}
// Articles extracted
if grammar.Articles.IndefiniteDefault != "a" {
@ -291,6 +295,7 @@ func TestMergeGrammarData(t *testing.T) {
NounDeterminers: []string{"the"},
VerbAuxiliaries: []string{"will"},
VerbInfinitive: []string{"to"},
VerbNegation: []string{"not"},
Priors: map[string]map[string]float64{
"run": {
"verb": 0.7,
@ -326,6 +331,7 @@ func TestMergeGrammarData(t *testing.T) {
NounDeterminers: []string{"a"},
VerbAuxiliaries: []string{"can"},
VerbInfinitive: []string{"go"},
VerbNegation: []string{"never"},
Priors: map[string]map[string]float64{
"run": {
"noun": 0.3,
@ -365,7 +371,7 @@ func TestMergeGrammarData(t *testing.T) {
if data.Punct.LabelSuffix != " !" || data.Punct.ProgressSuffix != "..." {
t.Errorf("punctuation not merged correctly: %+v", data.Punct)
}
if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 {
if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 || len(data.Signals.VerbNegation) != 2 {
t.Errorf("signal slices not merged correctly: %+v", data.Signals)
}
if got := data.Signals.Priors["run"]["verb"]; got != 0.7 {
@ -374,6 +380,9 @@ func TestMergeGrammarData(t *testing.T) {
if got := data.Signals.Priors["run"]["noun"]; got != 0.3 {
t.Errorf("signal priors missing merged value: got %v", got)
}
if data.Signals.VerbNegation[0] != "not" || data.Signals.VerbNegation[1] != "never" {
t.Errorf("signal negation not merged correctly: %+v", data.Signals.VerbNegation)
}
if data.Number.ThousandsSep != "." || data.Number.DecimalSep != "." || data.Number.PercentFmt != "%s%%" {
t.Errorf("number format not merged correctly: %+v", data.Number)
}
@ -393,7 +402,8 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) {
"signal": {
"noun_determiner": ["el"],
"verb_auxiliary": ["va"],
"verb_infinitive": ["a"]
"verb_infinitive": ["a"],
"verb_negation": ["no", "nunca"]
},
"number": { "thousands": ".", "decimal": ",", "percent": "%s %%"}
}
@ -419,6 +429,9 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) {
if len(data.Signals.NounDeterminers) != 1 || data.Signals.NounDeterminers[0] != "el" {
t.Errorf("signals not loaded: %+v", data.Signals)
}
if len(data.Signals.VerbNegation) != 2 || data.Signals.VerbNegation[0] != "no" || data.Signals.VerbNegation[1] != "nunca" {
t.Errorf("negation signal not loaded: %+v", data.Signals.VerbNegation)
}
if data.Number.DecimalSep != "," || data.Number.ThousandsSep != "." {
t.Errorf("number format not loaded: %+v", data.Number)
}

View file

@ -99,6 +99,7 @@ type Tokeniser struct {
nounDet map[string]bool // signal: noun determiners
verbAux map[string]bool // signal: verb auxiliaries
verbInf map[string]bool // signal: infinitive markers
verbNeg map[string]bool // signal: negation cues
withSignals bool // allocate SignalBreakdown on ambiguous tokens
weights map[string]float64 // signal weights (F3: configurable)
}
@ -112,7 +113,7 @@ func WithSignals() TokeniserOption {
}
// WithWeights overrides the default signal weights for disambiguation.
// All 7 signal keys must be present; omitted keys silently disable those signals.
// Any signal key omitted from w is silently disabled; include every key you want active.
func WithWeights(w map[string]float64) TokeniserOption {
	// Stash the caller-supplied weight table on the tokeniser; lookups in
	// scoreAmbiguous consult this map per signal name.
	return func(tk *Tokeniser) {
		tk.weights = w
	}
}
@ -521,6 +522,7 @@ func (t *Tokeniser) buildSignalIndex() {
t.nounDet = make(map[string]bool)
t.verbAux = make(map[string]bool)
t.verbInf = make(map[string]bool)
t.verbNeg = make(map[string]bool)
data := i18n.GetGrammarData(t.lang)
@ -558,6 +560,18 @@ func (t *Tokeniser) buildSignalIndex() {
} else {
t.verbInf["to"] = true
}
if data != nil && len(data.Signals.VerbNegation) > 0 {
for _, w := range data.Signals.VerbNegation {
t.verbNeg[core.Lower(w)] = true
}
} else {
// Keep the fallback conservative: these are weak cues, not hard
// negation parsing.
for _, w := range []string{"not", "never"} {
t.verbNeg[w] = true
}
}
}
func defaultVerbAuxiliaries() []string {
@ -577,6 +591,7 @@ func defaultWeights() map[string]float64 {
return map[string]float64{
"noun_determiner": 0.35,
"verb_auxiliary": 0.25,
"verb_negation": 0.05,
"following_class": 0.15,
"sentence_position": 0.10,
"verb_saturation": 0.10,
@ -976,7 +991,7 @@ func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
}
}
// scoreAmbiguous evaluates 7 weighted signals to determine whether an
// scoreAmbiguous evaluates 8 weighted signals to determine whether an
// ambiguous token should be classified as verb or noun.
func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, []SignalComponent) {
var verbScore, nounScore float64
@ -1010,7 +1025,25 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
}
}
// 3. following_class: next token's class informs this token's role
// 3. verb_negation: preceding negation weakly signals a verb
if w, ok := t.weights["verb_negation"]; ok && idx > 0 {
prev := tokens[idx-1]
if t.verbNeg[prev.Lower] || t.hasNoLongerBefore(tokens, idx) {
verbScore += w * 1.0
if t.withSignals {
reason := "preceded by '" + prev.Lower + "'"
if t.hasNoLongerBefore(tokens, idx) {
reason = "preceded by 'no longer'"
}
components = append(components, SignalComponent{
Name: "verb_negation", Weight: w, Value: 1.0, Contrib: w,
Reason: reason,
})
}
}
}
// 4. following_class: next token's class informs this token's role
if w, ok := t.weights["following_class"]; ok && idx+1 < len(tokens) {
next := tokens[idx+1]
if next.Type != tokenAmbiguous {
@ -1036,7 +1069,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
}
}
// 4. sentence_position: first token in sentence → verb signal (imperative)
// 5. sentence_position: first token in sentence → verb signal (imperative)
if w, ok := t.weights["sentence_position"]; ok && idx == 0 {
verbScore += w * 1.0
if t.withSignals {
@ -1047,7 +1080,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
}
}
// 5. verb_saturation: if a confident verb already exists in the same clause
// 6. verb_saturation: if a confident verb already exists in the same clause
if w, ok := t.weights["verb_saturation"]; ok {
if t.hasConfidentVerbInClause(tokens, idx) {
nounScore += w * 1.0
@ -1060,7 +1093,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
}
}
// 6. inflection_echo: another token shares the same base in inflected form
// 7. inflection_echo: another token shares the same base in inflected form
if w, ok := t.weights["inflection_echo"]; ok {
echoVerb, echoNoun := t.checkInflectionEcho(tokens, idx)
if echoNoun {
@ -1085,7 +1118,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
}
}
// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
// 8. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
verbScore += priorVerb
nounScore += priorNoun
@ -1114,6 +1147,13 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
return verbScore, nounScore, components
}
// hasNoLongerBefore reports whether the two tokens immediately preceding idx
// spell out the phrase "no longer" (compared on their lowercased forms).
func (t *Tokeniser) hasNoLongerBefore(tokens []Token, idx int) bool {
	if idx < 2 {
		// Fewer than two preceding tokens: the phrase cannot fit.
		return false
	}
	first, second := tokens[idx-2].Lower, tokens[idx-1].Lower
	return first == "no" && second == "longer"
}
func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
data := i18n.GetGrammarData(t.lang)
if data == nil || len(data.Signals.Priors) == 0 {

View file

@ -764,6 +764,34 @@ func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) {
}
}
// TestTokeniser_Disambiguate_NegationSignal verifies that an ambiguous word
// preceded by the two-token negation "no longer" is resolved to a verb and
// that the verb_negation signal appears in its breakdown.
func TestTokeniser_Disambiguate_NegationSignal(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithSignals())
	tokens := tok.Tokenise("no longer commit the changes")
	if len(tokens) < 3 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 3", "no longer commit the changes", len(tokens))
	}
	// Token 2 is "commit", the ambiguous word following "no longer".
	commitTok := tokens[2]
	if commitTok.Type != TokenVerb {
		t.Fatalf("'commit' after 'no longer': Type = %v, want TokenVerb", commitTok.Type)
	}
	if commitTok.Signals == nil {
		t.Fatal("'commit' after 'no longer' should have signal breakdown")
	}
	var sawNegation bool
	for _, c := range commitTok.Signals.Components {
		if c.Name == "verb_negation" {
			sawNegation = true
			break
		}
	}
	if !sawNegation {
		t.Error("verb_negation signal should have fired for 'no longer commit'")
	}
}
func TestTokeniser_WithSignals_Breakdown(t *testing.T) {
setup(t)
tok := NewTokeniser(WithSignals())

View file

@ -228,6 +228,7 @@ type SignalData struct {
NounDeterminers []string // Words that precede nouns: "the", "a", "this", "my", ...
VerbAuxiliaries []string // Auxiliaries/modals before verbs: "is", "was", "will", ...
VerbInfinitive []string // Infinitive markers: "to"
VerbNegation []string // Negation cues that weakly signal a verb: "not", "never", ...
Priors map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
}