diff --git a/go.sum b/go.sum index 46b541d..bc16ba9 100644 --- a/go.sum +++ b/go.sum @@ -15,14 +15,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= -golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= -golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/grammar.go b/grammar.go index 8fbe230..889a15d 100644 --- a/grammar.go +++ b/grammar.go @@ -106,6 +106,9 @@ func mergeSignalData(dst *SignalData, src SignalData) { if len(src.VerbInfinitive) > 0 { dst.VerbInfinitive = append(dst.VerbInfinitive, src.VerbInfinitive...) } + if len(src.VerbNegation) > 0 { + dst.VerbNegation = append(dst.VerbNegation, src.VerbNegation...) + } if len(src.Priors) == 0 { return } @@ -139,6 +142,7 @@ func grammarDataHasContent(data *GrammarData) bool { if len(data.Signals.NounDeterminers) > 0 || len(data.Signals.VerbAuxiliaries) > 0 || len(data.Signals.VerbInfinitive) > 0 || + len(data.Signals.VerbNegation) > 0 || len(data.Signals.Priors) > 0 { return true } diff --git a/loader.go b/loader.go index fc6336d..b6f0f29 100644 --- a/loader.go +++ b/loader.go @@ -192,6 +192,15 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa } } } + if vn, ok := v["verb_negation"]; ok { + if arr, ok := vn.([]any); ok { + for _, item := range arr { + if s, ok := item.(string); ok { + grammar.Signals.VerbNegation = append(grammar.Signals.VerbNegation, core.Lower(s)) + } + } + } + } if priors, ok := v["prior"].(map[string]any); ok { loadSignalPriors(grammar, priors) } diff --git a/loader_test.go b/loader_test.go index c2d3d86..be4a570 100644 --- a/loader_test.go +++ b/loader_test.go @@ -169,6 +169,7 @@ func TestFlattenWithGrammar(t *testing.T) { "noun": 0.75, }, }, + "verb_negation": []any{"not", "never"}, }, "article": map[string]any{ "indefinite": map[string]any{ @@ -243,6 +244,9 @@ func TestFlattenWithGrammar(t *testing.T) { if grammar.Number.ThousandsSep != "," { t.Errorf("number.thousands = %q, want ','", grammar.Number.ThousandsSep) } + if len(grammar.Signals.VerbNegation) != 2 || grammar.Signals.VerbNegation[0] != "not" || grammar.Signals.VerbNegation[1] != "never" { + t.Errorf("verb negation not extracted: %+v", grammar.Signals.VerbNegation) + } // Articles extracted if grammar.Articles.IndefiniteDefault != "a" { @@ -291,6 +295,7 @@ func TestMergeGrammarData(t *testing.T) { NounDeterminers: []string{"the"}, VerbAuxiliaries: []string{"will"}, VerbInfinitive: []string{"to"}, + VerbNegation: []string{"not"}, Priors: map[string]map[string]float64{ "run": { "verb": 0.7, @@ -326,6 +331,7 @@ func TestMergeGrammarData(t *testing.T) { NounDeterminers: []string{"a"}, VerbAuxiliaries: []string{"can"}, VerbInfinitive: []string{"go"}, + VerbNegation: []string{"never"}, Priors: map[string]map[string]float64{ "run": { "noun": 0.3, @@ -365,7 +371,7 @@ func TestMergeGrammarData(t *testing.T) { if data.Punct.LabelSuffix != " !" || data.Punct.ProgressSuffix != "..." { t.Errorf("punctuation not merged correctly: %+v", data.Punct) } - if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 { + if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 || len(data.Signals.VerbNegation) != 2 { t.Errorf("signal slices not merged correctly: %+v", data.Signals) } if got := data.Signals.Priors["run"]["verb"]; got != 0.7 { @@ -374,6 +380,9 @@ func TestMergeGrammarData(t *testing.T) { if got := data.Signals.Priors["run"]["noun"]; got != 0.3 { t.Errorf("signal priors missing merged value: got %v", got) } + if data.Signals.VerbNegation[0] != "not" || data.Signals.VerbNegation[1] != "never" { + t.Errorf("signal negation not merged correctly: %+v", data.Signals.VerbNegation) + } if data.Number.ThousandsSep != "." || data.Number.DecimalSep != "." || data.Number.PercentFmt != "%s%%" { t.Errorf("number format not merged correctly: %+v", data.Number) } @@ -393,7 +402,8 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) { "signal": { "noun_determiner": ["el"], "verb_auxiliary": ["va"], - "verb_infinitive": ["a"] + "verb_infinitive": ["a"], + "verb_negation": ["no", "nunca"] }, "number": { "thousands": ".", "decimal": ",", "percent": "%s %%"} } @@ -419,6 +429,9 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) { if len(data.Signals.NounDeterminers) != 1 || data.Signals.NounDeterminers[0] != "el" { t.Errorf("signals not loaded: %+v", data.Signals) } + if len(data.Signals.VerbNegation) != 2 || data.Signals.VerbNegation[0] != "no" || data.Signals.VerbNegation[1] != "nunca" { + t.Errorf("negation signal not loaded: %+v", data.Signals.VerbNegation) + } if data.Number.DecimalSep != "," || data.Number.ThousandsSep != "." { t.Errorf("number format not loaded: %+v", data.Number) } diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 12308eb..13e2729 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -99,6 +99,7 @@ type Tokeniser struct { nounDet map[string]bool // signal: noun determiners verbAux map[string]bool // signal: verb auxiliaries verbInf map[string]bool // signal: infinitive markers + verbNeg map[string]bool // signal: negation cues withSignals bool // allocate SignalBreakdown on ambiguous tokens weights map[string]float64 // signal weights (F3: configurable) } @@ -112,7 +113,7 @@ func WithSignals() TokeniserOption { } // WithWeights overrides the default signal weights for disambiguation. -// All 7 signal keys must be present; omitted keys silently disable those signals. +// All signal keys must be present; omitted keys silently disable those signals. func WithWeights(w map[string]float64) TokeniserOption { return func(t *Tokeniser) { t.weights = w } } @@ -521,6 +522,7 @@ func (t *Tokeniser) buildSignalIndex() { t.nounDet = make(map[string]bool) t.verbAux = make(map[string]bool) t.verbInf = make(map[string]bool) + t.verbNeg = make(map[string]bool) data := i18n.GetGrammarData(t.lang) @@ -558,6 +560,18 @@ func (t *Tokeniser) buildSignalIndex() { } else { t.verbInf["to"] = true } + + if data != nil && len(data.Signals.VerbNegation) > 0 { + for _, w := range data.Signals.VerbNegation { + t.verbNeg[core.Lower(w)] = true + } + } else { + // Keep the fallback conservative: these are weak cues, not hard + // negation parsing. + for _, w := range []string{"not", "never"} { + t.verbNeg[w] = true + } + } } func defaultVerbAuxiliaries() []string { @@ -577,6 +591,7 @@ func defaultWeights() map[string]float64 { return map[string]float64{ "noun_determiner": 0.35, "verb_auxiliary": 0.25, + "verb_negation": 0.05, "following_class": 0.15, "sentence_position": 0.10, "verb_saturation": 0.10, @@ -976,7 +991,7 @@ func (t *Tokeniser) resolveAmbiguous(tokens []Token) { } } -// scoreAmbiguous evaluates 7 weighted signals to determine whether an +// scoreAmbiguous evaluates 8 weighted signals to determine whether an // ambiguous token should be classified as verb or noun. func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, []SignalComponent) { var verbScore, nounScore float64 @@ -1010,7 +1025,25 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [ } } - // 3. following_class: next token's class informs this token's role + // 3. verb_negation: preceding negation weakly signals a verb + if w, ok := t.weights["verb_negation"]; ok && idx > 0 { + prev := tokens[idx-1] + if t.verbNeg[prev.Lower] || t.hasNoLongerBefore(tokens, idx) { + verbScore += w * 1.0 + if t.withSignals { + reason := "preceded by '" + prev.Lower + "'" + if t.hasNoLongerBefore(tokens, idx) { + reason = "preceded by 'no longer'" + } + components = append(components, SignalComponent{ + Name: "verb_negation", Weight: w, Value: 1.0, Contrib: w, + Reason: reason, + }) + } + } + } + + // 4. following_class: next token's class informs this token's role if w, ok := t.weights["following_class"]; ok && idx+1 < len(tokens) { next := tokens[idx+1] if next.Type != tokenAmbiguous { @@ -1036,7 +1069,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [ } } - // 4. sentence_position: first token in sentence → verb signal (imperative) + // 5. sentence_position: first token in sentence → verb signal (imperative) if w, ok := t.weights["sentence_position"]; ok && idx == 0 { verbScore += w * 1.0 if t.withSignals { @@ -1047,7 +1080,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [ } } - // 5. verb_saturation: if a confident verb already exists in the same clause + // 6. verb_saturation: if a confident verb already exists in the same clause if w, ok := t.weights["verb_saturation"]; ok { if t.hasConfidentVerbInClause(tokens, idx) { nounScore += w * 1.0 @@ -1060,7 +1093,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [ } } - // 6. inflection_echo: another token shares the same base in inflected form + // 7. inflection_echo: another token shares the same base in inflected form if w, ok := t.weights["inflection_echo"]; ok { echoVerb, echoNoun := t.checkInflectionEcho(tokens, idx) if echoNoun { @@ -1085,7 +1118,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [ } } - // 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior. + // 8. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior. if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok { verbScore += priorVerb nounScore += priorNoun @@ -1114,6 +1147,13 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [ return verbScore, nounScore, components } +func (t *Tokeniser) hasNoLongerBefore(tokens []Token, idx int) bool { + if idx < 2 { + return false + } + return tokens[idx-2].Lower == "no" && tokens[idx-1].Lower == "longer" +} + func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) { data := i18n.GetGrammarData(t.lang) if data == nil || len(data.Signals.Priors) == 0 { diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index a47d2a3..39afe22 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -764,6 +764,34 @@ func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) { } } +func TestTokeniser_Disambiguate_NegationSignal(t *testing.T) { + setup(t) + tok := NewTokeniser(WithSignals()) + + tokens := tok.Tokenise("no longer commit the changes") + if len(tokens) < 3 { + t.Fatalf("Tokenise(%q) returned %d tokens, want at least 3", "no longer commit the changes", len(tokens)) + } + + commitTok := tokens[2] + if commitTok.Type != TokenVerb { + t.Fatalf("'commit' after 'no longer': Type = %v, want TokenVerb", commitTok.Type) + } + if commitTok.Signals == nil { + t.Fatal("'commit' after 'no longer' should have signal breakdown") + } + foundNegation := false + for _, component := range commitTok.Signals.Components { + if component.Name == "verb_negation" { + foundNegation = true + break + } + } + if !foundNegation { + t.Error("verb_negation signal should have fired for 'no longer commit'") + } +} + func TestTokeniser_WithSignals_Breakdown(t *testing.T) { setup(t) tok := NewTokeniser(WithSignals()) diff --git a/types.go b/types.go index dffcbec..fd627e7 100644 --- a/types.go +++ b/types.go @@ -228,6 +228,7 @@ type SignalData struct { NounDeterminers []string // Words that precede nouns: "the", "a", "this", "my", ... VerbAuxiliaries []string // Auxiliaries/modals before verbs: "is", "was", "will", ... VerbInfinitive []string // Infinitive markers: "to" + VerbNegation []string // Negation cues that weakly signal a verb: "not", "never", ... Priors map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words. }