feat(reversal): add negation disambiguation signal
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
7c502f3da0
commit
1e3b86ffdf
7 changed files with 104 additions and 13 deletions
4
go.sum
4
go.sum
|
|
@ -15,14 +15,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
|
|||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
|
||||
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
|
||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
|
||||
golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
|
|
|
|||
|
|
@ -106,6 +106,9 @@ func mergeSignalData(dst *SignalData, src SignalData) {
|
|||
if len(src.VerbInfinitive) > 0 {
|
||||
dst.VerbInfinitive = append(dst.VerbInfinitive, src.VerbInfinitive...)
|
||||
}
|
||||
if len(src.VerbNegation) > 0 {
|
||||
dst.VerbNegation = append(dst.VerbNegation, src.VerbNegation...)
|
||||
}
|
||||
if len(src.Priors) == 0 {
|
||||
return
|
||||
}
|
||||
|
|
@ -139,6 +142,7 @@ func grammarDataHasContent(data *GrammarData) bool {
|
|||
if len(data.Signals.NounDeterminers) > 0 ||
|
||||
len(data.Signals.VerbAuxiliaries) > 0 ||
|
||||
len(data.Signals.VerbInfinitive) > 0 ||
|
||||
len(data.Signals.VerbNegation) > 0 ||
|
||||
len(data.Signals.Priors) > 0 {
|
||||
return true
|
||||
}
|
||||
|
|
|
|||
|
|
@ -192,6 +192,15 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
|
|||
}
|
||||
}
|
||||
}
|
||||
if vn, ok := v["verb_negation"]; ok {
|
||||
if arr, ok := vn.([]any); ok {
|
||||
for _, item := range arr {
|
||||
if s, ok := item.(string); ok {
|
||||
grammar.Signals.VerbNegation = append(grammar.Signals.VerbNegation, core.Lower(s))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if priors, ok := v["prior"].(map[string]any); ok {
|
||||
loadSignalPriors(grammar, priors)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -169,6 +169,7 @@ func TestFlattenWithGrammar(t *testing.T) {
|
|||
"noun": 0.75,
|
||||
},
|
||||
},
|
||||
"verb_negation": []any{"not", "never"},
|
||||
},
|
||||
"article": map[string]any{
|
||||
"indefinite": map[string]any{
|
||||
|
|
@ -243,6 +244,9 @@ func TestFlattenWithGrammar(t *testing.T) {
|
|||
if grammar.Number.ThousandsSep != "," {
|
||||
t.Errorf("number.thousands = %q, want ','", grammar.Number.ThousandsSep)
|
||||
}
|
||||
if len(grammar.Signals.VerbNegation) != 2 || grammar.Signals.VerbNegation[0] != "not" || grammar.Signals.VerbNegation[1] != "never" {
|
||||
t.Errorf("verb negation not extracted: %+v", grammar.Signals.VerbNegation)
|
||||
}
|
||||
|
||||
// Articles extracted
|
||||
if grammar.Articles.IndefiniteDefault != "a" {
|
||||
|
|
@ -291,6 +295,7 @@ func TestMergeGrammarData(t *testing.T) {
|
|||
NounDeterminers: []string{"the"},
|
||||
VerbAuxiliaries: []string{"will"},
|
||||
VerbInfinitive: []string{"to"},
|
||||
VerbNegation: []string{"not"},
|
||||
Priors: map[string]map[string]float64{
|
||||
"run": {
|
||||
"verb": 0.7,
|
||||
|
|
@ -326,6 +331,7 @@ func TestMergeGrammarData(t *testing.T) {
|
|||
NounDeterminers: []string{"a"},
|
||||
VerbAuxiliaries: []string{"can"},
|
||||
VerbInfinitive: []string{"go"},
|
||||
VerbNegation: []string{"never"},
|
||||
Priors: map[string]map[string]float64{
|
||||
"run": {
|
||||
"noun": 0.3,
|
||||
|
|
@ -365,7 +371,7 @@ func TestMergeGrammarData(t *testing.T) {
|
|||
if data.Punct.LabelSuffix != " !" || data.Punct.ProgressSuffix != "..." {
|
||||
t.Errorf("punctuation not merged correctly: %+v", data.Punct)
|
||||
}
|
||||
if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 {
|
||||
if len(data.Signals.NounDeterminers) != 2 || len(data.Signals.VerbAuxiliaries) != 2 || len(data.Signals.VerbInfinitive) != 2 || len(data.Signals.VerbNegation) != 2 {
|
||||
t.Errorf("signal slices not merged correctly: %+v", data.Signals)
|
||||
}
|
||||
if got := data.Signals.Priors["run"]["verb"]; got != 0.7 {
|
||||
|
|
@ -374,6 +380,9 @@ func TestMergeGrammarData(t *testing.T) {
|
|||
if got := data.Signals.Priors["run"]["noun"]; got != 0.3 {
|
||||
t.Errorf("signal priors missing merged value: got %v", got)
|
||||
}
|
||||
if data.Signals.VerbNegation[0] != "not" || data.Signals.VerbNegation[1] != "never" {
|
||||
t.Errorf("signal negation not merged correctly: %+v", data.Signals.VerbNegation)
|
||||
}
|
||||
if data.Number.ThousandsSep != "." || data.Number.DecimalSep != "." || data.Number.PercentFmt != "%s%%" {
|
||||
t.Errorf("number format not merged correctly: %+v", data.Number)
|
||||
}
|
||||
|
|
@ -393,7 +402,8 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) {
|
|||
"signal": {
|
||||
"noun_determiner": ["el"],
|
||||
"verb_auxiliary": ["va"],
|
||||
"verb_infinitive": ["a"]
|
||||
"verb_infinitive": ["a"],
|
||||
"verb_negation": ["no", "nunca"]
|
||||
},
|
||||
"number": { "thousands": ".", "decimal": ",", "percent": "%s %%"}
|
||||
}
|
||||
|
|
@ -419,6 +429,9 @@ func TestNewWithLoader_LoadsGrammarOnlyLocale(t *testing.T) {
|
|||
if len(data.Signals.NounDeterminers) != 1 || data.Signals.NounDeterminers[0] != "el" {
|
||||
t.Errorf("signals not loaded: %+v", data.Signals)
|
||||
}
|
||||
if len(data.Signals.VerbNegation) != 2 || data.Signals.VerbNegation[0] != "no" || data.Signals.VerbNegation[1] != "nunca" {
|
||||
t.Errorf("negation signal not loaded: %+v", data.Signals.VerbNegation)
|
||||
}
|
||||
if data.Number.DecimalSep != "," || data.Number.ThousandsSep != "." {
|
||||
t.Errorf("number format not loaded: %+v", data.Number)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -99,6 +99,7 @@ type Tokeniser struct {
|
|||
nounDet map[string]bool // signal: noun determiners
|
||||
verbAux map[string]bool // signal: verb auxiliaries
|
||||
verbInf map[string]bool // signal: infinitive markers
|
||||
verbNeg map[string]bool // signal: negation cues
|
||||
withSignals bool // allocate SignalBreakdown on ambiguous tokens
|
||||
weights map[string]float64 // signal weights (F3: configurable)
|
||||
}
|
||||
|
|
@ -112,7 +113,7 @@ func WithSignals() TokeniserOption {
|
|||
}
|
||||
|
||||
// WithWeights overrides the default signal weights for disambiguation.
|
||||
// All 7 signal keys must be present; omitted keys silently disable those signals.
|
||||
// All signal keys must be present; omitted keys silently disable those signals.
|
||||
func WithWeights(w map[string]float64) TokeniserOption {
|
||||
return func(t *Tokeniser) { t.weights = w }
|
||||
}
|
||||
|
|
@ -521,6 +522,7 @@ func (t *Tokeniser) buildSignalIndex() {
|
|||
t.nounDet = make(map[string]bool)
|
||||
t.verbAux = make(map[string]bool)
|
||||
t.verbInf = make(map[string]bool)
|
||||
t.verbNeg = make(map[string]bool)
|
||||
|
||||
data := i18n.GetGrammarData(t.lang)
|
||||
|
||||
|
|
@ -558,6 +560,18 @@ func (t *Tokeniser) buildSignalIndex() {
|
|||
} else {
|
||||
t.verbInf["to"] = true
|
||||
}
|
||||
|
||||
if data != nil && len(data.Signals.VerbNegation) > 0 {
|
||||
for _, w := range data.Signals.VerbNegation {
|
||||
t.verbNeg[core.Lower(w)] = true
|
||||
}
|
||||
} else {
|
||||
// Keep the fallback conservative: these are weak cues, not hard
|
||||
// negation parsing.
|
||||
for _, w := range []string{"not", "never"} {
|
||||
t.verbNeg[w] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func defaultVerbAuxiliaries() []string {
|
||||
|
|
@ -577,6 +591,7 @@ func defaultWeights() map[string]float64 {
|
|||
return map[string]float64{
|
||||
"noun_determiner": 0.35,
|
||||
"verb_auxiliary": 0.25,
|
||||
"verb_negation": 0.05,
|
||||
"following_class": 0.15,
|
||||
"sentence_position": 0.10,
|
||||
"verb_saturation": 0.10,
|
||||
|
|
@ -976,7 +991,7 @@ func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
|
|||
}
|
||||
}
|
||||
|
||||
// scoreAmbiguous evaluates 7 weighted signals to determine whether an
|
||||
// scoreAmbiguous evaluates 8 weighted signals to determine whether an
|
||||
// ambiguous token should be classified as verb or noun.
|
||||
func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, []SignalComponent) {
|
||||
var verbScore, nounScore float64
|
||||
|
|
@ -1010,7 +1025,25 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
}
|
||||
}
|
||||
|
||||
// 3. following_class: next token's class informs this token's role
|
||||
// 3. verb_negation: preceding negation weakly signals a verb
|
||||
if w, ok := t.weights["verb_negation"]; ok && idx > 0 {
|
||||
prev := tokens[idx-1]
|
||||
if t.verbNeg[prev.Lower] || t.hasNoLongerBefore(tokens, idx) {
|
||||
verbScore += w * 1.0
|
||||
if t.withSignals {
|
||||
reason := "preceded by '" + prev.Lower + "'"
|
||||
if t.hasNoLongerBefore(tokens, idx) {
|
||||
reason = "preceded by 'no longer'"
|
||||
}
|
||||
components = append(components, SignalComponent{
|
||||
Name: "verb_negation", Weight: w, Value: 1.0, Contrib: w,
|
||||
Reason: reason,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4. following_class: next token's class informs this token's role
|
||||
if w, ok := t.weights["following_class"]; ok && idx+1 < len(tokens) {
|
||||
next := tokens[idx+1]
|
||||
if next.Type != tokenAmbiguous {
|
||||
|
|
@ -1036,7 +1069,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
}
|
||||
}
|
||||
|
||||
// 4. sentence_position: first token in sentence → verb signal (imperative)
|
||||
// 5. sentence_position: first token in sentence → verb signal (imperative)
|
||||
if w, ok := t.weights["sentence_position"]; ok && idx == 0 {
|
||||
verbScore += w * 1.0
|
||||
if t.withSignals {
|
||||
|
|
@ -1047,7 +1080,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
}
|
||||
}
|
||||
|
||||
// 5. verb_saturation: if a confident verb already exists in the same clause
|
||||
// 6. verb_saturation: if a confident verb already exists in the same clause
|
||||
if w, ok := t.weights["verb_saturation"]; ok {
|
||||
if t.hasConfidentVerbInClause(tokens, idx) {
|
||||
nounScore += w * 1.0
|
||||
|
|
@ -1060,7 +1093,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
}
|
||||
}
|
||||
|
||||
// 6. inflection_echo: another token shares the same base in inflected form
|
||||
// 7. inflection_echo: another token shares the same base in inflected form
|
||||
if w, ok := t.weights["inflection_echo"]; ok {
|
||||
echoVerb, echoNoun := t.checkInflectionEcho(tokens, idx)
|
||||
if echoNoun {
|
||||
|
|
@ -1085,7 +1118,7 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
}
|
||||
}
|
||||
|
||||
// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
|
||||
// 8. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
|
||||
if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
|
||||
verbScore += priorVerb
|
||||
nounScore += priorNoun
|
||||
|
|
@ -1114,6 +1147,13 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
return verbScore, nounScore, components
|
||||
}
|
||||
|
||||
func (t *Tokeniser) hasNoLongerBefore(tokens []Token, idx int) bool {
|
||||
if idx < 2 {
|
||||
return false
|
||||
}
|
||||
return tokens[idx-2].Lower == "no" && tokens[idx-1].Lower == "longer"
|
||||
}
|
||||
|
||||
func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
|
||||
data := i18n.GetGrammarData(t.lang)
|
||||
if data == nil || len(data.Signals.Priors) == 0 {
|
||||
|
|
|
|||
|
|
@ -764,6 +764,34 @@ func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Disambiguate_NegationSignal(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser(WithSignals())
|
||||
|
||||
tokens := tok.Tokenise("no longer commit the changes")
|
||||
if len(tokens) < 3 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want at least 3", "no longer commit the changes", len(tokens))
|
||||
}
|
||||
|
||||
commitTok := tokens[2]
|
||||
if commitTok.Type != TokenVerb {
|
||||
t.Fatalf("'commit' after 'no longer': Type = %v, want TokenVerb", commitTok.Type)
|
||||
}
|
||||
if commitTok.Signals == nil {
|
||||
t.Fatal("'commit' after 'no longer' should have signal breakdown")
|
||||
}
|
||||
foundNegation := false
|
||||
for _, component := range commitTok.Signals.Components {
|
||||
if component.Name == "verb_negation" {
|
||||
foundNegation = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundNegation {
|
||||
t.Error("verb_negation signal should have fired for 'no longer commit'")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_WithSignals_Breakdown(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser(WithSignals())
|
||||
|
|
|
|||
1
types.go
1
types.go
|
|
@ -228,6 +228,7 @@ type SignalData struct {
|
|||
NounDeterminers []string // Words that precede nouns: "the", "a", "this", "my", ...
|
||||
VerbAuxiliaries []string // Auxiliaries/modals before verbs: "is", "was", "will", ...
|
||||
VerbInfinitive []string // Infinitive markers: "to"
|
||||
VerbNegation []string // Negation cues that weakly signal a verb: "not", "never", ...
|
||||
Priors map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue