diff --git a/loader.go b/loader.go
index 5260909..96cf45f 100644
--- a/loader.go
+++ b/loader.go
@@ -189,6 +189,12 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
 						}
 					}
 				}
+				if priors, ok := v["prior"].(map[string]any); ok {
+					loadSignalPriors(grammar, priors)
+				}
+				if priors, ok := v["priors"].(map[string]any); ok {
+					loadSignalPriors(grammar, priors)
+				}
 				continue
 			}
 
@@ -299,3 +305,36 @@ func isPluralObject(m map[string]any) bool {
 	}
 	return true
 }
+
+// loadSignalPriors merges per-word role priors from a locale's signal block
+// into grammar.Signals.Priors. priors maps a word to a {role: weight} object;
+// words and roles are stored lower-cased. Only strictly positive weights are
+// kept: a zero adds no information and a negative one would skew any
+// consumer that normalises the weights into shares.
+func loadSignalPriors(grammar *GrammarData, priors map[string]any) {
+	if grammar == nil || len(priors) == 0 {
+		return
+	}
+	if grammar.Signals.Priors == nil {
+		grammar.Signals.Priors = make(map[string]map[string]float64, len(priors))
+	}
+	for word, raw := range priors {
+		bucket, ok := raw.(map[string]any)
+		if !ok {
+			continue
+		}
+		key := core.Lower(word)
+		for role, value := range bucket {
+			score := toFloat64(value)
+			if !(score > 0) { // also rejects NaN
+				continue
+			}
+			// Create the word's bucket lazily so a word whose weights are all
+			// rejected does not leave an empty entry behind.
+			if grammar.Signals.Priors[key] == nil {
+				grammar.Signals.Priors[key] = make(map[string]float64, len(bucket))
+			}
+			grammar.Signals.Priors[key][core.Lower(role)] = score
+		}
+	}
+}
diff --git a/loader_test.go b/loader_test.go
index 6889205..2cfa9bc 100644
--- a/loader_test.go
+++ b/loader_test.go
@@ -156,6 +156,14 @@ func TestFlattenWithGrammar(t *testing.T) {
 				"decimal": ".",
 				"percent": "%s%%",
 			},
+			"signal": map[string]any{
+				"prior": map[string]any{
+					"commit": map[string]any{
+						"verb": 0.25,
+						"noun": 0.75,
+					},
+				},
+			},
 			"article": map[string]any{
 				"indefinite": map[string]any{
 					"default": "a",
@@ -480,6 +488,11 @@ func TestCustomFSLoader(t *testing.T) {
 				"draft": { "base": "draft", "past": "drafted", "gerund": "drafting" },
 				"zap": { "base": "zap", "past": "zapped", "gerund": "zapping" }
 			},
+			"signal": {
+				"priors": {
+					"draft": { "verb": 0.6, "noun": 0.4 }
+				}
+			},
 			"word": {
 				"hello": "Hello"
 			}
@@ -510,4 +523,7 @@ func TestCustomFSLoader(t *testing.T) {
 	if v, ok := gd.Verbs["draft"]; !ok || v.Past != "drafted" {
 		t.Errorf("verb base override 'draft' not loaded correctly")
 	}
+	if gd.Signals.Priors["draft"]["verb"] != 0.6 || gd.Signals.Priors["draft"]["noun"] != 0.4 {
+		t.Errorf("signal priors not loaded correctly: %+v", gd.Signals.Priors["draft"])
+	}
 }
diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index 802d3fc..0f4e39f 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -942,8 +942,23 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}
 
-	// 7. default_prior: always fires as verb signal
-	if w, ok := t.weights["default_prior"]; ok {
+	// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
+	if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
+		verbScore += priorVerb
+		nounScore += priorNoun
+		if t.withSignals {
+			components = append(components, SignalComponent{
+				Name: "default_prior", Weight: 1.0, Value: priorVerb, Contrib: priorVerb,
+				Reason: "corpus-derived verb prior",
+			})
+			if priorNoun > 0 {
+				components = append(components, SignalComponent{
+					Name: "default_prior", Weight: 1.0, Value: priorNoun, Contrib: priorNoun,
+					Reason: "corpus-derived noun prior",
+				})
+			}
+		}
+	} else if w, ok := t.weights["default_prior"]; ok {
 		verbScore += w * 1.0
 		if t.withSignals {
 			components = append(components, SignalComponent{
@@ -956,6 +971,37 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 	return verbScore, nounScore, components
 }
 
+// corpusPrior looks up the corpus-derived prior for word in the grammar data
+// of the tokeniser's language and returns its verb and noun shares, which
+// are normalised to sum to 1. The final result is false when no usable
+// prior exists, so the caller can fall back to the static default prior.
+func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
+	data := i18n.GetGrammarData(t.lang)
+	if data == nil || len(data.Signals.Priors) == 0 {
+		return 0, 0, false
+	}
+	bucket, ok := data.Signals.Priors[core.Lower(word)]
+	if !ok || len(bucket) == 0 {
+		return 0, 0, false
+	}
+	// Grammar data can be registered directly, bypassing the loader, so
+	// clamp negative weights here: they would otherwise produce shares
+	// outside [0, 1] (for example -1 and 2 for weights -0.5 and 1).
+	verb := bucket["verb"]
+	if verb < 0 {
+		verb = 0
+	}
+	noun := bucket["noun"]
+	if noun < 0 {
+		noun = 0
+	}
+	total := verb + noun
+	if !(total > 0) { // also rejects NaN
+		return 0, 0, false
+	}
+	return verb / total, noun / total, true
+}
+
 // hasConfidentVerbInClause scans for a confident verb (Confidence >= 1.0)
 // within the same clause as the token at idx. Clause boundaries are
 // punctuation tokens and clause-boundary conjunctions/subordinators (D2).
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 7b75bab..09cfb99 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -491,6 +491,43 @@ func TestTokeniser_WithSignals(t *testing.T) {
 	_ = tok // verify it compiles and accepts the option
 }
 
+func TestTokeniser_Tokenise_CorpusPriorBias(t *testing.T) {
+	const lang = "zz-prior"
+	original := i18n.GetGrammarData(lang)
+	t.Cleanup(func() {
+		i18n.SetGrammarData(lang, original)
+	})
+
+	i18n.SetGrammarData(lang, &i18n.GrammarData{
+		Verbs: map[string]i18n.VerbForms{
+			"commit": {Past: "committed", Gerund: "committing"},
+		},
+		Nouns: map[string]i18n.NounForms{
+			"commit": {One: "commit", Other: "commits"},
+		},
+		Signals: i18n.SignalData{
+			Priors: map[string]map[string]float64{
+				"commit": {
+					"verb": 0.2,
+					"noun": 0.8,
+				},
+			},
+		},
+	})
+
+	tok := NewTokeniserForLang(lang)
+	tokens := tok.Tokenise("please commit")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "please commit", len(tokens))
+	}
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenNoun", "please commit", tokens[1].Type)
+	}
+	if tokens[1].Confidence <= 0.5 {
+		t.Fatalf("Tokenise(%q)[1].Confidence = %f, want > 0.5", "please commit", tokens[1].Confidence)
+	}
+}
+
 func TestTokeniser_DualClassDetection(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()
diff --git a/types.go b/types.go
index dbe1c66..422f483 100644
--- a/types.go
+++ b/types.go
@@ -228,7 +228,7 @@ type SignalData struct {
 	NounDeterminers []string                      // Words that precede nouns: "the", "a", "this", "my", ...
 	VerbAuxiliaries []string                      // Auxiliaries/modals before verbs: "is", "was", "will", ...
 	VerbInfinitive  []string                      // Infinitive markers: "to"
-	Priors          map[string]map[string]float64 // Reserved for Phase 2: corpus-derived per-word priors. Not yet loaded.
+	Priors          map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
 }
 
 // --- Number Formatting ---