2026-04-01 07:25:11 +00:00
5 changed files with 119 additions and 3 deletions
--- a/loader.go
+++ b/loader.go
@ -189,6 +189,12 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
 						}
 					}
 				}
+				if priors, ok := v["prior"].(map[string]any); ok {
+					loadSignalPriors(grammar, priors)
+				}
+				if priors, ok := v["priors"].(map[string]any); ok {
+					loadSignalPriors(grammar, priors)
+				}
 				continue
 			}

@ -299,3 +305,27 @@ func isPluralObject(m map[string]any) bool {
 	}
 	return true
 }
+
+// loadSignalPriors merges per-word role priors from a decoded locale document
+// into grammar.Signals.Priors.
+//
+// priors maps a word to a bucket of role -> score, for example
+// {"commit": {"verb": 0.25, "noun": 0.75}}. Word and role keys are lower-cased
+// with core.Lower before being stored. Entries whose value is not a non-empty
+// map are skipped. Scores that toFloat64 converts to zero, a negative number
+// or NaN are dropped. When a word is already present its roles are merged one
+// by one, with later values overwriting earlier ones.
+//
+// A nil grammar or an empty priors map is a no-op.
+func loadSignalPriors(grammar *GrammarData, priors map[string]any) {
+	if grammar == nil || len(priors) == 0 {
+		return
+	}
+	if grammar.Signals.Priors == nil {
+		grammar.Signals.Priors = make(map[string]map[string]float64, len(priors))
+	}
+	for word, raw := range priors {
+		bucket, ok := raw.(map[string]any)
+		if !ok || len(bucket) == 0 {
+			continue
+		}
+		key := core.Lower(word)
+		scores := grammar.Signals.Priors[key]
+		for role, value := range bucket {
+			score := toFloat64(value)
+			// Priors are relative weights: zero carries no information, and a
+			// negative or NaN score would corrupt later normalisation. The
+			// negated comparison is deliberate so that NaN is rejected too.
+			if !(score > 0) {
+				continue
+			}
+			// Allocate lazily so a word whose scores are all unusable does
+			// not leave an empty bucket behind.
+			if scores == nil {
+				scores = make(map[string]float64, len(bucket))
+				grammar.Signals.Priors[key] = scores
+			}
+			scores[core.Lower(role)] = score
+		}
+	}
+}
--- a/loader_test.go
+++ b/loader_test.go
@ -156,6 +156,14 @@ func TestFlattenWithGrammar(t *testing.T) {
 				"decimal":   ".",
 				"percent":   "%s%%",
 			},
+			"signal": map[string]any{
+				"prior": map[string]any{
+					"commit": map[string]any{
+						"verb": 0.25,
+						"noun": 0.75,
+					},
+				},
+			},
 			"article": map[string]any{
 				"indefinite": map[string]any{
 					"default": "a",
@ -480,6 +488,11 @@ func TestCustomFSLoader(t *testing.T) {
 						"draft": { "base": "draft", "past": "drafted", "gerund": "drafting" },
 						"zap": { "base": "zap", "past": "zapped", "gerund": "zapping" }
 					},
+					"signal": {
+						"priors": {
+							"draft": { "verb": 0.6, "noun": 0.4 }
+						}
+					},
 					"word": {
 						"hello": "Hello"
 					}
@ -510,4 +523,7 @@ func TestCustomFSLoader(t *testing.T) {
 	if v, ok := gd.Verbs["draft"]; !ok || v.Past != "drafted" {
 		t.Errorf("verb base override 'draft' not loaded correctly")
 	}
+	if gd.Signals.Priors["draft"]["verb"] != 0.6 || gd.Signals.Priors["draft"]["noun"] != 0.4 {
+		t.Errorf("signal priors not loaded correctly: %+v", gd.Signals.Priors["draft"])
+	}
 }
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@ -942,8 +942,23 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 		}
 	}

-	// 7. default_prior: always fires as verb signal
-	if w, ok := t.weights["default_prior"]; ok {
+	// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
+	if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
+		verbScore += priorVerb
+		nounScore += priorNoun
+		if t.withSignals {
+			components = append(components, SignalComponent{
+				Name: "default_prior", Weight: 1.0, Value: priorVerb, Contrib: priorVerb,
+				Reason: "corpus-derived prior",
+			})
+			if priorNoun > 0 {
+				components = append(components, SignalComponent{
+					Name: "default_prior", Weight: 1.0, Value: priorNoun, Contrib: priorNoun,
+					Reason: "corpus-derived prior",
+				})
+			}
+		}
+	} else if w, ok := t.weights["default_prior"]; ok {
 		verbScore += w * 1.0
 		if t.withSignals {
 			components = append(components, SignalComponent{
@ -956,6 +971,24 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
 	return verbScore, nounScore, components
 }

+// corpusPrior looks up the corpus-derived verb/noun prior for word in the
+// grammar data registered for t.lang.
+//
+// On success it returns the verb share, the noun share (normalised so the two
+// sum to 1) and true. It returns (0, 0, false) when no grammar data or priors
+// are loaded, when the word has no bucket, or when the bucket carries no
+// usable positive weight, so the caller can fall back to another prior.
+func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
+	data := i18n.GetGrammarData(t.lang)
+	if data == nil || len(data.Signals.Priors) == 0 {
+		return 0, 0, false
+	}
+	bucket, ok := data.Signals.Priors[core.Lower(word)]
+	if !ok || len(bucket) == 0 {
+		return 0, 0, false
+	}
+	verb := bucket["verb"]
+	noun := bucket["noun"]
+	// A negative weight is meaningless as a prior; left in place it would
+	// push the normalised shares outside the [0, 1] range.
+	if verb < 0 {
+		verb = 0
+	}
+	if noun < 0 {
+		noun = 0
+	}
+	total := verb + noun
+	// Written as a negated comparison so that a NaN total is rejected as well
+	// as a zero one.
+	if !(total > 0) {
+		return 0, 0, false
+	}
+	return verb / total, noun / total, true
+}
+
 // hasConfidentVerbInClause scans for a confident verb (Confidence >= 1.0)
 // within the same clause as the token at idx. Clause boundaries are
 // punctuation tokens and clause-boundary conjunctions/subordinators (D2).
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@ -491,6 +491,43 @@ func TestTokeniser_WithSignals(t *testing.T) {
 	_ = tok // verify it compiles and accepts the option
 }

+// TestTokeniser_Tokenise_CorpusPriorBias registers grammar data for a
+// throwaway language in which "commit" is listed as both a verb and a noun
+// with a noun-heavy prior (0.2 verb / 0.8 noun). It then checks that
+// "please commit" yields two tokens, the second typed as a noun with
+// confidence above 0.5.
+func TestTokeniser_Tokenise_CorpusPriorBias(t *testing.T) {
+	const (
+		lang  = "zz-prior"
+		input = "please commit"
+	)
+
+	// Put back whatever was registered for this language once the test ends.
+	previous := i18n.GetGrammarData(lang)
+	t.Cleanup(func() { i18n.SetGrammarData(lang, previous) })
+
+	fixture := &i18n.GrammarData{
+		Verbs: map[string]i18n.VerbForms{
+			"commit": {Past: "committed", Gerund: "committing"},
+		},
+		Nouns: map[string]i18n.NounForms{
+			"commit": {One: "commit", Other: "commits"},
+		},
+		Signals: i18n.SignalData{
+			Priors: map[string]map[string]float64{
+				"commit": {"verb": 0.2, "noun": 0.8},
+			},
+		},
+	}
+	i18n.SetGrammarData(lang, fixture)
+
+	got := NewTokeniserForLang(lang).Tokenise(input)
+	if n := len(got); n != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", input, n)
+	}
+
+	second := got[1]
+	if second.Type != TokenNoun {
+		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenNoun", input, second.Type)
+	}
+	if second.Confidence <= 0.5 {
+		t.Fatalf("Tokenise(%q)[1].Confidence = %f, want > 0.5", input, second.Confidence)
+	}
+}
+
 func TestTokeniser_DualClassDetection(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()
--- a/types.go
+++ b/types.go
@ -228,7 +228,7 @@ type SignalData struct {
 	NounDeterminers []string                      // Words that precede nouns: "the", "a", "this", "my", ...
 	VerbAuxiliaries []string                      // Auxiliaries/modals before verbs: "is", "was", "will", ...
 	VerbInfinitive  []string                      // Infinitive markers: "to"
-	Priors          map[string]map[string]float64 // Reserved for Phase 2: corpus-derived per-word priors. Not yet loaded.
+	Priors          map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
 }

 // --- Number Formatting ---