[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #27

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-01 07:25:11 +00:00
5 changed files with 119 additions and 3 deletions

View file

@ -189,6 +189,12 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
}
}
}
if priors, ok := v["prior"].(map[string]any); ok {
loadSignalPriors(grammar, priors)
}
if priors, ok := v["priors"].(map[string]any); ok {
loadSignalPriors(grammar, priors)
}
continue
}
@ -299,3 +305,27 @@ func isPluralObject(m map[string]any) bool {
}
return true
}
// loadSignalPriors merges per-word role scores from a decoded "prior" /
// "priors" object into grammar.Signals.Priors.
//
// Word and role names are lower-cased with core.Lower before being stored.
// An entry is ignored unless its value is a non-empty map[string]any, and a
// role is written only when toFloat64 yields a non-zero score. A score that
// is written replaces any value already stored for the same word and role;
// nothing is ever removed. A nil grammar or an empty priors map is a no-op.
func loadSignalPriors(grammar *GrammarData, priors map[string]any) {
	if grammar == nil {
		return
	}
	if len(priors) == 0 {
		return
	}

	// Allocate the outer table lazily so repeated calls accumulate into it.
	table := grammar.Signals.Priors
	if table == nil {
		table = make(map[string]map[string]float64, len(priors))
		grammar.Signals.Priors = table
	}

	for rawWord, entry := range priors {
		roles, isMap := entry.(map[string]any)
		if !isMap || len(roles) == 0 {
			continue
		}

		word := core.Lower(rawWord)
		scores := table[word]
		if scores == nil {
			scores = make(map[string]float64, len(roles))
			table[word] = scores
		}

		for rawRole, rawScore := range roles {
			score := toFloat64(rawScore)
			if score == 0 {
				// Zero scores are never stored.
				continue
			}
			scores[core.Lower(rawRole)] = score
		}
	}
}

View file

@ -156,6 +156,14 @@ func TestFlattenWithGrammar(t *testing.T) {
"decimal": ".",
"percent": "%s%%",
},
"signal": map[string]any{
"prior": map[string]any{
"commit": map[string]any{
"verb": 0.25,
"noun": 0.75,
},
},
},
"article": map[string]any{
"indefinite": map[string]any{
"default": "a",
@ -480,6 +488,11 @@ func TestCustomFSLoader(t *testing.T) {
"draft": { "base": "draft", "past": "drafted", "gerund": "drafting" },
"zap": { "base": "zap", "past": "zapped", "gerund": "zapping" }
},
"signal": {
"priors": {
"draft": { "verb": 0.6, "noun": 0.4 }
}
},
"word": {
"hello": "Hello"
}
@ -510,4 +523,7 @@ func TestCustomFSLoader(t *testing.T) {
if v, ok := gd.Verbs["draft"]; !ok || v.Past != "drafted" {
t.Errorf("verb base override 'draft' not loaded correctly")
}
if gd.Signals.Priors["draft"]["verb"] != 0.6 || gd.Signals.Priors["draft"]["noun"] != 0.4 {
t.Errorf("signal priors not loaded correctly: %+v", gd.Signals.Priors["draft"])
}
}

View file

@ -942,8 +942,23 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
}
}
// 7. default_prior: always fires as verb signal
if w, ok := t.weights["default_prior"]; ok {
// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
verbScore += priorVerb
nounScore += priorNoun
if t.withSignals {
components = append(components, SignalComponent{
Name: "default_prior", Weight: 1.0, Value: priorVerb, Contrib: priorVerb,
Reason: "corpus-derived prior",
})
if priorNoun > 0 {
components = append(components, SignalComponent{
Name: "default_prior", Weight: 1.0, Value: priorNoun, Contrib: priorNoun,
Reason: "corpus-derived prior",
})
}
}
} else if w, ok := t.weights["default_prior"]; ok {
verbScore += w * 1.0
if t.withSignals {
components = append(components, SignalComponent{
@ -956,6 +971,24 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
return verbScore, nounScore, components
}
// corpusPrior looks up the corpus-derived verb/noun prior for word in the
// grammar data registered for the tokeniser's language.
//
// The word is lower-cased with core.Lower before the lookup. On success it
// returns the "verb" and "noun" weights normalised so that they sum to 1,
// followed by true. It returns (0, 0, false) when no grammar data or priors
// are available, when the word has no prior bucket, or when the bucket holds
// no usable positive weight; the caller is then expected to fall back to its
// static default prior.
func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
	data := i18n.GetGrammarData(t.lang)
	if data == nil || len(data.Signals.Priors) == 0 {
		return 0, 0, false
	}
	bucket, ok := data.Signals.Priors[core.Lower(word)]
	if !ok || len(bucket) == 0 {
		return 0, 0, false
	}

	verb := bucket["verb"]
	noun := bucket["noun"]

	// A negative weight is malformed data. Left as-is it can still pass the
	// total check (e.g. verb=-1, noun=3) and normalise to values outside
	// [0, 1], so treat it as "no evidence" for that role instead.
	if verb < 0 {
		verb = 0
	}
	if noun < 0 {
		noun = 0
	}

	total := verb + noun
	// Written as !(total > 0) rather than total <= 0 so that a NaN weight,
	// which compares false against everything, is rejected as well.
	if !(total > 0) {
		return 0, 0, false
	}
	return verb / total, noun / total, true
}
// hasConfidentVerbInClause scans for a confident verb (Confidence >= 1.0)
// within the same clause as the token at idx. Clause boundaries are
// punctuation tokens and clause-boundary conjunctions/subordinators (D2).

View file

@ -491,6 +491,43 @@ func TestTokeniser_WithSignals(t *testing.T) {
_ = tok // verify it compiles and accepts the option
}
// TestTokeniser_Tokenise_CorpusPriorBias registers grammar data in which
// "commit" is both a verb and a noun with a noun-heavy prior (0.2 / 0.8) and
// checks that tokenising "please commit" classifies "commit" as a noun with
// confidence above 0.5.
func TestTokeniser_Tokenise_CorpusPriorBias(t *testing.T) {
	const (
		lang  = "zz-prior"
		input = "please commit"
	)

	// Put back whatever was registered for this language once the test ends.
	previous := i18n.GetGrammarData(lang)
	t.Cleanup(func() { i18n.SetGrammarData(lang, previous) })

	grammar := &i18n.GrammarData{
		Verbs: map[string]i18n.VerbForms{
			"commit": {Past: "committed", Gerund: "committing"},
		},
		Nouns: map[string]i18n.NounForms{
			"commit": {One: "commit", Other: "commits"},
		},
		Signals: i18n.SignalData{
			Priors: map[string]map[string]float64{
				"commit": {"verb": 0.2, "noun": 0.8},
			},
		},
	}
	i18n.SetGrammarData(lang, grammar)

	got := NewTokeniserForLang(lang).Tokenise(input)
	if n := len(got); n != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", input, n)
	}

	// The second token is the ambiguous word under test.
	target := got[1]
	if target.Type != TokenNoun {
		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenNoun", input, target.Type)
	}
	if target.Confidence <= 0.5 {
		t.Fatalf("Tokenise(%q)[1].Confidence = %f, want > 0.5", input, target.Confidence)
	}
}
func TestTokeniser_DualClassDetection(t *testing.T) {
setup(t)
tok := NewTokeniser()

View file

@ -228,7 +228,7 @@ type SignalData struct {
NounDeterminers []string // Words that precede nouns: "the", "a", "this", "my", ...
VerbAuxiliaries []string // Auxiliaries/modals before verbs: "is", "was", "will", ...
VerbInfinitive []string // Infinitive markers: "to"
Priors map[string]map[string]float64 // Reserved for Phase 2: corpus-derived per-word priors. Not yet loaded.
Priors map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
}
// --- Number Formatting ---