[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #27
5 changed files with 119 additions and 3 deletions
30
loader.go
30
loader.go
|
|
@ -189,6 +189,12 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
|
|||
}
|
||||
}
|
||||
}
|
||||
if priors, ok := v["prior"].(map[string]any); ok {
|
||||
loadSignalPriors(grammar, priors)
|
||||
}
|
||||
if priors, ok := v["priors"].(map[string]any); ok {
|
||||
loadSignalPriors(grammar, priors)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
|
|
@ -299,3 +305,27 @@ func isPluralObject(m map[string]any) bool {
|
|||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func loadSignalPriors(grammar *GrammarData, priors map[string]any) {
|
||||
if grammar == nil || len(priors) == 0 {
|
||||
return
|
||||
}
|
||||
if grammar.Signals.Priors == nil {
|
||||
grammar.Signals.Priors = make(map[string]map[string]float64, len(priors))
|
||||
}
|
||||
for word, raw := range priors {
|
||||
bucket, ok := raw.(map[string]any)
|
||||
if !ok || len(bucket) == 0 {
|
||||
continue
|
||||
}
|
||||
key := core.Lower(word)
|
||||
if grammar.Signals.Priors[key] == nil {
|
||||
grammar.Signals.Priors[key] = make(map[string]float64, len(bucket))
|
||||
}
|
||||
for role, value := range bucket {
|
||||
if score := toFloat64(value); score != 0 {
|
||||
grammar.Signals.Priors[key][core.Lower(role)] = score
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -156,6 +156,14 @@ func TestFlattenWithGrammar(t *testing.T) {
|
|||
"decimal": ".",
|
||||
"percent": "%s%%",
|
||||
},
|
||||
"signal": map[string]any{
|
||||
"prior": map[string]any{
|
||||
"commit": map[string]any{
|
||||
"verb": 0.25,
|
||||
"noun": 0.75,
|
||||
},
|
||||
},
|
||||
},
|
||||
"article": map[string]any{
|
||||
"indefinite": map[string]any{
|
||||
"default": "a",
|
||||
|
|
@ -480,6 +488,11 @@ func TestCustomFSLoader(t *testing.T) {
|
|||
"draft": { "base": "draft", "past": "drafted", "gerund": "drafting" },
|
||||
"zap": { "base": "zap", "past": "zapped", "gerund": "zapping" }
|
||||
},
|
||||
"signal": {
|
||||
"priors": {
|
||||
"draft": { "verb": 0.6, "noun": 0.4 }
|
||||
}
|
||||
},
|
||||
"word": {
|
||||
"hello": "Hello"
|
||||
}
|
||||
|
|
@ -510,4 +523,7 @@ func TestCustomFSLoader(t *testing.T) {
|
|||
if v, ok := gd.Verbs["draft"]; !ok || v.Past != "drafted" {
|
||||
t.Errorf("verb base override 'draft' not loaded correctly")
|
||||
}
|
||||
if gd.Signals.Priors["draft"]["verb"] != 0.6 || gd.Signals.Priors["draft"]["noun"] != 0.4 {
|
||||
t.Errorf("signal priors not loaded correctly: %+v", gd.Signals.Priors["draft"])
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -942,8 +942,23 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
}
|
||||
}
|
||||
|
||||
// 7. default_prior: always fires as verb signal
|
||||
if w, ok := t.weights["default_prior"]; ok {
|
||||
// 7. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
|
||||
if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
|
||||
verbScore += priorVerb
|
||||
nounScore += priorNoun
|
||||
if t.withSignals {
|
||||
components = append(components, SignalComponent{
|
||||
Name: "default_prior", Weight: 1.0, Value: priorVerb, Contrib: priorVerb,
|
||||
Reason: "corpus-derived prior",
|
||||
})
|
||||
if priorNoun > 0 {
|
||||
components = append(components, SignalComponent{
|
||||
Name: "default_prior", Weight: 1.0, Value: priorNoun, Contrib: priorNoun,
|
||||
Reason: "corpus-derived prior",
|
||||
})
|
||||
}
|
||||
}
|
||||
} else if w, ok := t.weights["default_prior"]; ok {
|
||||
verbScore += w * 1.0
|
||||
if t.withSignals {
|
||||
components = append(components, SignalComponent{
|
||||
|
|
@ -956,6 +971,24 @@ func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, [
|
|||
return verbScore, nounScore, components
|
||||
}
|
||||
|
||||
func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
|
||||
data := i18n.GetGrammarData(t.lang)
|
||||
if data == nil || len(data.Signals.Priors) == 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
bucket, ok := data.Signals.Priors[core.Lower(word)]
|
||||
if !ok || len(bucket) == 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
verb := bucket["verb"]
|
||||
noun := bucket["noun"]
|
||||
total := verb + noun
|
||||
if total <= 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
return verb / total, noun / total, true
|
||||
}
|
||||
|
||||
// hasConfidentVerbInClause scans for a confident verb (Confidence >= 1.0)
|
||||
// within the same clause as the token at idx. Clause boundaries are
|
||||
// punctuation tokens and clause-boundary conjunctions/subordinators (D2).
|
||||
|
|
|
|||
|
|
@ -491,6 +491,43 @@ func TestTokeniser_WithSignals(t *testing.T) {
|
|||
_ = tok // verify it compiles and accepts the option
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise_CorpusPriorBias(t *testing.T) {
|
||||
const lang = "zz-prior"
|
||||
original := i18n.GetGrammarData(lang)
|
||||
t.Cleanup(func() {
|
||||
i18n.SetGrammarData(lang, original)
|
||||
})
|
||||
|
||||
i18n.SetGrammarData(lang, &i18n.GrammarData{
|
||||
Verbs: map[string]i18n.VerbForms{
|
||||
"commit": {Past: "committed", Gerund: "committing"},
|
||||
},
|
||||
Nouns: map[string]i18n.NounForms{
|
||||
"commit": {One: "commit", Other: "commits"},
|
||||
},
|
||||
Signals: i18n.SignalData{
|
||||
Priors: map[string]map[string]float64{
|
||||
"commit": {
|
||||
"verb": 0.2,
|
||||
"noun": 0.8,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
tok := NewTokeniserForLang(lang)
|
||||
tokens := tok.Tokenise("please commit")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "please commit", len(tokens))
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenNoun", "please commit", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Confidence <= 0.5 {
|
||||
t.Fatalf("Tokenise(%q)[1].Confidence = %f, want > 0.5", "please commit", tokens[1].Confidence)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_DualClassDetection(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
|
|
|||
2
types.go
2
types.go
|
|
@ -228,7 +228,7 @@ type SignalData struct {
|
|||
NounDeterminers []string // Words that precede nouns: "the", "a", "this", "my", ...
|
||||
VerbAuxiliaries []string // Auxiliaries/modals before verbs: "is", "was", "will", ...
|
||||
VerbInfinitive []string // Infinitive markers: "to"
|
||||
Priors map[string]map[string]float64 // Reserved for Phase 2: corpus-derived per-word priors. Not yet loaded.
|
||||
Priors map[string]map[string]float64 // Corpus-derived verb/noun priors for ambiguous words.
|
||||
}
|
||||
|
||||
// --- Number Formatting ---
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue