fix(i18n): ignore deprecated grammar entries
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
6758585c36
commit
eeffe92da0
4 changed files with 88 additions and 1 deletion
16
loader.go
16
loader.go
|
|
@ -108,6 +108,9 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
|
|||
case string:
|
||||
if grammar != nil && core.HasPrefix(fullKey, "gram.word.") {
|
||||
wordKey := core.TrimPrefix(fullKey, "gram.word.")
|
||||
if shouldSkipDeprecatedEnglishGrammarEntry(fullKey) {
|
||||
continue
|
||||
}
|
||||
grammar.Words[core.Lower(wordKey)] = v
|
||||
continue
|
||||
}
|
||||
|
|
@ -142,6 +145,9 @@ func flattenWithGrammar(prefix string, data map[string]any, out map[string]Messa
|
|||
if after, ok := strings.CutPrefix(fullKey, "gram.noun."); ok {
|
||||
nounName = after
|
||||
}
|
||||
if shouldSkipDeprecatedEnglishGrammarEntry(fullKey) {
|
||||
continue
|
||||
}
|
||||
_, hasOne := v["one"]
|
||||
_, hasOther := v["other"]
|
||||
if hasOne && hasOther {
|
||||
|
|
@ -338,3 +344,13 @@ func loadSignalPriors(grammar *GrammarData, priors map[string]any) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// shouldSkipDeprecatedEnglishGrammarEntry reports whether fullKey names one of
// the deprecated English grammar entries — "passed", "failed", or "skipped"
// under either the gram.noun. or gram.word. prefix — that the loader must
// ignore when flattening locale data.
func shouldSkipDeprecatedEnglishGrammarEntry(fullKey string) bool {
	for _, word := range []string{"passed", "failed", "skipped"} {
		if fullKey == "gram.noun."+word || fullKey == "gram.word."+word {
			return true
		}
	}
	return false
}
|
||||
|
|
|
|||
|
|
@ -149,9 +149,15 @@ func TestFlattenWithGrammar(t *testing.T) {
|
|||
"one": "widget",
|
||||
"other": "widgets",
|
||||
},
|
||||
"passed": map[string]any{
|
||||
"one": "passed",
|
||||
"other": "passed",
|
||||
},
|
||||
},
|
||||
"word": map[string]any{
|
||||
"api": "API",
|
||||
"api": "API",
|
||||
"failed": "failed",
|
||||
"skipped": "skipped",
|
||||
},
|
||||
"punct": map[string]any{
|
||||
"label": ":",
|
||||
|
|
@ -233,11 +239,20 @@ func TestFlattenWithGrammar(t *testing.T) {
|
|||
t.Errorf("widget.other = %q, want 'widgets'", n.Other)
|
||||
}
|
||||
}
|
||||
if _, ok := grammar.Nouns["passed"]; ok {
|
||||
t.Error("deprecated noun 'passed' should be ignored")
|
||||
}
|
||||
|
||||
// Word extracted
|
||||
if grammar.Words["api"] != "API" {
|
||||
t.Errorf("word 'api' = %q, want 'API'", grammar.Words["api"])
|
||||
}
|
||||
if _, ok := grammar.Words["failed"]; ok {
|
||||
t.Error("deprecated word 'failed' should be ignored")
|
||||
}
|
||||
if _, ok := grammar.Words["skipped"]; ok {
|
||||
t.Error("deprecated word 'skipped' should be ignored")
|
||||
}
|
||||
|
||||
// Punct extracted
|
||||
if grammar.Punct.LabelSuffix != ":" {
|
||||
|
|
|
|||
|
|
@ -202,6 +202,9 @@ func (t *Tokeniser) buildNounIndex() {
|
|||
data := i18n.GetGrammarData(t.lang)
|
||||
if data != nil && data.Nouns != nil {
|
||||
for base, forms := range data.Nouns {
|
||||
if skipDeprecatedEnglishGrammarEntry(base) {
|
||||
continue
|
||||
}
|
||||
t.baseNouns[base] = true
|
||||
if forms.Other != "" && forms.Other != base {
|
||||
t.pluralToBase[forms.Other] = base
|
||||
|
|
@ -505,6 +508,9 @@ func (t *Tokeniser) buildWordIndex() {
|
|||
return
|
||||
}
|
||||
for key, display := range data.Words {
|
||||
if skipDeprecatedEnglishGrammarEntry(key) {
|
||||
continue
|
||||
}
|
||||
// Map the key itself (already lowercase)
|
||||
t.words[core.Lower(key)] = key
|
||||
// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
|
||||
|
|
@ -612,6 +618,15 @@ func defaultWeights() map[string]float64 {
|
|||
}
|
||||
}
|
||||
|
||||
func skipDeprecatedEnglishGrammarEntry(key string) bool {
|
||||
switch core.Lower(key) {
|
||||
case "passed", "failed", "skipped":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// MatchWord performs a case-insensitive lookup in the words map.
|
||||
// Returns the category key and true if found, or ("", false) otherwise.
|
||||
func (t *Tokeniser) MatchWord(word string) (string, bool) {
|
||||
|
|
|
|||
|
|
@ -667,6 +667,47 @@ func TestTokeniser_DualClassDetection(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_IgnoresDeprecatedGrammarEntries(t *testing.T) {
|
||||
setup(t)
|
||||
|
||||
const lang = "zz-deprecated"
|
||||
original := i18n.GetGrammarData(lang)
|
||||
t.Cleanup(func() {
|
||||
i18n.SetGrammarData(lang, original)
|
||||
})
|
||||
|
||||
i18n.SetGrammarData(lang, &i18n.GrammarData{
|
||||
Nouns: map[string]i18n.NounForms{
|
||||
"passed": {One: "passed", Other: "passed"},
|
||||
"failed": {One: "failed", Other: "failed"},
|
||||
"skipped": {One: "skipped", Other: "skipped"},
|
||||
"commit": {One: "commit", Other: "commits"},
|
||||
},
|
||||
Words: map[string]string{
|
||||
"passed": "passed",
|
||||
"failed": "failed",
|
||||
"skipped": "skipped",
|
||||
"url": "URL",
|
||||
},
|
||||
})
|
||||
|
||||
tok := NewTokeniserForLang(lang)
|
||||
for _, word := range []string{"passed", "failed", "skipped"} {
|
||||
if tok.IsDualClass(word) {
|
||||
t.Fatalf("%q should not be treated as dual-class", word)
|
||||
}
|
||||
if cat, ok := tok.MatchWord(word); ok {
|
||||
t.Fatalf("MatchWord(%q) = %q, %v; want not found", word, cat, ok)
|
||||
}
|
||||
if _, ok := tok.MatchNoun(word); ok {
|
||||
t.Fatalf("MatchNoun(%q) should be ignored", word)
|
||||
}
|
||||
}
|
||||
if cat, ok := tok.MatchWord("url"); !ok || cat != "url" {
|
||||
t.Fatalf("MatchWord(%q) = %q, %v; want %q, true", "url", cat, ok, "url")
|
||||
}
|
||||
}
|
||||
|
||||
func TestToken_ConfidenceField(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue