From 379cab296c2d6d1f67f785f48b04d20bc7435c9f Mon Sep 17 00:00:00 2001 From: Virgil Date: Thu, 2 Apr 2026 06:56:21 +0000 Subject: [PATCH] fix(reversal): honor base-language fallback in tokeniser Support underscore-separated locale tags like fr_CA by falling back to the base language when building reverse indexes and matching French articles. Co-Authored-By: Virgil --- reversal/tokeniser.go | 32 +++++++++++++++++++++++++------- reversal/tokeniser_test.go | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index ccbda25..0e4738b 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -165,7 +165,7 @@ func NewTokeniserForLang(lang string, opts ...TokeniserOption) *Tokeniser { // inverse lookup maps: inflected form → base form. func (t *Tokeniser) buildVerbIndex() { // Tier 1: Read from JSON grammar data (via GetGrammarData). - data := i18n.GetGrammarData(t.lang) + data := t.grammarData() if data != nil && data.Verbs != nil { for base, forms := range data.Verbs { t.baseVerbs[base] = true @@ -212,7 +212,7 @@ func (t *Tokeniser) buildVerbIndex() { // inverse lookup maps: plural form → base form. func (t *Tokeniser) buildNounIndex() { // Tier 1: Read from JSON grammar data (via GetGrammarData). - data := i18n.GetGrammarData(t.lang) + data := t.grammarData() if data != nil && data.Nouns != nil { for base, forms := range data.Nouns { if skipDeprecatedEnglishGrammarEntry(base) { @@ -528,7 +528,7 @@ func (t *Tokeniser) reverseRegularGerund(word string) []string { // Both the key (e.g., "url") and the display form (e.g., "URL") map back // to the key, enabling case-insensitive lookups. 
func (t *Tokeniser) buildWordIndex() { - data := i18n.GetGrammarData(t.lang) + data := t.grammarData() if data == nil || data.Words == nil { return } @@ -567,7 +567,7 @@ func (t *Tokeniser) buildSignalIndex() { t.verbInf = make(map[string]bool) t.verbNeg = make(map[string]bool) - data := i18n.GetGrammarData(t.lang) + data := t.grammarData() // Guard each signal list independently so partial locale data // falls back per-field rather than silently disabling signals. @@ -664,7 +664,7 @@ func (t *Tokeniser) MatchWord(word string) (string, bool) { // Returns the article type ("indefinite" or "definite") and true if matched, // or ("", false) otherwise. func (t *Tokeniser) MatchArticle(word string) (string, bool) { - data := i18n.GetGrammarData(t.lang) + data := t.grammarData() if data == nil { return "", false } @@ -1614,7 +1614,7 @@ func (t *Tokeniser) splitConfiguredElision(raw string) (string, string, bool) { return "", raw, false } - data := i18n.GetGrammarData(t.lang) + data := t.grammarData() if data == nil { return "", raw, false } @@ -1647,10 +1647,28 @@ func (t *Tokeniser) splitConfiguredElision(raw string) (string, string, bool) { } func (t *Tokeniser) isFrenchLanguage() bool { - lang := core.Lower(t.lang) + lang := tokeniserLanguageBase(t.lang) return lang == "fr" || core.HasPrefix(lang, "fr-") } +func (t *Tokeniser) grammarData() *i18n.GrammarData { + if data := i18n.GetGrammarData(t.lang); data != nil { + return data + } + if base := tokeniserLanguageBase(t.lang); base != "" { + return i18n.GetGrammarData(base) + } + return nil +} + +func tokeniserLanguageBase(lang string) string { + lang = core.Lower(core.Trim(lang)) + if idx := strings.IndexAny(lang, "-_"); idx > 0 { + return lang[:idx] + } + return lang +} + func normalizeFrenchApostrophes(s string) string { if s == "" || (!strings.ContainsRune(s, '’') && !strings.ContainsRune(s, 'ʼ')) { return s diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 9c36611..3fbee23 100644 --- 
a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -337,6 +337,38 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) { } } +func TestTokeniser_MatchArticle_FrenchUnderscoreTagFallback(t *testing.T) { + setup(t) + tok := NewTokeniserForLang("fr_CA") + + tests := []struct { + word string + wantType string + wantOK bool + }{ + {"le", "definite", true}, + {"l'ami", "definite", true}, + {"de l'ami", "indefinite", true}, + } + + for _, tt := range tests { + t.Run(tt.word, func(t *testing.T) { + artType, ok := tok.MatchArticle(tt.word) + if ok != tt.wantOK { + t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK) + } + if ok && artType != tt.wantType { + t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType) + } + }) + } + + tokens := tok.Tokenise("l'ami") + if len(tokens) == 0 || tokens[0].Type != TokenArticle { + t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "l'ami", tokens) + } +} + func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) { setup(t) -- 2.45.3