From 9de96a76d03ab3977da6bbf06a5e2aee93e647ab Mon Sep 17 00:00:00 2001
From: Virgil
Date: Thu, 2 Apr 2026 04:05:27 +0000
Subject: [PATCH] feat(reversal): match configured article phrases

Article forms left unset (empty) are never treated as a match, and the
exact and prefix checks share one comparison helper.

Co-Authored-By: Virgil
---
 reversal/tokeniser.go      | 57 ++++++++++++++++++++++++++++++--------
 reversal/tokeniser_test.go | 47 +++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index ba74d9b..a615219 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -670,17 +670,8 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 
 	lower := core.Lower(word)
 
-	if lower == core.Lower(data.Articles.IndefiniteDefault) ||
-		lower == core.Lower(data.Articles.IndefiniteVowel) {
-		return "indefinite", true
-	}
-	if lower == core.Lower(data.Articles.Definite) {
-		return "definite", true
-	}
-	for _, article := range data.Articles.ByGender {
-		if lower == core.Lower(article) {
-			return "definite", true
-		}
+	if artType, ok := matchConfiguredArticleText(lower, data); ok {
+		return artType, true
 	}
 	if t.isFrenchLanguage() {
 		if artType, ok := matchFrenchLeadingArticlePhrase(lower); ok {
@@ -707,6 +698,50 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 	return "", false
 }
 
+// matchConfiguredArticleText classifies lower against the article forms
+// configured in data, returning "indefinite" or "definite" on a match.
+//
+// lower is the already lower-cased text. The whole text is tried first;
+// failing that, if it contains a space or tab and does not start with one,
+// only the text before the first such separator is tried. Article forms
+// left empty in data never match.
+func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, bool) {
+	if data == nil {
+		return "", false
+	}
+
+	// classify compares one candidate against every configured form,
+	// checking the indefinite forms before the definite ones.
+	classify := func(candidate string) (string, bool) {
+		if candidate == "" {
+			// Empty text must not match an article form that was left unset.
+			return "", false
+		}
+		switch candidate {
+		case core.Lower(data.Articles.IndefiniteDefault),
+			core.Lower(data.Articles.IndefiniteVowel):
+			return "indefinite", true
+		case core.Lower(data.Articles.Definite):
+			return "definite", true
+		}
+		for _, article := range data.Articles.ByGender {
+			if candidate == core.Lower(article) {
+				return "definite", true
+			}
+		}
+		return "", false
+	}
+
+	if artType, ok := classify(lower); ok {
+		return artType, true
+	}
+	// idx > 0: text that starts with a separator has no prefix to try.
+	if idx := strings.IndexAny(lower, " \t"); idx > 0 {
+		return classify(core.Trim(lower[:idx]))
+	}
+	return "", false
+}
+
 func matchFrenchLeadingArticlePhrase(lower string) (string, bool) {
 	switch {
 	case lower == "le", lower == "la", lower == "les",
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 0660fd8..bc6e142 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -330,6 +330,53 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
 	}
 }
 
+// TestTokeniser_MatchArticle_ConfiguredPhrasePrefix checks that MatchArticle
+// recognises a configured article on its own and at the start of a phrase.
+func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) {
+	setup(t)
+
+	const lang = "xx"
+	prev := i18n.GetGrammarData(lang)
+	t.Cleanup(func() {
+		i18n.SetGrammarData(lang, prev)
+	})
+
+	i18n.SetGrammarData(lang, &i18n.GrammarData{
+		Articles: i18n.ArticleForms{
+			IndefiniteDefault: "a",
+			IndefiniteVowel:   "an",
+			Definite:          "the",
+		},
+	})
+
+	tok := NewTokeniserForLang(lang)
+
+	tests := []struct {
+		word     string
+		wantType string
+		wantOK   bool
+	}{
+		{"the", "definite", true},
+		{"the file", "definite", true},
+		{"the\tfile", "definite", true},
+		{"a file", "indefinite", true},
+		{"an error", "indefinite", true},
+		{"file", "", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) {
+			artType, ok := tok.MatchArticle(tt.word)
+			if ok != tt.wantOK {
+				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if ok && artType != tt.wantType {
+				t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
+			}
+		})
+	}
+}
+
 func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
 	setup(t)
 	tok := NewTokeniserForLang("fr")
-- 
2.45.3