diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 67c9509..67761af 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -707,16 +707,18 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b } lower = normalizeFrenchApostrophes(lower) - if lower == core.Lower(data.Articles.IndefiniteDefault) || - lower == core.Lower(data.Articles.IndefiniteVowel) { - return "indefinite", true + if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.IndefiniteDefault, "indefinite"); ok { + return artType, true } - if lower == core.Lower(data.Articles.Definite) { - return "definite", true + if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.IndefiniteVowel, "indefinite"); ok { + return artType, true + } + if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.Definite, "definite"); ok { + return artType, true } for _, article := range data.Articles.ByGender { - if lower == core.Lower(article) { - return "definite", true + if artType, ok := matchConfiguredArticleCandidate(lower, article, "definite"); ok { + return artType, true } } @@ -725,16 +727,18 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b if prefix == "" { return "", false } - if prefix == core.Lower(data.Articles.IndefiniteDefault) || - prefix == core.Lower(data.Articles.IndefiniteVowel) { - return "indefinite", true + if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.IndefiniteDefault, "indefinite"); ok { + return artType, true } - if prefix == core.Lower(data.Articles.Definite) { - return "definite", true + if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.IndefiniteVowel, "indefinite"); ok { + return artType, true + } + if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.Definite, "definite"); ok { + return artType, true } for _, article := range data.Articles.ByGender { - if prefix == core.Lower(article) { - return "definite", true + if artType, ok := matchConfiguredArticleCandidate(prefix, article, "definite"); ok { + return artType, true } } } @@ -742,6 +746,34 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b return "", false } +func matchConfiguredArticleCandidate(lower, article, kind string) (string, bool) { + article = normalizeFrenchApostrophes(core.Lower(article)) + if article == "" { + return "", false + } + if lower == article { + return kind, true + } + + if !strings.HasPrefix(lower, article) { + return "", false + } + rest := strings.TrimPrefix(lower, article) + if rest == "" { + return "", false + } + if strings.HasSuffix(article, "'") { + return kind, true + } + r, _ := utf8.DecodeRuneInString(rest) + switch r { + case ' ', '\t', '\'', '’', 'ʼ': + return kind, true + default: + return "", false + } +} + func matchFrenchLeadingArticlePhrase(lower string) (string, bool) { lower = normalizeFrenchApostrophes(lower) switch { @@ -911,6 +943,21 @@ func (t *Tokeniser) Tokenise(text string) []Token { continue } } + if prefix, rest, ok := t.splitConfiguredElision(raw); ok { + if artType, ok := t.MatchArticle(prefix); ok { + tokens = append(tokens, Token{ + Raw: prefix, + Lower: normalizeFrenchApostrophes(core.Lower(prefix)), + Type: TokenArticle, + ArtType: artType, + Confidence: 1.0, + }) + } + raw = rest + if raw == "" { + continue + } + } // Strip trailing punctuation to get the clean word. word, punct := splitTrailingPunct(raw) @@ -1561,6 +1608,43 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) { return "", raw, false } +func (t *Tokeniser) splitConfiguredElision(raw string) (string, string, bool) { + if len(raw) == 0 { + return "", raw, false + } + + data := i18n.GetGrammarData(t.lang) + if data == nil { + return "", raw, false + } + + candidates := []string{data.Articles.IndefiniteDefault, data.Articles.IndefiniteVowel, data.Articles.Definite} + for _, article := range data.Articles.ByGender { + candidates = append(candidates, article) + } + + lower := normalizeFrenchApostrophes(core.Lower(raw)) + for _, article := range candidates { + article = normalizeFrenchApostrophes(core.Lower(article)) + if article == "" || !strings.Contains(article, "'") { + continue + } + if !strings.HasPrefix(lower, article) { + continue + } + if len(raw) <= len(article) { + continue + } + rest := raw[len(article):] + if rest == "" { + continue + } + return raw[:len(article)], rest, true + } + + return "", raw, false +} + func (t *Tokeniser) isFrenchLanguage() bool { lang := core.Lower(t.lang) return lang == "fr" || core.HasPrefix(lang, "fr-") diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index ee88430..ef7b6a8 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -380,6 +380,66 @@ func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) { } } +func TestTokeniser_MatchArticle_ConfiguredElisionPrefix(t *testing.T) { + setup(t) + + const lang = "xy" + prev := i18n.GetGrammarData(lang) + t.Cleanup(func() { + i18n.SetGrammarData(lang, prev) + }) + + i18n.SetGrammarData(lang, &i18n.GrammarData{ + Articles: i18n.ArticleForms{ + IndefiniteDefault: "a", + IndefiniteVowel: "an", + Definite: "l'", + ByGender: map[string]string{ + "m": "le", + "f": "la", + }, + }, + Nouns: map[string]i18n.NounForms{ + "ami": {One: "ami", Other: "amis", Gender: "m"}, + }, + }) + + tok := NewTokeniserForLang(lang) + + tests := []struct { + word string + wantType string + wantOK bool + }{ + {"l'ami", "definite", true}, + {"l’ami", "definite", true}, + {"lʼami", "definite", true}, + } + + for _, tt := range tests { + t.Run(tt.word, func(t *testing.T) { + artType, ok := tok.MatchArticle(tt.word) + if ok != tt.wantOK { + t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK) + } + if ok && artType != tt.wantType { + t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType) + } + }) + } + + tokens := tok.Tokenise("l'ami") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l'ami", len(tokens)) + } + if tokens[0].Type != TokenArticle || tokens[0].ArtType != "definite" { + t.Fatalf("Tokenise(%q)[0] = %#v, want definite article", "l'ami", tokens[0]) + } + if tokens[1].Type != TokenNoun || tokens[1].Lower != "ami" { + t.Fatalf("Tokenise(%q)[1] = %#v, want noun ami", "l'ami", tokens[1]) + } +} + func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { setup(t) tok := NewTokeniserForLang("fr")