diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 9de4ecf..a8f43e6 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -671,7 +671,7 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) { if base, _ := splitTrailingPunct(word); base != "" { word = base } - lower := core.Lower(word) + lower := normalizeFrenchApostrophes(core.Lower(word)) if artType, ok := matchConfiguredArticleText(lower, data); ok { return artType, true @@ -705,6 +705,7 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b if data == nil { return "", false } + lower = normalizeFrenchApostrophes(lower) if lower == core.Lower(data.Articles.IndefiniteDefault) || lower == core.Lower(data.Articles.IndefiniteVowel) { @@ -742,6 +743,7 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b } func matchFrenchLeadingArticlePhrase(lower string) (string, bool) { + lower = normalizeFrenchApostrophes(lower) switch { case lower == "le", lower == "la", lower == "les", lower == "l'", lower == "l’", lower == "au", lower == "aux": @@ -777,8 +779,9 @@ func matchFrenchLeadingArticlePhrase(lower string) (string, bool) { } func matchFrenchArticleText(lower string) (string, bool) { + lower = normalizeFrenchApostrophes(lower) switch { - case strings.HasPrefix(lower, "de l'"), strings.HasPrefix(lower, "de l’"): + case strings.HasPrefix(lower, "de l'"): return "indefinite", true case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "de les "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "): return "indefinite", true @@ -821,6 +824,7 @@ func matchFrenchArticleText(lower string) (string, bool) { } func matchFrenchAttachedArticle(lower string) (string, bool) { + lower = normalizeFrenchApostrophes(lower) for _, prefix := range frenchElisionPrefixes { if !strings.HasPrefix(lower, prefix) { continue @@ -1530,7 +1534,7 @@ func (t *Tokeniser) splitFrenchElision(raw 
// normalizeFrenchApostrophes returns s with every typographic apostrophe
// (U+2019, RIGHT SINGLE QUOTATION MARK) replaced by the ASCII apostrophe
// (U+0027), so that elided forms such as "l’" and "l'" compare equal.
//
// Input that contains no U+2019, including the empty string, is returned
// unchanged. No explicit pre-check is needed for that: strings.ReplaceAll
// already hands back its argument as-is when there is nothing to replace,
// so guarding the call only scans matching strings a second time.
func normalizeFrenchApostrophes(s string) string {
	return strings.ReplaceAll(s, "\u2019", "'")
}
%q, want %q", tokens[2].PunctType, "sentence_end") + } + tokens = tok.Tokenise("de le serveur") if len(tokens) != 2 { t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de le serveur", len(tokens))