diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index a8f43e6..67c9509 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -900,7 +900,7 @@ func (t *Tokeniser) Tokenise(text string) []Token { if artType, ok := t.MatchArticle(prefix); ok { tokens = append(tokens, Token{ Raw: prefix, - Lower: core.Lower(prefix), + Lower: normalizeFrenchApostrophes(core.Lower(prefix)), Type: TokenArticle, ArtType: artType, Confidence: 1.0, @@ -1085,7 +1085,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To case "la", "le", "les", "du", "des": tok := Token{ Raw: first + " " + second, - Lower: core.Lower(first + " " + second), + Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)), Type: TokenArticle, ArtType: "indefinite", Confidence: 1.0, @@ -1104,10 +1104,10 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To } return 2, tok, nil, nil default: - if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l’") && rest != "" { + if prefix, rest, ok := t.splitFrenchElision(second); ok && normalizeFrenchApostrophes(prefix) == "l'" && rest != "" { tok := Token{ Raw: first + " " + prefix, - Lower: core.Lower(first + " " + prefix), + Lower: normalizeFrenchApostrophes(core.Lower(first + " " + prefix)), Type: TokenArticle, ArtType: "indefinite", Confidence: 1.0, @@ -1128,12 +1128,12 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To return 2, tok, &extra, punctTok } // Handle spaced elision forms such as "de l' enfant" or "de l’ enfant". - if (second == "l'" || second == "l’") && start+2 < len(parts) { + if normalizeFrenchApostrophes(second) == "l'" && start+2 < len(parts) { third, thirdPunct := splitTrailingPunct(parts[start+2]) if third != "" { tok := Token{ Raw: first + " " + second, - Lower: core.Lower(first + " " + second), + Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)), Type: TokenArticle, ArtType: "indefinite", Confidence: 1.0, @@ -1549,7 +1549,7 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) { } if idx < len(raw) { r, size := utf8.DecodeRuneInString(raw[idx:]) - if r != '\'' && r != '’' { + if !isFrenchApostrophe(r) { continue } if size > 0 { @@ -1567,10 +1567,20 @@ func (t *Tokeniser) isFrenchLanguage() bool { } func normalizeFrenchApostrophes(s string) string { - if s == "" || !strings.ContainsRune(s, '’') { + if s == "" || (!strings.ContainsRune(s, '’') && !strings.ContainsRune(s, 'ʼ')) { return s } - return strings.ReplaceAll(s, "’", "'") + s = strings.ReplaceAll(s, "’", "'") + return strings.ReplaceAll(s, "ʼ", "'") +} + +func isFrenchApostrophe(r rune) bool { + switch r { + case '\'', '’', 'ʼ': + return true + default: + return false + } } // matchPunctuation detects known punctuation patterns. diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index ce29822..ee88430 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -309,8 +309,10 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) { }{ {"l'", "definite", true}, {"l’", "definite", true}, + {"lʼ", "definite", true}, {"L'", "definite", true}, {"L’", "definite", true}, + {"Lʼ", "definite", true}, {"les", "definite", true}, {"au", "definite", true}, {"aux", "definite", true}, @@ -509,8 +511,25 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { if tokens[0].Type != TokenArticle { t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) } - if tokens[0].Lower != "de l’" { - t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l’") + if tokens[0].Lower != "de l'" { + t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'") + } + if tokens[1].Type != TokenNoun { + t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) + } + if tokens[1].Lower != "enfant" { + t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant") + } + + tokens = tok.Tokenise("de lʼenfant") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de lʼenfant", len(tokens)) + } + if tokens[0].Type != TokenArticle { + t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) + } + if tokens[0].Lower != "de l'" { + t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'") } if tokens[1].Type != TokenNoun { t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)