From e71280189b1c059a149d749d032b06764d1e1e67 Mon Sep 17 00:00:00 2001 From: Virgil Date: Wed, 1 Apr 2026 09:41:58 +0000 Subject: [PATCH] feat(reversal): support french typographic elision Co-Authored-By: Virgil --- reversal/tokeniser.go | 20 ++++++++++++++++---- reversal/tokeniser_test.go | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 0f4e39f..58de779 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -17,6 +17,7 @@ package reversal import ( "strings" + "unicode/utf8" "dappco.re/go/core" i18n "dappco.re/go/core/i18n" @@ -605,7 +606,7 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) { } if t.isFrenchLanguage() { switch lower { - case "l'", "les": + case "l'", "l’", "les": return "definite", true case "un", "une", "des": return "indefinite", true @@ -1112,13 +1113,24 @@ func splitTrailingPunct(s string) (string, string) { } func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) { - if !t.isFrenchLanguage() || len(raw) <= 2 { + if !t.isFrenchLanguage() || len(raw) == 0 { return "", raw, false } lower := core.Lower(raw) - if len(lower) > 2 && lower[0] == 'l' && lower[1] == '\'' { - return raw[:2], raw[2:], true + if len(lower) < 2 { + return "", raw, false + } + + if lower[0] != 'l' { + return "", raw, false + } + + if idx := strings.IndexAny(raw, "'’"); idx == 1 { + _, size := utf8.DecodeRuneInString(raw[idx:]) + if size > 0 { + return raw[:idx+size], raw[idx+size:], true + } } return "", raw, false diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 09cfb99..da1587d 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -292,7 +292,9 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) { wantOK bool }{ {"l'", "definite", true}, + {"l’", "definite", true}, {"L'", "definite", true}, + {"L’", "definite", true}, {"les", "definite", true}, {"des", "indefinite", true}, {"l'enfant", "", false}, @@ -331,6 +333,20 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { if tokens[1].Lower != "enfant" { t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant") } + + tokens = tok.Tokenise("l’enfant") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l’enfant", len(tokens)) + } + if tokens[0].Type != TokenArticle { + t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) + } + if tokens[1].Type != TokenNoun { + t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) + } + if tokens[1].Lower != "enfant" { + t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant") + } } func TestTokeniser_Tokenise(t *testing.T) { -- 2.45.3