feat(reversal): support French typographic elision
All checks were successful
Security Scan / security (push) Successful in 10s
Test / test (push) Successful in 1m0s

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Virgil 2026-04-01 09:41:58 +00:00
parent 18cc20ff7b
commit e71280189b
2 changed files with 32 additions and 4 deletions

View file

@ -17,6 +17,7 @@ package reversal
import (
"strings"
"unicode/utf8"
"dappco.re/go/core"
i18n "dappco.re/go/core/i18n"
@ -605,7 +606,7 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
}
if t.isFrenchLanguage() {
switch lower {
case "l'", "les":
case "l'", "l’", "les":
return "definite", true
case "un", "une", "des":
return "indefinite", true
@ -1112,13 +1113,24 @@ func splitTrailingPunct(s string) (string, string) {
}
func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
if !t.isFrenchLanguage() || len(raw) <= 2 {
if !t.isFrenchLanguage() || len(raw) == 0 {
return "", raw, false
}
lower := core.Lower(raw)
if len(lower) > 2 && lower[0] == 'l' && lower[1] == '\'' {
return raw[:2], raw[2:], true
if len(lower) < 2 {
return "", raw, false
}
if lower[0] != 'l' {
return "", raw, false
}
if idx := strings.IndexAny(raw, "'’"); idx == 1 {
_, size := utf8.DecodeRuneInString(raw[idx:])
if size > 0 {
return raw[:idx+size], raw[idx+size:], true
}
}
return "", raw, false

View file

@ -292,7 +292,9 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
wantOK bool
}{
{"l'", "definite", true},
{"l’", "definite", true},
{"L'", "definite", true},
{"L’", "definite", true},
{"les", "definite", true},
{"des", "indefinite", true},
{"l'enfant", "", false},
@ -331,6 +333,20 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
if tokens[1].Lower != "enfant" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
}
tokens = tok.Tokenise("l’enfant")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l’enfant", len(tokens))
}
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
}
if tokens[1].Lower != "enfant" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
}
}
func TestTokeniser_Tokenise(t *testing.T) {