From a499643f33837daf5a9e4dde866d1f08a2e97c89 Mon Sep 17 00:00:00 2001
From: Virgil
Date: Wed, 1 Apr 2026 05:57:25 +0000
Subject: [PATCH] feat(reversal): recognise French article elision

Support French article reversal for l', les, and des, and split elided l'
prefixes during tokenisation. The elided article is accepted with either
the ASCII apostrophe or the typographic apostrophe (U+2019).

Co-Authored-By: Virgil
---
 reversal/tokeniser.go      | 64 ++++++++++++++++++++++++++++++++++++++
 reversal/tokeniser_test.go | 59 +++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index 2cdf151..c25dfdb 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -598,6 +598,15 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 			return "definite", true
 		}
 	}
+	if t.isFrenchLanguage() {
+		// "l’" is "l'" spelled with the typographic apostrophe (U+2019).
+		switch lower {
+		case "l'", "l’", "les":
+			return "definite", true
+		case "des":
+			return "indefinite", true
+		}
+	}
 	return "", false
 }
 
@@ -629,6 +638,24 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 
 	// --- Pass 1: Classify & Mark ---
 	for _, raw := range parts {
+		// Peel an elided French article off the front ("l'enfant" becomes
+		// "l'" + "enfant") so the rest of the word is classified on its own.
+		if prefix, rest, ok := t.splitFrenchElision(raw); ok {
+			if artType, ok := t.MatchArticle(prefix); ok {
+				tokens = append(tokens, Token{
+					Raw:        prefix,
+					Lower:      core.Lower(prefix),
+					Type:       TokenArticle,
+					ArtType:    artType,
+					Confidence: 1.0,
+				})
+			}
+			raw = rest
+			if raw == "" {
+				continue
+			}
+		}
+
 		// Strip trailing punctuation to get the clean word.
 		word, punct := splitTrailingPunct(raw)
 
@@ -962,6 +989,43 @@ func splitTrailingPunct(s string) (string, string) {
 	return s, ""
 }
 
+// splitFrenchElision splits a leading elided French article off raw.
+//
+// It recognises "l'" in any letter case, written with either the ASCII
+// apostrophe (U+0027) or the typographic apostrophe (U+2019), and only when
+// something follows the article. On a match it returns the article exactly
+// as written in raw, the remainder of raw, and true. Otherwise, which
+// includes any non-French tokeniser, it returns "", raw, false.
+func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
+	if !t.isFrenchLanguage() {
+		return "", raw, false
+	}
+
+	// Match on the lowered text but slice raw. This assumes core.Lower maps
+	// "L" to "l" and leaves both apostrophes alone, so the offsets agree.
+	lower := core.Lower(raw)
+	n := 0
+	switch {
+	case core.HasPrefix(lower, "l'"):
+		n = len("l'")
+	case core.HasPrefix(lower, "l’"):
+		n = len("l’")
+	}
+	// A bare article with nothing after it is not split.
+	if n == 0 || len(raw) <= n {
+		return "", raw, false
+	}
+
+	return raw[:n], raw[n:], true
+}
+
+// isFrenchLanguage reports whether the tokeniser language is French: t.lang,
+// lower-cased, is exactly "fr" or starts with "fr-" (for example "fr-CA").
+func (t *Tokeniser) isFrenchLanguage() bool {
+	lang := core.Lower(t.lang)
+	return lang == "fr" || core.HasPrefix(lang, "fr-")
+}
+
 // matchPunctuation detects known punctuation patterns.
 // Returns the punctuation type and true if recognised.
 func matchPunctuation(punct string) (string, bool) {
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 26f3bca..7b642ad 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -241,6 +241,65 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
 	}
 }
 
+func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
+	setup(t)
+	tok := NewTokeniserForLang("fr")
+
+	tests := []struct {
+		word     string
+		wantType string
+		wantOK   bool
+	}{
+		{"l'", "definite", true},
+		{"L'", "definite", true},
+		{"l’", "definite", true},
+		{"L’", "definite", true},
+		{"les", "definite", true},
+		{"des", "indefinite", true},
+		{"l'enfant", "", false},
+		{"l’enfant", "", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) {
+			artType, ok := tok.MatchArticle(tt.word)
+			if ok != tt.wantOK {
+				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if ok && artType != tt.wantType {
+				t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
+			}
+		})
+	}
+}
+
+func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
+	setup(t)
+	tok := NewTokeniserForLang("fr")
+
+	// The same word with the ASCII and the typographic apostrophe.
+	for _, text := range []string{"l'enfant", "l’enfant"} {
+		t.Run(text, func(t *testing.T) {
+			tokens := tok.Tokenise(text)
+			if len(tokens) != 2 {
+				t.Fatalf("Tokenise(%q) returned %d tokens, want 2", text, len(tokens))
+			}
+			if tokens[0].Type != TokenArticle {
+				t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
+			}
+			if tokens[0].ArtType != "definite" {
+				t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite")
+			}
+			if tokens[1].Type != TokenNoun {
+				t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
+			}
+			if tokens[1].Lower != "enfant" {
+				t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
+			}
+		})
+	}
+}
+
 func TestTokeniser_Tokenise(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()
-- 
2.45.3