From 1906208025389852f9fd13bc52d2647bc34bd49e Mon Sep 17 00:00:00 2001
From: Virgil
Date: Thu, 2 Apr 2026 01:34:45 +0000
Subject: [PATCH] feat(reversal): handle spaced French elision articles

Co-Authored-By: Virgil

---
 reversal/tokeniser.go      | 27 +++++++++++++++++++++++++++
 reversal/tokeniser_test.go | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index 13e2729..b00cfbf 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -899,6 +899,33 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
 			}
 			return 2, tok, &extra, punctTok
 		}
+		// Handle spaced elision forms such as "de l' enfant" or "de l’ enfant".
+		if (second == "l'" || second == "l’") && start+2 < len(parts) {
+			third, thirdPunct := splitTrailingPunct(parts[start+2])
+			if third != "" {
+				tok := Token{
+					Raw:        first + " " + second,
+					Lower:      core.Lower(first + " " + second),
+					Type:       TokenArticle,
+					ArtType:    "definite",
+					Confidence: 1.0,
+				}
+				extra := t.classifyElidedFrenchWord(third)
+				var punctTok *Token
+				if thirdPunct != "" {
+					if punctType, ok := matchPunctuation(thirdPunct); ok {
+						punctTok = &Token{
+							Raw:        thirdPunct,
+							Lower:      thirdPunct,
+							Type:       TokenPunctuation,
+							PunctType:  punctType,
+							Confidence: 1.0,
+						}
+					}
+				}
+				return 3, tok, &extra, punctTok
+			}
+		}
 		return 0, Token{}, nil, nil
 	}
 	tok := Token{
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 39afe22..5fb3a05 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -356,6 +356,40 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
 		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
 	}
 
+	tokens = tok.Tokenise("de l' enfant")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l' enfant", len(tokens))
+	}
+	if tokens[0].Type != TokenArticle {
+		t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
+	}
+	if tokens[0].Lower != "de l'" {
+		t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
+	}
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
+	}
+	if tokens[1].Lower != "enfant" {
+		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
+	}
+
+	tokens = tok.Tokenise("de l’ enfant")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l’ enfant", len(tokens))
+	}
+	if tokens[0].Type != TokenArticle {
+		t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
+	}
+	if tokens[0].Lower != "de l’" {
+		t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l’")
+	}
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
+	}
+	if tokens[1].Lower != "enfant" {
+		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
+	}
+
 	tokens = tok.Tokenise("d'enfant")
 	if len(tokens) != 2 {
 		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "d'enfant", len(tokens))
-- 
2.45.3