From 5d81842930c99d0089ff8d7fa23aed25735ea55a Mon Sep 17 00:00:00 2001 From: Virgil Date: Thu, 2 Apr 2026 03:33:09 +0000 Subject: [PATCH] fix(reversal): classify French partitive articles Co-Authored-By: Virgil --- reversal/tokeniser.go | 23 +++++++++++++++++------ reversal/tokeniser_test.go | 24 ++++++++++++++++++------ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 7192b17..16aa366 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -662,9 +662,13 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) { return artType, true } switch lower { - case "l'", "l’", "d'", "d’", "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’", "de l'", "de l’", "de la", "les", "au", "aux", "du": + case "l'", "l’", "les", "au", "aux": return "definite", true - case "un", "une", "des": + case "d'", "d’", "de l'", "de l’", "de la", "du", "des": + return "indefinite", true + case "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’": + return "definite", true + case "un", "une": return "indefinite", true } } @@ -684,7 +688,14 @@ func matchFrenchAttachedArticle(lower string) (string, bool) { if !strings.HasPrefix(rest, "'") && !strings.HasPrefix(rest, "’") { continue } - return "definite", true + switch prefix { + case "d": + return "indefinite", true + case "l": + return "definite", true + default: + return "definite", true + } } return "", false } @@ -928,7 +939,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To Raw: first + " " + prefix, Lower: core.Lower(first + " " + prefix), Type: TokenArticle, - ArtType: "definite", + ArtType: "indefinite", Confidence: 1.0, } extra := t.classifyElidedFrenchWord(rest) @@ -954,7 +965,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To Raw: first + " " + second, Lower: core.Lower(first + " " + second), Type: TokenArticle, - ArtType: "definite", + ArtType: "indefinite", Confidence: 1.0, } extra := t.classifyElidedFrenchWord(third) @@ -979,7 +990,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To Raw: first + " " + second, Lower: "de la", Type: TokenArticle, - ArtType: "definite", + ArtType: "indefinite", Confidence: 1.0, } if secondPunct != "" { diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 0a0a322..16fa1f9 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -220,9 +220,9 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) { {"la", "definite", true}, {"Le", "definite", true}, {"La", "definite", true}, - {"de la", "definite", true}, - {"de l'", "definite", true}, - {"de l’", "definite", true}, + {"de la", "indefinite", true}, + {"de l'", "indefinite", true}, + {"de l’", "indefinite", true}, {"un", "indefinite", true}, {"une", "indefinite", true}, } @@ -301,7 +301,7 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) { {"les", "definite", true}, {"au", "definite", true}, {"aux", "definite", true}, - {"du", "definite", true}, + {"du", "indefinite", true}, {"des", "indefinite", true}, {"l'enfant", "definite", true}, } @@ -347,6 +347,9 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { if tokens[0].Type != TokenArticle { t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) } + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") + } if tokens[0].Lower != "de l'" { t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'") } @@ -364,6 +367,9 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { if tokens[0].Type != TokenArticle { t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) } + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") + } if tokens[0].Lower != "de l'" { t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'") } @@ -398,6 +404,9 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { if tokens[0].Type != TokenArticle { t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) } + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") + } if tokens[1].Type != TokenNoun { t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) } @@ -409,6 +418,9 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { if tokens[0].Type != TokenArticle { t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) } + if tokens[0].ArtType != "definite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite") + } if tokens[1].Type != TokenNoun { t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) } @@ -442,8 +454,8 @@ func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) { if tokens[0].Lower != "de la" { t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de la") } - if tokens[0].ArtType != "definite" { - t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite") + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") } if tokens[1].Type != TokenNoun { t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) -- 2.45.3