fix(reversal): recognise uncontracted French partitive articles
Some checks failed
Security Scan / security (push) Successful in 14s
Test / test (push) Has been cancelled

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Virgil 2026-04-02 04:45:45 +00:00
parent d5060197f5
commit 297fceff2e
2 changed files with 88 additions and 24 deletions

View file

@@ -780,7 +780,7 @@ func matchFrenchArticleText(lower string) (string, bool) {
switch {
case strings.HasPrefix(lower, "de l'"), strings.HasPrefix(lower, "de l"):
return "indefinite", true
case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "):
case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "de les "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "):
return "indefinite", true
case strings.HasPrefix(lower, "au "), strings.HasPrefix(lower, "aux "):
return "definite", true
@@ -801,9 +801,9 @@ func matchFrenchArticleText(lower string) (string, bool) {
case "de":
if len(fields) >= 2 {
switch fields[1] {
case "la", "l'", "l":
case "la", "le", "les", "l'", "l":
return "indefinite", true
case "le", "les":
case "du", "des":
return "definite", true
}
}
@@ -1077,7 +1077,29 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
switch core.Lower(first) {
case "de":
if core.Lower(second) != "la" {
switch core.Lower(second) {
case "la", "le", "les", "du", "des":
tok := Token{
Raw: first + " " + second,
Lower: core.Lower(first + " " + second),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
}
if secondPunct != "" {
if punctType, ok := matchPunctuation(secondPunct); ok {
punctTok := Token{
Raw: secondPunct,
Lower: secondPunct,
Type: TokenPunctuation,
PunctType: punctType,
Confidence: 1.0,
}
return 2, tok, nil, &punctTok
}
}
return 2, tok, nil, nil
default:
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l") && rest != "" {
tok := Token{
Raw: first + " " + prefix,
@@ -1130,26 +1152,6 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
}
return 0, Token{}, nil, nil
}
tok := Token{
Raw: first + " " + second,
Lower: "de la",
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
}
if secondPunct != "" {
if punctType, ok := matchPunctuation(secondPunct); ok {
punctTok := Token{
Raw: secondPunct,
Lower: secondPunct,
Type: TokenPunctuation,
PunctType: punctType,
Confidence: 1.0,
}
return 2, tok, nil, &punctTok
}
}
return 2, tok, nil, nil
}
return 0, Token{}, nil, nil

View file

@@ -228,6 +228,8 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
{"Un enfant", "indefinite", true},
{"Une amie", "indefinite", true},
{"de la", "indefinite", true},
{"de le", "indefinite", true},
{"de les", "indefinite", true},
{"de l'", "indefinite", true},
{"de l", "indefinite", true},
{"du serveur", "indefinite", true},
@@ -436,6 +438,46 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
}
tokens = tok.Tokenise("de le serveur")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de le serveur", len(tokens))
}
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].ArtType != "indefinite" {
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
}
if tokens[0].Lower != "de le" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de le")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
}
if tokens[1].Lower != "serveur" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "serveur")
}
tokens = tok.Tokenise("de les amis")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de les amis", len(tokens))
}
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].ArtType != "indefinite" {
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
}
if tokens[0].Lower != "de les" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de les")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
}
if tokens[1].Lower != "amis" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "amis")
}
tokens = tok.Tokenise("de l enfant")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l enfant", len(tokens))
@@ -519,6 +561,26 @@ func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
if tokens[1].Lower != "branche" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "branche")
}
tokens = tok.Tokenise("de les amis")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de les amis", len(tokens))
}
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].Lower != "de les" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de les")
}
if tokens[0].ArtType != "indefinite" {
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
}
if tokens[1].Lower != "amis" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "amis")
}
}
func TestTokeniser_Tokenise(t *testing.T) {