fix(reversal): recognise French article phrases
Some checks are pending
Test / test (push) Waiting to run
Security Scan / security (push) Successful in 16s

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Virgil 2026-04-02 03:54:13 +00:00
parent c21afd4263
commit 632681c0df
2 changed files with 49 additions and 0 deletions

View file

@ -683,6 +683,9 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
}
}
if t.isFrenchLanguage() {
if artType, ok := matchFrenchArticleText(lower); ok {
return artType, true
}
if artType, ok := matchFrenchAttachedArticle(lower); ok {
return artType, true
}
@ -701,6 +704,50 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
return "", false
}
// matchFrenchArticleText classifies the leading French article (or elided
// clitic) of an already lower-cased word or phrase.
//
// The phrase is split on whitespace before it is inspected, so tabs and runs
// of spaces between the words are treated exactly like a single space. It
// returns the article type ("definite" or "indefinite") and true on a match.
// When the leading words are not recognised it defers to
// matchFrenchAttachedArticle, and returns "" and false if that fails too.
func matchFrenchArticleText(lower string) (string, bool) {
	fields := strings.Fields(lower)
	if len(fields) == 0 {
		return "", false
	}

	switch fields[0] {
	case "l'", "l", "les", "au", "aux":
		return "definite", true
	case "un", "une", "du", "des", "d'", "d":
		return "indefinite", true
	case "de":
		// "de la", "de l'…", "de le", "de les" and apostrophe-less spellings
		// such as "de lami" are all reported as indefinite. The check is
		// deliberately loose: any second word starting with "l" matches.
		if len(fields) >= 2 && strings.HasPrefix(fields[1], "l") {
			return "indefinite", true
		}
	case "j'", "j", "m'", "m", "t'", "t", "s'", "s", "n'", "n", "c'", "c", "qu'", "qu":
		// NOTE(review): these are elided clitics rather than articles; they
		// are reported as "definite" here — confirm that callers expect this.
		return "definite", true
	}

	// Nothing matched as a stand-alone leading word; try the attached
	// (elided) form on the original, un-split text.
	if artType, ok := matchFrenchAttachedArticle(lower); ok {
		return artType, true
	}
	return "", false
}
func matchFrenchAttachedArticle(lower string) (string, bool) {
for _, prefix := range frenchElisionPrefixes {
if !strings.HasPrefix(lower, prefix) {

View file

@ -304,6 +304,8 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
{"du", "indefinite", true},
{"des", "indefinite", true},
{"l'enfant", "definite", true},
{"de l'enfant", "indefinite", true},
{"de lami", "indefinite", true},
}
for _, tt := range tests {