// matchFrenchLeadingArticlePhrase reports whether lower is, or begins with, a
// French article and classifies it as "definite" or "indefinite".
//
// Two shapes are recognised:
//   - a whole-word article (le, la, les, au, aux, un, une, du, des), either
//     standing alone or followed by an ASCII space and further text;
//   - an elided article (l', d') written with a straight or a typographic
//     apostrophe, with or without text after it.
//
// Comparison is byte-exact against lowercase literals, so the argument is
// expected to be lower-cased already. It returns ("", false) when neither
// shape matches.
func matchFrenchLeadingArticlePhrase(lower string) (string, bool) {
	// Everything before the first ASCII space, or the whole input when there
	// is none. This covers both the bare article ("le") and the article that
	// introduces a phrase ("le serveur") with a single comparison.
	head := lower
	if sp := strings.IndexByte(lower, ' '); sp >= 0 {
		head = lower[:sp]
	}

	switch head {
	case "le", "la", "les", "au", "aux":
		return "definite", true
	case "un", "une", "du", "des":
		return "indefinite", true
	}

	// Elided forms are glued to the following word, so they can only be
	// detected as prefixes of the full input.
	switch {
	case strings.HasPrefix(lower, "l'"), strings.HasPrefix(lower, "l’"):
		return "definite", true
	case strings.HasPrefix(lower, "d'"), strings.HasPrefix(lower, "d’"):
		return "indefinite", true
	}

	return "", false
}
branche", "definite", true}, + {"les amis", "definite", true}, {"Le", "definite", true}, {"La", "definite", true}, + {"Un enfant", "indefinite", true}, + {"Une amie", "indefinite", true}, {"de la", "indefinite", true}, {"de l'", "indefinite", true}, {"de l’", "indefinite", true}, + {"du serveur", "indefinite", true}, + {"des amis", "indefinite", true}, {"un", "indefinite", true}, {"une", "indefinite", true}, + {"l'enfant", "definite", true}, + {"l’ami", "definite", true}, } for _, tt := range tests {