diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index c58bf02..bc7ac39 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -683,6 +683,9 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 		}
 	}
 	if t.isFrenchLanguage() {
+		if artType, ok := matchFrenchArticleText(lower); ok {
+			return artType, true
+		}
 		if artType, ok := matchFrenchAttachedArticle(lower); ok {
 			return artType, true
 		}
@@ -701,6 +704,53 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 	return "", false
 }
 
+// matchFrenchArticleText classifies the French article that opens lower, which
+// may be a single word ("les") or a multi-word phrase ("de l'enfant").
+//
+// It returns "definite" or "indefinite" together with true when the leading
+// whitespace-separated token, or a "de" + article pair, is recognised. Anything
+// else is handed to matchFrenchAttachedArticle. No case folding is done here;
+// the parameter name suggests the caller passes lower-cased text.
+func matchFrenchArticleText(lower string) (string, bool) {
+	fields := strings.Fields(lower)
+	if len(fields) == 0 {
+		return "", false
+	}
+
+	switch fields[0] {
+	case "l'", "l’", "les", "au", "aux":
+		return "definite", true
+	case "un", "une", "du", "des", "d'", "d’":
+		return "indefinite", true
+	case "de":
+		if len(fields) >= 2 {
+			next := fields[1]
+			switch {
+			case next == "la", next == "le",
+				strings.HasPrefix(next, "l'"), strings.HasPrefix(next, "l’"):
+				// "de le" is the uncontracted spelling of "du", so it gets the
+				// same answer whether or not more text follows it.
+				return "indefinite", true
+			case next == "les":
+				// NOTE(review): kept as first written, although the contracted
+				// form "des" is reported as indefinite above; confirm intent.
+				return "definite", true
+			}
+		}
+	case "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’":
+		// NOTE(review): these elided clitics are not articles; the "definite"
+		// result is kept as first written.
+		return "definite", true
+	}
+
+	// No stand-alone article token was found, so try the attached-prefix matcher.
+	if artType, ok := matchFrenchAttachedArticle(lower); ok {
+		return artType, true
+	}
+
+	return "", false
+}
+
 func matchFrenchAttachedArticle(lower string) (string, bool) {
 	for _, prefix := range frenchElisionPrefixes {
 		if !strings.HasPrefix(lower, prefix) {
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 1611fcf..c09af7a 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -304,6 +304,8 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
 		{"du", "indefinite", true},
 		{"des", "indefinite", true},
 		{"l'enfant", "definite", true},
+		{"de l'enfant", "indefinite", true},
+		{"de l’ami", "indefinite", true},
 	}
 
 	for _, tt := range tests {