diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index dcb91e0..c4b38c4 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -658,6 +658,9 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) { } } if t.isFrenchLanguage() { + if artType, ok := matchFrenchAttachedArticle(lower); ok { + return artType, true + } switch lower { case "l'", "l’", "d'", "d’", "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’", "de l'", "de l’", "de la", "les", "au", "aux", "du": return "definite", true @@ -669,6 +672,23 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) { return "", false } +func matchFrenchAttachedArticle(lower string) (string, bool) { + for _, prefix := range frenchElisionPrefixes { + if !strings.HasPrefix(lower, prefix) { + continue + } + rest := strings.TrimPrefix(lower, prefix) + if rest == "" { + continue + } + if !strings.HasPrefix(rest, "'") && !strings.HasPrefix(rest, "’") { + continue + } + return "definite", true + } + return "", false +} + // tokenAmbiguous is an internal sentinel used during Pass 1 to mark // dual-class base forms that need disambiguation in Pass 2. const tokenAmbiguous TokenType = -1 diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 840a87c..dd90f60 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -303,7 +303,7 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) { {"aux", "definite", true}, {"du", "definite", true}, {"des", "indefinite", true}, - {"l'enfant", "", false}, + {"l'enfant", "definite", true}, } for _, tt := range tests {