diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index f18e29d..c3ecfae 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -662,6 +662,14 @@ func (t *Tokeniser) Tokenise(text string) []Token { i += consumed - 1 continue } + if consumed, tok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 { + tokens = append(tokens, tok) + if punctTok != nil { + tokens = append(tokens, *punctTok) + } + i += consumed - 1 + continue + } raw := parts[i] if prefix, rest, ok := t.splitFrenchElision(raw); ok { @@ -833,6 +841,50 @@ func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Tok return 0, Token{}, nil } +func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) { + if !t.isFrenchLanguage() || start+1 >= len(parts) { + return 0, Token{}, nil + } + + first, firstPunct := splitTrailingPunct(parts[start]) + if first == "" || firstPunct != "" { + return 0, Token{}, nil + } + second, secondPunct := splitTrailingPunct(parts[start+1]) + if second == "" { + return 0, Token{}, nil + } + + switch core.Lower(first) { + case "de": + if core.Lower(second) != "la" { + return 0, Token{}, nil + } + tok := Token{ + Raw: first + " " + second, + Lower: "de la", + Type: TokenArticle, + ArtType: "partitive", + Confidence: 1.0, + } + if secondPunct != "" { + if punctType, ok := matchPunctuation(secondPunct); ok { + punctTok := Token{ + Raw: secondPunct, + Lower: secondPunct, + Type: TokenPunctuation, + PunctType: punctType, + Confidence: 1.0, + } + return 2, tok, &punctTok + } + } + return 2, tok, nil + } + + return 0, Token{}, nil +} + // resolveAmbiguous iterates all tokens and resolves any marked as // tokenAmbiguous using the weighted scoring function. 
func (t *Tokeniser) resolveAmbiguous(tokens []Token) { diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 2e4bae2..13eb735 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -374,6 +374,31 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { } } +func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) { + setup(t) + tok := NewTokeniserForLang("fr") + + tokens := tok.Tokenise("de la branche") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de la branche", len(tokens)) + } + if tokens[0].Type != TokenArticle { + t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) + } + if tokens[0].Lower != "de la" { + t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de la") + } + if tokens[0].ArtType != "partitive" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "partitive") + } + if tokens[1].Type != TokenNoun { + t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) + } + if tokens[1].Lower != "branche" { + t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "branche") + } +} + func TestTokeniser_Tokenise(t *testing.T) { setup(t) tok := NewTokeniser()