2026-04-02 00:20:21 +00:00
2 changed files with 77 additions and 0 deletions
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -662,6 +662,14 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 			i += consumed - 1
 			continue
 		}
+		if consumed, tok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
+			tokens = append(tokens, tok)
+			if punctTok != nil {
+				tokens = append(tokens, *punctTok)
+			}
+			i += consumed - 1
+			continue
+		}

 		raw := parts[i]
 		if prefix, rest, ok := t.splitFrenchElision(raw); ok {
@@ -833,6 +841,50 @@ func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Tok
 	return 0, Token{}, nil
 }

+// matchFrenchArticlePhrase recognises the two-word French article "de la"
+// beginning at parts[start]; it never matches unless t.isFrenchLanguage()
+// reports true. On a match it returns 2 (the number of parts consumed), the
+// combined article token and, when punctuation split off the second word is
+// accepted by matchPunctuation, a token for it; else (0, Token{}, nil).
+func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
+	if !t.isFrenchLanguage() || start+1 >= len(parts) {
+		return 0, Token{}, nil
+	}
+
+	head, headPunct := splitTrailingPunct(parts[start])
+	// Punctuation split off the first word rules the phrase out.
+	if head == "" || headPunct != "" {
+		return 0, Token{}, nil
+	}
+	tail, tailPunct := splitTrailingPunct(parts[start+1])
+	if tail == "" || core.Lower(head) != "de" || core.Lower(tail) != "la" {
+		return 0, Token{}, nil
+	}
+
+	article := Token{
+		Raw:        head + " " + tail,
+		Lower:      "de la",
+		Type:       TokenArticle,
+		ArtType:    "definite",
+		Confidence: 1.0,
+	}
+	if tailPunct == "" {
+		return 2, article, nil
+	}
+	punctType, ok := matchPunctuation(tailPunct)
+	if !ok {
+		// Punctuation that matchPunctuation rejects yields no extra token.
+		return 2, article, nil
+	}
+	return 2, article, &Token{
+		Raw:        tailPunct,
+		Lower:      tailPunct,
+		Type:       TokenPunctuation,
+		PunctType:  punctType,
+		Confidence: 1.0,
+	}
+}
+
 // resolveAmbiguous iterates all tokens and resolves any marked as
 // tokenAmbiguous using the weighted scoring function.
 func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -374,6 +374,31 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
 	}
 }

+func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
+	setup(t)
+	tok := NewTokeniserForLang("fr")
+
+	tokens := tok.Tokenise("de la branche")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de la branche", len(tokens))
+	}
+	if tokens[0].Type != TokenArticle {
+		t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
+	}
+	if tokens[0].Lower != "de la" {
+		t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de la")
+	}
+	if tokens[0].ArtType != "definite" {
+		t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite")
+	}
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
+	}
+	if tokens[1].Lower != "branche" {
+		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "branche")
+	}
+}
+
 func TestTokeniser_Tokenise(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()