2026-04-01 05:10:57 +00:00
2 changed files with 54 additions and 14 deletions
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -50,17 +50,17 @@ const (

 // Token represents a single classified token from a text string.
 type Token struct {
-	Raw        string          // Original text as it appeared in input
-	Lower      string          // Lowercased form
-	Type       TokenType       // Classification
-	Confidence float64         // 0.0-1.0 classification confidence
-	AltType    TokenType       // Runner-up classification (dual-class only)
-	AltConf    float64         // Runner-up confidence
-	VerbInfo   VerbMatch       // Set when Type OR AltType == TokenVerb
-	NounInfo   NounMatch       // Set when Type OR AltType == TokenNoun
-	WordCat    string          // Set when Type == TokenWord
-	ArtType    string          // Set when Type == TokenArticle
-	PunctType  string          // Set when Type == TokenPunctuation
+	Raw        string           // Original text as it appeared in input
+	Lower      string           // Lowercased form
+	Type       TokenType        // Classification
+	Confidence float64          // 0.0-1.0 classification confidence
+	AltType    TokenType        // Runner-up classification (dual-class only)
+	AltConf    float64          // Runner-up confidence
+	VerbInfo   VerbMatch        // Set when Type OR AltType == TokenVerb
+	NounInfo   NounMatch        // Set when Type OR AltType == TokenNoun
+	WordCat    string           // Set when Type == TokenWord
+	ArtType    string           // Set when Type == TokenArticle
+	PunctType  string           // Set when Type == TokenPunctuation
 	Signals    *SignalBreakdown // Non-nil only when WithSignals() option is set
 }

@@ -593,6 +593,11 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 	if lower == core.Lower(data.Articles.Definite) {
 		return "definite", true
 	}
+	for _, article := range data.Articles.ByGender {
+		if lower == core.Lower(article) {
+			return "definite", true
+		}
+	}

 	return "", false
 }
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -20,9 +20,9 @@ func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
 	tok := NewTokeniser()

 	tests := []struct {
-		word    string
-		wantOK  bool
-		wantBase string
+		word      string
+		wantOK    bool
+		wantBase  string
 		wantTense string
 	}{
 		// Irregular past tense
@@ -206,6 +206,41 @@ func TestTokeniser_MatchArticle(t *testing.T) {
 	}
 }

+func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
+	setup(t)
+	tok := NewTokeniserForLang("fr") // tokeniser built for the "fr" language code
+
+	tests := []struct {
+		word     string // input passed to MatchArticle
+		wantType string // expected article type; only compared when wantOK is true
+		wantOK   bool   // expected second return value of MatchArticle
+	}{
+		{"le", "definite", true},   // masculine definite article
+		{"la", "definite", true},   // feminine definite article
+		{"Le", "definite", true},   // capitalised input must match as well
+		{"La", "definite", true},   // capitalised feminine form
+		{"un", "indefinite", true}, // masculine indefinite article
+		{"une", "", false},         // NOTE(review): "une" is the French feminine indefinite article, yet it is pinned here as unmatched — confirm intended
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) { // one subtest per word, named after the input
+			artType, ok := tok.MatchArticle(tt.word)
+			if ok != tt.wantOK { // Fatalf stops the subtest, so the type check below never runs on a wrong ok
+				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if ok && artType != tt.wantType { // type is compared only for words reported as articles
+				t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
+			}
+		})
+	}
+
+	tokens := tok.Tokenise("la branche") // end-to-end: the leading "la" must come out as an article token
+	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
+		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "la branche", tokens)
+	}
+}
+
 func TestTokeniser_Tokenise(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()