diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 501665a..2cdf151 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -50,17 +50,17 @@ const ( // Token represents a single classified token from a text string. type Token struct { - Raw string // Original text as it appeared in input - Lower string // Lowercased form - Type TokenType // Classification - Confidence float64 // 0.0-1.0 classification confidence - AltType TokenType // Runner-up classification (dual-class only) - AltConf float64 // Runner-up confidence - VerbInfo VerbMatch // Set when Type OR AltType == TokenVerb - NounInfo NounMatch // Set when Type OR AltType == TokenNoun - WordCat string // Set when Type == TokenWord - ArtType string // Set when Type == TokenArticle - PunctType string // Set when Type == TokenPunctuation + Raw string // Original text as it appeared in input + Lower string // Lowercased form + Type TokenType // Classification + Confidence float64 // 0.0-1.0 classification confidence + AltType TokenType // Runner-up classification (dual-class only) + AltConf float64 // Runner-up confidence + VerbInfo VerbMatch // Set when Type OR AltType == TokenVerb + NounInfo NounMatch // Set when Type OR AltType == TokenNoun + WordCat string // Set when Type == TokenWord + ArtType string // Set when Type == TokenArticle + PunctType string // Set when Type == TokenPunctuation Signals *SignalBreakdown // Non-nil only when WithSignals() option is set } @@ -593,6 +593,11 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) { if lower == core.Lower(data.Articles.Definite) { return "definite", true } + for _, article := range data.Articles.ByGender { + if lower == core.Lower(article) { + return "definite", true + } + } return "", false } diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 6cff2b0..26f3bca 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -20,9 +20,9 @@ func TestTokeniser_MatchVerb_Irregular(t *testing.T) { tok := NewTokeniser() tests := []struct { - word string - wantOK bool - wantBase string + word string + wantOK bool + wantBase string wantTense string }{ // Irregular past tense @@ -206,6 +206,41 @@ func TestTokeniser_MatchArticle(t *testing.T) { } } +func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) { + setup(t) + tok := NewTokeniserForLang("fr") + + tests := []struct { + word string + wantType string + wantOK bool + }{ + {"le", "definite", true}, + {"la", "definite", true}, + {"Le", "definite", true}, + {"La", "definite", true}, + {"un", "indefinite", true}, + {"une", "", false}, + } + + for _, tt := range tests { + t.Run(tt.word, func(t *testing.T) { + artType, ok := tok.MatchArticle(tt.word) + if ok != tt.wantOK { + t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK) + } + if ok && artType != tt.wantType { + t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType) + } + }) + } + + tokens := tok.Tokenise("la branche") + if len(tokens) == 0 || tokens[0].Type != TokenArticle { + t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "la branche", tokens) + } +} + func TestTokeniser_Tokenise(t *testing.T) { setup(t) tok := NewTokeniser()