diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index ad17075..9de4ecf 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -780,7 +780,7 @@ func matchFrenchArticleText(lower string) (string, bool) { switch { case strings.HasPrefix(lower, "de l'"), strings.HasPrefix(lower, "de l’"): return "indefinite", true - case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "): + case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "de les "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "): return "indefinite", true case strings.HasPrefix(lower, "au "), strings.HasPrefix(lower, "aux "): return "definite", true @@ -801,9 +801,9 @@ func matchFrenchArticleText(lower string) (string, bool) { case "de": if len(fields) >= 2 { switch fields[1] { - case "la", "l'", "l’": + case "la", "le", "les", "l'", "l’": return "indefinite", true - case "le", "les": + case "du", "des": return "definite", true } } @@ -1077,7 +1077,29 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To switch core.Lower(first) { case "de": - if core.Lower(second) != "la" { + switch core.Lower(second) { + case "la", "le", "les", "du", "des": + tok := Token{ + Raw: first + " " + second, + Lower: core.Lower(first + " " + second), + Type: TokenArticle, + ArtType: "indefinite", + Confidence: 1.0, + } + if secondPunct != "" { + if punctType, ok := matchPunctuation(secondPunct); ok { + punctTok := Token{ + Raw: secondPunct, + Lower: secondPunct, + Type: TokenPunctuation, + PunctType: punctType, + Confidence: 1.0, + } + return 2, tok, nil, &punctTok + } + } + return 2, tok, nil, nil + default: if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l’") && rest != "" { tok := Token{ Raw: first + " " + prefix, @@ -1130,26 +1152,6 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To } return 0, Token{}, nil, nil } - tok := Token{ - Raw: first + " " + second, - Lower: "de la", - Type: TokenArticle, - ArtType: "indefinite", - Confidence: 1.0, - } - if secondPunct != "" { - if punctType, ok := matchPunctuation(secondPunct); ok { - punctTok := Token{ - Raw: secondPunct, - Lower: secondPunct, - Type: TokenPunctuation, - PunctType: punctType, - Confidence: 1.0, - } - return 2, tok, nil, &punctTok - } - } - return 2, tok, nil, nil } return 0, Token{}, nil, nil diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 3b0b8d4..92572ac 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -228,6 +228,8 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) { {"Un enfant", "indefinite", true}, {"Une amie", "indefinite", true}, {"de la", "indefinite", true}, + {"de le", "indefinite", true}, + {"de les", "indefinite", true}, {"de l'", "indefinite", true}, {"de l’", "indefinite", true}, {"du serveur", "indefinite", true}, @@ -436,6 +438,46 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) { t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant") } + tokens = tok.Tokenise("de le serveur") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de le serveur", len(tokens)) + } + if tokens[0].Type != TokenArticle { + t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) + } + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") + } + if tokens[0].Lower != "de le" { + t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de le") + } + if tokens[1].Type != TokenNoun { + t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) + } + if tokens[1].Lower != "serveur" { + t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "serveur") + } + + tokens = tok.Tokenise("de les amis") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de les amis", len(tokens)) + } + if tokens[0].Type != TokenArticle { + t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) + } + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") + } + if tokens[0].Lower != "de les" { + t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de les") + } + if tokens[1].Type != TokenNoun { + t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) + } + if tokens[1].Lower != "amis" { + t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "amis") + } + tokens = tok.Tokenise("de l’ enfant") if len(tokens) != 2 { t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l’ enfant", len(tokens)) @@ -519,6 +561,26 @@ func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) { if tokens[1].Lower != "branche" { t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "branche") } + + tokens = tok.Tokenise("de les amis") + if len(tokens) != 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de les amis", len(tokens)) + } + if tokens[0].Type != TokenArticle { + t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type) + } + if tokens[0].Lower != "de les" { + t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de les") + } + if tokens[0].ArtType != "indefinite" { + t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite") + } + if tokens[1].Type != TokenNoun { + t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type) + } + if tokens[1].Lower != "amis" { + t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "amis") + } } func TestTokeniser_Tokenise(t *testing.T) {