fix(reversal): recognise uncontracted French partitive articles
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
d5060197f5
commit
297fceff2e
2 changed files with 88 additions and 24 deletions
|
|
@@ -780,7 +780,7 @@ func matchFrenchArticleText(lower string) (string, bool) {
|
|||
switch {
|
||||
case strings.HasPrefix(lower, "de l'"), strings.HasPrefix(lower, "de l’"):
|
||||
return "indefinite", true
|
||||
case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "):
|
||||
case strings.HasPrefix(lower, "de la "), strings.HasPrefix(lower, "de le "), strings.HasPrefix(lower, "de les "), strings.HasPrefix(lower, "du "), strings.HasPrefix(lower, "des "):
|
||||
return "indefinite", true
|
||||
case strings.HasPrefix(lower, "au "), strings.HasPrefix(lower, "aux "):
|
||||
return "definite", true
|
||||
|
|
@@ -801,9 +801,9 @@ func matchFrenchArticleText(lower string) (string, bool) {
|
|||
case "de":
|
||||
if len(fields) >= 2 {
|
||||
switch fields[1] {
|
||||
case "la", "l'", "l’":
|
||||
case "la", "le", "les", "l'", "l’":
|
||||
return "indefinite", true
|
||||
case "le", "les":
|
||||
case "du", "des":
|
||||
return "definite", true
|
||||
}
|
||||
}
|
||||
|
|
@@ -1077,7 +1077,29 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
|
|||
|
||||
switch core.Lower(first) {
|
||||
case "de":
|
||||
if core.Lower(second) != "la" {
|
||||
switch core.Lower(second) {
|
||||
case "la", "le", "les", "du", "des":
|
||||
tok := Token{
|
||||
Raw: first + " " + second,
|
||||
Lower: core.Lower(first + " " + second),
|
||||
Type: TokenArticle,
|
||||
ArtType: "indefinite",
|
||||
Confidence: 1.0,
|
||||
}
|
||||
if secondPunct != "" {
|
||||
if punctType, ok := matchPunctuation(secondPunct); ok {
|
||||
punctTok := Token{
|
||||
Raw: secondPunct,
|
||||
Lower: secondPunct,
|
||||
Type: TokenPunctuation,
|
||||
PunctType: punctType,
|
||||
Confidence: 1.0,
|
||||
}
|
||||
return 2, tok, nil, &punctTok
|
||||
}
|
||||
}
|
||||
return 2, tok, nil, nil
|
||||
default:
|
||||
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l’") && rest != "" {
|
||||
tok := Token{
|
||||
Raw: first + " " + prefix,
|
||||
|
|
@@ -1130,26 +1152,6 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
|
|||
}
|
||||
return 0, Token{}, nil, nil
|
||||
}
|
||||
tok := Token{
|
||||
Raw: first + " " + second,
|
||||
Lower: "de la",
|
||||
Type: TokenArticle,
|
||||
ArtType: "indefinite",
|
||||
Confidence: 1.0,
|
||||
}
|
||||
if secondPunct != "" {
|
||||
if punctType, ok := matchPunctuation(secondPunct); ok {
|
||||
punctTok := Token{
|
||||
Raw: secondPunct,
|
||||
Lower: secondPunct,
|
||||
Type: TokenPunctuation,
|
||||
PunctType: punctType,
|
||||
Confidence: 1.0,
|
||||
}
|
||||
return 2, tok, nil, &punctTok
|
||||
}
|
||||
}
|
||||
return 2, tok, nil, nil
|
||||
}
|
||||
|
||||
return 0, Token{}, nil, nil
|
||||
|
|
|
|||
|
|
@@ -228,6 +228,8 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
|
|||
{"Un enfant", "indefinite", true},
|
||||
{"Une amie", "indefinite", true},
|
||||
{"de la", "indefinite", true},
|
||||
{"de le", "indefinite", true},
|
||||
{"de les", "indefinite", true},
|
||||
{"de l'", "indefinite", true},
|
||||
{"de l’", "indefinite", true},
|
||||
{"du serveur", "indefinite", true},
|
||||
|
|
@@ -436,6 +438,46 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
|
|||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
|
||||
}
|
||||
|
||||
tokens = tok.Tokenise("de le serveur")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de le serveur", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].ArtType != "indefinite" {
|
||||
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
|
||||
}
|
||||
if tokens[0].Lower != "de le" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de le")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "serveur" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "serveur")
|
||||
}
|
||||
|
||||
tokens = tok.Tokenise("de les amis")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de les amis", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].ArtType != "indefinite" {
|
||||
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
|
||||
}
|
||||
if tokens[0].Lower != "de les" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de les")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "amis" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "amis")
|
||||
}
|
||||
|
||||
tokens = tok.Tokenise("de l’ enfant")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l’ enfant", len(tokens))
|
||||
|
|
@@ -519,6 +561,26 @@ func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
|
|||
if tokens[1].Lower != "branche" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "branche")
|
||||
}
|
||||
|
||||
tokens = tok.Tokenise("de les amis")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de les amis", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].Lower != "de les" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de les")
|
||||
}
|
||||
if tokens[0].ArtType != "indefinite" {
|
||||
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "amis" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "amis")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise(t *testing.T) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue