feat(reversal): support French typographic elision
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
18cc20ff7b
commit
e71280189b
2 changed files with 32 additions and 4 deletions
|
|
@ -17,6 +17,7 @@ package reversal
|
|||
|
||||
import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"dappco.re/go/core"
|
||||
i18n "dappco.re/go/core/i18n"
|
||||
|
|
@ -605,7 +606,7 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
|
|||
}
|
||||
if t.isFrenchLanguage() {
|
||||
switch lower {
|
||||
case "l'", "les":
|
||||
case "l'", "l’", "les":
|
||||
return "definite", true
|
||||
case "un", "une", "des":
|
||||
return "indefinite", true
|
||||
|
|
@ -1112,13 +1113,24 @@ func splitTrailingPunct(s string) (string, string) {
|
|||
}
|
||||
|
||||
func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
|
||||
if !t.isFrenchLanguage() || len(raw) <= 2 {
|
||||
if !t.isFrenchLanguage() || len(raw) == 0 {
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
lower := core.Lower(raw)
|
||||
if len(lower) > 2 && lower[0] == 'l' && lower[1] == '\'' {
|
||||
return raw[:2], raw[2:], true
|
||||
if len(lower) < 2 {
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
if lower[0] != 'l' {
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
if idx := strings.IndexAny(raw, "'’"); idx == 1 {
|
||||
_, size := utf8.DecodeRuneInString(raw[idx:])
|
||||
if size > 0 {
|
||||
return raw[:idx+size], raw[idx+size:], true
|
||||
}
|
||||
}
|
||||
|
||||
return "", raw, false
|
||||
|
|
|
|||
|
|
@ -292,7 +292,9 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
|
|||
wantOK bool
|
||||
}{
|
||||
{"l'", "definite", true},
|
||||
{"l’", "definite", true},
|
||||
{"L'", "definite", true},
|
||||
{"L’", "definite", true},
|
||||
{"les", "definite", true},
|
||||
{"des", "indefinite", true},
|
||||
{"l'enfant", "", false},
|
||||
|
|
@ -331,6 +333,20 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
|
|||
if tokens[1].Lower != "enfant" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
|
||||
}
|
||||
|
||||
tokens = tok.Tokenise("l’enfant")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l’enfant", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "enfant" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise(t *testing.T) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue