fix(reversal): accept modifier apostrophes in french elision
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
7f6240caf3
commit
2f43e2731f
2 changed files with 40 additions and 11 deletions
|
|
@ -900,7 +900,7 @@ func (t *Tokeniser) Tokenise(text string) []Token {
|
|||
if artType, ok := t.MatchArticle(prefix); ok {
|
||||
tokens = append(tokens, Token{
|
||||
Raw: prefix,
|
||||
Lower: core.Lower(prefix),
|
||||
Lower: normalizeFrenchApostrophes(core.Lower(prefix)),
|
||||
Type: TokenArticle,
|
||||
ArtType: artType,
|
||||
Confidence: 1.0,
|
||||
|
|
@ -1085,7 +1085,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
|
|||
case "la", "le", "les", "du", "des":
|
||||
tok := Token{
|
||||
Raw: first + " " + second,
|
||||
Lower: core.Lower(first + " " + second),
|
||||
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)),
|
||||
Type: TokenArticle,
|
||||
ArtType: "indefinite",
|
||||
Confidence: 1.0,
|
||||
|
|
@ -1104,10 +1104,10 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
|
|||
}
|
||||
return 2, tok, nil, nil
|
||||
default:
|
||||
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l’") && rest != "" {
|
||||
if prefix, rest, ok := t.splitFrenchElision(second); ok && normalizeFrenchApostrophes(prefix) == "l'" && rest != "" {
|
||||
tok := Token{
|
||||
Raw: first + " " + prefix,
|
||||
Lower: core.Lower(first + " " + prefix),
|
||||
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + prefix)),
|
||||
Type: TokenArticle,
|
||||
ArtType: "indefinite",
|
||||
Confidence: 1.0,
|
||||
|
|
@ -1128,12 +1128,12 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
|
|||
return 2, tok, &extra, punctTok
|
||||
}
|
||||
// Handle spaced elision forms such as "de l' enfant" or "de l’ enfant".
|
||||
if (second == "l'" || second == "l’") && start+2 < len(parts) {
|
||||
if normalizeFrenchApostrophes(second) == "l'" && start+2 < len(parts) {
|
||||
third, thirdPunct := splitTrailingPunct(parts[start+2])
|
||||
if third != "" {
|
||||
tok := Token{
|
||||
Raw: first + " " + second,
|
||||
Lower: core.Lower(first + " " + second),
|
||||
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)),
|
||||
Type: TokenArticle,
|
||||
ArtType: "indefinite",
|
||||
Confidence: 1.0,
|
||||
|
|
@ -1549,7 +1549,7 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
|
|||
}
|
||||
if idx < len(raw) {
|
||||
r, size := utf8.DecodeRuneInString(raw[idx:])
|
||||
if r != '\'' && r != '’' {
|
||||
if !isFrenchApostrophe(r) {
|
||||
continue
|
||||
}
|
||||
if size > 0 {
|
||||
|
|
@ -1567,10 +1567,20 @@ func (t *Tokeniser) isFrenchLanguage() bool {
|
|||
}
|
||||
|
||||
// normalizeFrenchApostrophes canonicalises apostrophe variants in s,
// mapping the typographic right single quote (U+2019 ’) and the
// modifier-letter apostrophe (U+02BC ʼ) to the ASCII apostrophe so
// that downstream comparisons against "l'" succeed regardless of
// which character the input text used.
//
// Strings containing neither variant (including the empty string) are
// returned unchanged without allocating.
func normalizeFrenchApostrophes(s string) string {
	// Fast path: nothing to rewrite, hand back the original string.
	if !strings.ContainsAny(s, "’ʼ") {
		return s
	}
	// Single pass over the runes, folding both variants to '\''.
	return strings.Map(func(r rune) rune {
		if r == '’' || r == 'ʼ' {
			return '\''
		}
		return r
	}, s)
}
|
||||
|
||||
// isFrenchApostrophe reports whether r is one of the apostrophe
// characters accepted as an elision marker in French text: the ASCII
// apostrophe, the typographic right single quote (U+2019 ’), or the
// modifier-letter apostrophe (U+02BC ʼ).
func isFrenchApostrophe(r rune) bool {
	return r == '\'' || r == '’' || r == 'ʼ'
}
|
||||
|
||||
// matchPunctuation detects known punctuation patterns.
|
||||
|
|
|
|||
|
|
@ -309,8 +309,10 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
|
|||
}{
|
||||
{"l'", "definite", true},
|
||||
{"l’", "definite", true},
|
||||
{"lʼ", "definite", true},
|
||||
{"L'", "definite", true},
|
||||
{"L’", "definite", true},
|
||||
{"Lʼ", "definite", true},
|
||||
{"les", "definite", true},
|
||||
{"au", "definite", true},
|
||||
{"aux", "definite", true},
|
||||
|
|
@ -509,8 +511,25 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
|
|||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].Lower != "de l’" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l’")
|
||||
if tokens[0].Lower != "de l'" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "enfant" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
|
||||
}
|
||||
|
||||
tokens = tok.Tokenise("de lʼenfant")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de lʼenfant", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].Lower != "de l'" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue