[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #121

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-02 05:03:29 +00:00
2 changed files with 40 additions and 11 deletions

View file

@ -900,7 +900,7 @@ func (t *Tokeniser) Tokenise(text string) []Token {
if artType, ok := t.MatchArticle(prefix); ok {
tokens = append(tokens, Token{
Raw: prefix,
Lower: core.Lower(prefix),
Lower: normalizeFrenchApostrophes(core.Lower(prefix)),
Type: TokenArticle,
ArtType: artType,
Confidence: 1.0,
@ -1085,7 +1085,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
case "la", "le", "les", "du", "des":
tok := Token{
Raw: first + " " + second,
Lower: core.Lower(first + " " + second),
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
@ -1104,10 +1104,10 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
}
return 2, tok, nil, nil
default:
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l") && rest != "" {
if prefix, rest, ok := t.splitFrenchElision(second); ok && normalizeFrenchApostrophes(prefix) == "l'" && rest != "" {
tok := Token{
Raw: first + " " + prefix,
Lower: core.Lower(first + " " + prefix),
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + prefix)),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
@ -1128,12 +1128,12 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
return 2, tok, &extra, punctTok
}
// Handle spaced elision forms such as "de l' enfant" or "de l enfant".
if (second == "l'" || second == "l") && start+2 < len(parts) {
if normalizeFrenchApostrophes(second) == "l'" && start+2 < len(parts) {
third, thirdPunct := splitTrailingPunct(parts[start+2])
if third != "" {
tok := Token{
Raw: first + " " + second,
Lower: core.Lower(first + " " + second),
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
@ -1549,7 +1549,7 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
}
if idx < len(raw) {
r, size := utf8.DecodeRuneInString(raw[idx:])
if r != '\'' && r != '’' {
if !isFrenchApostrophe(r) {
continue
}
if size > 0 {
@ -1567,10 +1567,20 @@ func (t *Tokeniser) isFrenchLanguage() bool {
}
func normalizeFrenchApostrophes(s string) string {
if s == "" || !strings.ContainsRune(s, '’') {
if s == "" || (!strings.ContainsRune(s, '’') && !strings.ContainsRune(s, 'ʼ')) {
return s
}
return strings.ReplaceAll(s, "’", "'")
s = strings.ReplaceAll(s, "’", "'")
return strings.ReplaceAll(s, "ʼ", "'")
}
// isFrenchApostrophe reports whether r is any apostrophe form accepted in
// French elision: the ASCII apostrophe ('), the typographic right single
// quotation mark (U+2019 ’), or the modifier letter apostrophe (U+02BC ʼ).
func isFrenchApostrophe(r rune) bool {
	switch r {
	case '\'', '’', 'ʼ':
		return true
	default:
		return false
	}
}
// matchPunctuation detects known punctuation patterns.

View file

@ -309,8 +309,10 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
}{
{"l'", "definite", true},
{"l", "definite", true},
{"lʼ", "definite", true},
{"L'", "definite", true},
{"L", "definite", true},
{"Lʼ", "definite", true},
{"les", "definite", true},
{"au", "definite", true},
{"aux", "definite", true},
@ -509,8 +511,25 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].Lower != "de l" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l")
if tokens[0].Lower != "de l'" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
}
if tokens[1].Lower != "enfant" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
}
tokens = tok.Tokenise("de lʼenfant")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de lʼenfant", len(tokens))
}
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].Lower != "de l'" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)