[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #121

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-02 05:03:29 +00:00
2 changed files with 40 additions and 11 deletions

View file

@ -900,7 +900,7 @@ func (t *Tokeniser) Tokenise(text string) []Token {
if artType, ok := t.MatchArticle(prefix); ok {
tokens = append(tokens, Token{
Raw: prefix,
Lower: core.Lower(prefix),
Lower: normalizeFrenchApostrophes(core.Lower(prefix)),
Type: TokenArticle,
ArtType: artType,
Confidence: 1.0,
@ -1085,7 +1085,7 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
case "la", "le", "les", "du", "des":
tok := Token{
Raw: first + " " + second,
Lower: core.Lower(first + " " + second),
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
@ -1104,10 +1104,10 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
}
return 2, tok, nil, nil
default:
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l") && rest != "" {
if prefix, rest, ok := t.splitFrenchElision(second); ok && normalizeFrenchApostrophes(prefix) == "l'" && rest != "" {
tok := Token{
Raw: first + " " + prefix,
Lower: core.Lower(first + " " + prefix),
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + prefix)),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
@ -1128,12 +1128,12 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, To
return 2, tok, &extra, punctTok
}
// Handle spaced elision forms such as "de l' enfant" or "de l enfant".
if (second == "l'" || second == "l") && start+2 < len(parts) {
if normalizeFrenchApostrophes(second) == "l'" && start+2 < len(parts) {
third, thirdPunct := splitTrailingPunct(parts[start+2])
if third != "" {
tok := Token{
Raw: first + " " + second,
Lower: core.Lower(first + " " + second),
Lower: normalizeFrenchApostrophes(core.Lower(first + " " + second)),
Type: TokenArticle,
ArtType: "indefinite",
Confidence: 1.0,
@ -1549,7 +1549,7 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
}
if idx < len(raw) {
r, size := utf8.DecodeRuneInString(raw[idx:])
if r != '\'' && r != '’' {
if !isFrenchApostrophe(r) {
continue
}
if size > 0 {
@ -1567,10 +1567,20 @@ func (t *Tokeniser) isFrenchLanguage() bool {
}
func normalizeFrenchApostrophes(s string) string {
if s == "" || !strings.ContainsRune(s, '’') {
if s == "" || (!strings.ContainsRune(s, '’') && !strings.ContainsRune(s, 'ʼ')) {
return s
}
return strings.ReplaceAll(s, "’", "'")
s = strings.ReplaceAll(s, "’", "'")
return strings.ReplaceAll(s, "ʼ", "'")
}
// isFrenchApostrophe reports whether r is any apostrophe form accepted in
// French elision: the ASCII apostrophe ('), the typographic right single
// quotation mark (U+2019 ’), or the modifier letter apostrophe (U+02BC ʼ).
func isFrenchApostrophe(r rune) bool {
	switch r {
	case '\'', '’', 'ʼ':
		return true
	default:
		return false
	}
}
// matchPunctuation detects known punctuation patterns.

View file

@ -309,8 +309,10 @@ func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
}{
{"l'", "definite", true},
{"l", "definite", true},
{"lʼ", "definite", true},
{"L'", "definite", true},
{"L", "definite", true},
{"Lʼ", "definite", true},
{"les", "definite", true},
{"au", "definite", true},
{"aux", "definite", true},
@ -509,8 +511,25 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].Lower != "de l" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l")
if tokens[0].Lower != "de l'" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
}
if tokens[1].Lower != "enfant" {
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
}
tokens = tok.Tokenise("de lʼenfant")
if len(tokens) != 2 {
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de lʼenfant", len(tokens))
}
if tokens[0].Type != TokenArticle {
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
}
if tokens[0].Lower != "de l'" {
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
}
if tokens[1].Type != TokenNoun {
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)