[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #16

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-01 05:57:38 +00:00
2 changed files with 93 additions and 0 deletions

View file

@ -598,6 +598,14 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
return "definite", true
}
}
if t.isFrenchLanguage() {
switch lower {
case "l'", "les":
return "definite", true
case "des":
return "indefinite", true
}
}
return "", false
}
@ -629,6 +637,22 @@ func (t *Tokeniser) Tokenise(text string) []Token {
// --- Pass 1: Classify & Mark ---
for _, raw := range parts {
if prefix, rest, ok := t.splitFrenchElision(raw); ok {
if artType, ok := t.MatchArticle(prefix); ok {
tokens = append(tokens, Token{
Raw: prefix,
Lower: core.Lower(prefix),
Type: TokenArticle,
ArtType: artType,
Confidence: 1.0,
})
}
raw = rest
if raw == "" {
continue
}
}
// Strip trailing punctuation to get the clean word.
word, punct := splitTrailingPunct(raw)
@ -962,6 +986,24 @@ func splitTrailingPunct(s string) (string, string) {
return s, ""
}
// splitFrenchElision splits a leading French elided article ("l'") off raw.
// It returns (prefix, rest, true) when raw begins with "l'" (any case) and
// has at least one character after the apostrophe; otherwise it returns
// ("", raw, false) so the caller can process raw unchanged. Only the ASCII
// apostrophe is recognised here.
func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
	// Elision is a French-only phenomenon; anything of length <= 2 has no
	// material after the "l'" prefix and is left intact.
	if !t.isFrenchLanguage() {
		return "", raw, false
	}
	if len(raw) <= 2 {
		return "", raw, false
	}
	lower := core.Lower(raw)
	// Guard the byte indexes against core.Lower shortening the string
	// (lowercasing can change byte length for some non-ASCII input).
	if len(lower) <= 2 || lower[0] != 'l' || lower[1] != '\'' {
		return "", raw, false
	}
	// Slice the original (not the lowered copy) so the caller keeps the
	// source casing, e.g. "L'Enfant" -> ("L'", "Enfant").
	return raw[:2], raw[2:], true
}
// isFrenchLanguage reports whether the tokeniser's language tag denotes
// French: either the bare tag "fr" or any hyphenated subtag form such as
// "fr-FR" or "fr-CA". The comparison is case-insensitive.
func (t *Tokeniser) isFrenchLanguage() bool {
	tag := core.Lower(t.lang)
	if tag == "fr" {
		return true
	}
	return core.HasPrefix(tag, "fr-")
}
// matchPunctuation detects known punctuation patterns.
// Returns the punctuation type and true if recognised.
func matchPunctuation(punct string) (string, bool) {

View file

@ -241,6 +241,57 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
}
}
// TestTokeniser_MatchArticle_FrenchExtended covers the extended French
// article set: the elided "l'" (both cases), plural "les", and the
// indefinite "des". A full elided word like "l'enfant" must not match.
func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		word     string
		wantType string
		wantOK   bool
	}{
		{word: "l'", wantType: "definite", wantOK: true},
		{word: "L'", wantType: "definite", wantOK: true},
		{word: "les", wantType: "definite", wantOK: true},
		{word: "des", wantType: "indefinite", wantOK: true},
		{word: "l'enfant", wantOK: false},
	}
	for _, tc := range cases {
		t.Run(tc.word, func(t *testing.T) {
			gotType, ok := tok.MatchArticle(tc.word)
			if ok != tc.wantOK {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.word, ok, tc.wantOK)
			}
			// The article type is only meaningful on a successful match.
			if ok && gotType != tc.wantType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.word, gotType, tc.wantType)
			}
		})
	}
}
// TestTokeniser_Tokenise_FrenchElision verifies that "l'enfant" tokenises
// into an article token for the elided "l'" followed by a noun token for
// "enfant". Beyond the token types, it pins the article token's Raw, Lower,
// and Confidence fields, which the elision path populates explicitly and
// which were previously unchecked — a regression there would have passed.
func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	tokens := tok.Tokenise("l'enfant")
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l'enfant", len(tokens))
	}
	if tokens[0].Type != TokenArticle {
		t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
	}
	if tokens[0].ArtType != "definite" {
		t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite")
	}
	// The article token is built from the elision prefix verbatim.
	if tokens[0].Raw != "l'" {
		t.Fatalf("tokens[0].Raw = %q, want %q", tokens[0].Raw, "l'")
	}
	if tokens[0].Lower != "l'" {
		t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "l'")
	}
	if tokens[0].Confidence != 1.0 {
		t.Fatalf("tokens[0].Confidence = %v, want 1.0", tokens[0].Confidence)
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
	}
	if tokens[1].Lower != "enfant" {
		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
	}
}
func TestTokeniser_Tokenise(t *testing.T) {
setup(t)
tok := NewTokeniser()