[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #125

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-02 05:20:22 +00:00
2 changed files with 158 additions and 14 deletions

View file

@ -707,16 +707,18 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b
}
lower = normalizeFrenchApostrophes(lower)
if lower == core.Lower(data.Articles.IndefiniteDefault) ||
lower == core.Lower(data.Articles.IndefiniteVowel) {
return "indefinite", true
if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.IndefiniteDefault, "indefinite"); ok {
return artType, true
}
if lower == core.Lower(data.Articles.Definite) {
return "definite", true
if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.IndefiniteVowel, "indefinite"); ok {
return artType, true
}
if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.Definite, "definite"); ok {
return artType, true
}
for _, article := range data.Articles.ByGender {
if lower == core.Lower(article) {
return "definite", true
if artType, ok := matchConfiguredArticleCandidate(lower, article, "definite"); ok {
return artType, true
}
}
@ -725,16 +727,18 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b
if prefix == "" {
return "", false
}
if prefix == core.Lower(data.Articles.IndefiniteDefault) ||
prefix == core.Lower(data.Articles.IndefiniteVowel) {
return "indefinite", true
if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.IndefiniteDefault, "indefinite"); ok {
return artType, true
}
if prefix == core.Lower(data.Articles.Definite) {
return "definite", true
if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.IndefiniteVowel, "indefinite"); ok {
return artType, true
}
if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.Definite, "definite"); ok {
return artType, true
}
for _, article := range data.Articles.ByGender {
if prefix == core.Lower(article) {
return "definite", true
if artType, ok := matchConfiguredArticleCandidate(prefix, article, "definite"); ok {
return artType, true
}
}
}
@ -742,6 +746,34 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b
return "", false
}
func matchConfiguredArticleCandidate(lower, article, kind string) (string, bool) {
article = normalizeFrenchApostrophes(core.Lower(article))
if article == "" {
return "", false
}
if lower == article {
return kind, true
}
if !strings.HasPrefix(lower, article) {
return "", false
}
rest := strings.TrimPrefix(lower, article)
if rest == "" {
return "", false
}
if strings.HasSuffix(article, "'") {
return kind, true
}
r, _ := utf8.DecodeRuneInString(rest)
switch r {
case ' ', '\t', '\'', '', 'ʼ':
return kind, true
default:
return "", false
}
}
func matchFrenchLeadingArticlePhrase(lower string) (string, bool) {
lower = normalizeFrenchApostrophes(lower)
switch {
@ -911,6 +943,21 @@ func (t *Tokeniser) Tokenise(text string) []Token {
continue
}
}
if prefix, rest, ok := t.splitConfiguredElision(raw); ok {
if artType, ok := t.MatchArticle(prefix); ok {
tokens = append(tokens, Token{
Raw: prefix,
Lower: normalizeFrenchApostrophes(core.Lower(prefix)),
Type: TokenArticle,
ArtType: artType,
Confidence: 1.0,
})
}
raw = rest
if raw == "" {
continue
}
}
// Strip trailing punctuation to get the clean word.
word, punct := splitTrailingPunct(raw)
@ -1561,6 +1608,43 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
return "", raw, false
}
// splitConfiguredElision splits raw into a leading elided article and the
// word that follows it, using the articles configured in the grammar data for
// the tokeniser's language.
//
// Only configured articles that contain an apostrophe after lower-casing and
// apostrophe normalisation are considered. The returned prefix and rest are
// slices of raw itself, so the original casing and apostrophe character are
// preserved. When several configured articles match, the longest one wins, so
// the result does not depend on map iteration order.
//
// It returns ("", raw, false) when raw is empty, when no grammar data is
// registered for the language, or when no elided article is followed by at
// least one more byte of text.
func (t *Tokeniser) splitConfiguredElision(raw string) (string, string, bool) {
	if raw == "" {
		return "", raw, false
	}
	data := i18n.GetGrammarData(t.lang)
	if data == nil {
		return "", raw, false
	}

	candidates := make([]string, 0, 3+len(data.Articles.ByGender))
	candidates = append(candidates,
		data.Articles.IndefiniteDefault,
		data.Articles.IndefiniteVowel,
		data.Articles.Definite,
	)
	for _, article := range data.Articles.ByGender {
		candidates = append(candidates, article)
	}

	lower := normalizeFrenchApostrophes(core.Lower(raw))
	split := 0 // byte offset in raw where the best matching article ends
	for _, article := range candidates {
		article = normalizeFrenchApostrophes(core.Lower(article))
		if article == "" || !strings.Contains(article, "'") {
			continue
		}
		if !strings.HasPrefix(lower, article) {
			continue
		}
		// The article length is measured in the normalised string, which need
		// not have the same byte length as raw (a typographic apostrophe is
		// multi-byte). Walk raw's rune boundaries and pick the one whose
		// normalised head equals the article, so raw is never cut inside a
		// UTF-8 sequence. Ranging over a string never yields len(raw), which
		// guarantees a non-empty remainder.
		// NOTE(review): assumes normalising a prefix of raw yields a prefix of
		// the normalised raw — confirm against normalizeFrenchApostrophes.
		for end := range raw {
			if end == 0 {
				continue
			}
			head := normalizeFrenchApostrophes(core.Lower(raw[:end]))
			if head == article {
				if end > split {
					split = end
				}
				break
			}
			if len(head) >= len(article) {
				break
			}
		}
	}
	if split == 0 {
		return "", raw, false
	}
	return raw[:split], raw[split:], true
}
func (t *Tokeniser) isFrenchLanguage() bool {
lang := core.Lower(t.lang)
return lang == "fr" || core.HasPrefix(lang, "fr-")

View file

@ -380,6 +380,66 @@ func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) {
}
}
// TestTokeniser_MatchArticle_ConfiguredElisionPrefix registers grammar data
// for a synthetic language whose definite article is the elided form "l'",
// then checks that MatchArticle recognises that article when it is fused to
// the following word with an ASCII, typographic (U+2019) or modifier-letter
// (U+02BC) apostrophe, and that Tokenise splits "l'ami" into an article token
// followed by a noun token.
func TestTokeniser_MatchArticle_ConfiguredElisionPrefix(t *testing.T) {
	setup(t)

	const lang = "xy"
	// Put back whatever was registered for this language once the test ends.
	prev := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, prev)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Articles: i18n.ArticleForms{
			IndefiniteDefault: "a",
			IndefiniteVowel:   "an",
			Definite:          "l'",
			ByGender: map[string]string{
				"m": "le",
				"f": "la",
			},
		},
		Nouns: map[string]i18n.NounForms{
			"ami": {One: "ami", Other: "amis", Gender: "m"},
		},
	})

	tok := NewTokeniserForLang(lang)
	tests := []struct {
		word     string
		wantType string
		wantOK   bool
	}{
		{"l'ami", "definite", true},
		// U+2019 is written as an escape so the apostrophe cannot be lost to
		// re-encoding; without it the input would be the plain word "lami".
		{"l\u2019ami", "definite", true},
		{"lʼami", "definite", true},
	}
	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
			}
			if ok && artType != tt.wantType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
			}
		})
	}

	const input = "l'ami"
	tokens := tok.Tokenise(input)
	// The indexing below needs exactly two tokens, so this check must abort.
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", input, len(tokens))
	}
	if tokens[0].Type != TokenArticle || tokens[0].ArtType != "definite" {
		t.Errorf("Tokenise(%q)[0] = %#v, want definite article", input, tokens[0])
	}
	if tokens[1].Type != TokenNoun || tokens[1].Lower != "ami" {
		t.Errorf("Tokenise(%q)[1] = %#v, want noun ami", input, tokens[1])
	}
}
func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
setup(t)
tok := NewTokeniserForLang("fr")