[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #65

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-02 00:37:06 +00:00
2 changed files with 108 additions and 10 deletions

View file

@@ -617,7 +617,7 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 	}
 	if t.isFrenchLanguage() {
 		switch lower {
-		case "l'", "l", "d'", "d", "j'", "j", "m'", "m", "t'", "t", "s'", "s", "n'", "n", "c'", "c", "qu'", "qu", "les", "au", "aux", "du":
+		case "l'", "l", "d'", "d", "j'", "j", "m'", "m", "t'", "t", "s'", "s", "n'", "n", "c'", "c", "qu'", "qu", "de l'", "de l", "les", "au", "aux", "du":
 			return "definite", true
 		case "un", "une", "des":
 			return "indefinite", true
@@ -662,8 +662,11 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 			i += consumed - 1
 			continue
 		}
-		if consumed, tok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
+		if consumed, tok, extraTok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
 			tokens = append(tokens, tok)
+			if extraTok != nil {
+				tokens = append(tokens, *extraTok)
+			}
 			if punctTok != nil {
 				tokens = append(tokens, *punctTok)
 			}
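
The emission order here is deliberate: the fused article token first, then the token for the elided word when one exists, then any trailing punctuation. A minimal sketch of that nil-guarded threading, using a stub Token type (appendPhrase is an illustrative name):

package main

import "fmt"

// Token is a stub standing in for the package's Token type.
type Token struct{ Raw string }

// appendPhrase mirrors the append sequence in Tokenise above:
// article first, then the elided word, then trailing punctuation.
func appendPhrase(tokens []Token, tok Token, extraTok, punctTok *Token) []Token {
	tokens = append(tokens, tok)
	if extraTok != nil {
		tokens = append(tokens, *extraTok)
	}
	if punctTok != nil {
		tokens = append(tokens, *punctTok)
	}
	return tokens
}

func main() {
	extra := Token{Raw: "enfant"}
	punct := Token{Raw: ","}
	fmt.Println(appendPhrase(nil, Token{Raw: "de l'"}, &extra, &punct))
	// Output: [{de l'} {enfant} {,}]
}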
@ -841,24 +844,47 @@ func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Tok
return 0, Token{}, nil
}
func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token, *Token) {
if !t.isFrenchLanguage() || start+1 >= len(parts) {
return 0, Token{}, nil
return 0, Token{}, nil, nil
}
first, firstPunct := splitTrailingPunct(parts[start])
if first == "" || firstPunct != "" {
return 0, Token{}, nil
return 0, Token{}, nil, nil
}
second, secondPunct := splitTrailingPunct(parts[start+1])
if second == "" {
return 0, Token{}, nil
return 0, Token{}, nil, nil
}
switch core.Lower(first) {
case "de":
if core.Lower(second) != "la" {
return 0, Token{}, nil
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l") && rest != "" {
tok := Token{
Raw: first + " " + prefix,
Lower: core.Lower(first + " " + prefix),
Type: TokenArticle,
ArtType: "definite",
Confidence: 1.0,
}
extra := t.classifyElidedFrenchWord(rest)
var punctTok *Token
if secondPunct != "" {
if punctType, ok := matchPunctuation(secondPunct); ok {
punctTok = &Token{
Raw: secondPunct,
Lower: secondPunct,
Type: TokenPunctuation,
PunctType: punctType,
Confidence: 1.0,
}
}
}
return 2, tok, &extra, punctTok
}
return 0, Token{}, nil, nil
}
tok := Token{
Raw: first + " " + second,
@@ -876,13 +902,66 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
 					PunctType:  punctType,
 					Confidence: 1.0,
 				}
-				return 2, tok, &punctTok
+				return 2, tok, nil, &punctTok
 			}
 		}
-		return 2, tok, nil
+		return 2, tok, nil, nil
 	}
-	return 0, Token{}, nil
+	return 0, Token{}, nil, nil
 }
+
+func (t *Tokeniser) classifyElidedFrenchWord(word string) Token {
+	tok := Token{Raw: word, Lower: core.Lower(word)}
+	if artType, ok := t.MatchArticle(word); ok {
+		tok.Type = TokenArticle
+		tok.ArtType = artType
+		tok.Confidence = 1.0
+		return tok
+	}
+	vm, verbOK := t.MatchVerb(word)
+	nm, nounOK := t.MatchNoun(word)
+	if verbOK && nounOK && t.dualClass[tok.Lower] {
+		if vm.Tense != "base" {
+			tok.Type = TokenVerb
+			tok.VerbInfo = vm
+			tok.NounInfo = nm
+			tok.Confidence = 1.0
+		} else if nm.Plural {
+			tok.Type = TokenNoun
+			tok.VerbInfo = vm
+			tok.NounInfo = nm
+			tok.Confidence = 1.0
+		} else {
+			tok.Type = tokenAmbiguous
+			tok.VerbInfo = vm
+			tok.NounInfo = nm
+		}
+		return tok
+	}
+	if verbOK {
+		tok.Type = TokenVerb
+		tok.VerbInfo = vm
+		tok.Confidence = 1.0
+		return tok
+	}
+	if nounOK {
+		tok.Type = TokenNoun
+		tok.NounInfo = nm
+		tok.Confidence = 1.0
+		return tok
+	}
+	if cat, ok := t.MatchWord(word); ok {
+		tok.Type = TokenWord
+		tok.WordCat = cat
+		tok.Confidence = 1.0
+		return tok
+	}
+	tok.Type = TokenUnknown
+	return tok
+}
 
 // resolveAmbiguous iterates all tokens and resolves any marked as ambiguous
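
The dual-class branch in classifyElidedFrenchWord encodes a priority: a non-base tense decides for verb, a plural reading decides for noun, and the base singular is left for resolveAmbiguous to settle. A standalone sketch of that decision table (decide is an illustrative name; the real function also carries the full Token metadata):

package main

import "fmt"

// decide mirrors the dual-class branch of classifyElidedFrenchWord above.
func decide(tense string, plural bool) string {
	switch {
	case tense != "base":
		return "verb" // a non-base tense is decisive
	case plural:
		return "noun" // a plural reading is decisive
	default:
		return "ambiguous" // deferred to resolveAmbiguous
	}
}

func main() {
	fmt.Println(decide("past", false)) // verb
	fmt.Println(decide("base", true))  // noun
	fmt.Println(decide("base", false)) // ambiguous
}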

View file

@@ -220,6 +220,8 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
 		{"la", "definite", true},
 		{"Le", "definite", true},
 		{"La", "definite", true},
+		{"de l'", "definite", true},
+		{"de l", "definite", true},
 		{"un", "indefinite", true},
 		{"une", "indefinite", true},
 	}
@@ -337,6 +339,23 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
 		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
 	}
+	tokens = tok.Tokenise("de l'enfant")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l'enfant", len(tokens))
+	}
+	if tokens[0].Type != TokenArticle {
+		t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
+	}
+	if tokens[0].Lower != "de l'" {
+		t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
+	}
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
+	}
+	if tokens[1].Lower != "enfant" {
+		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
+	}
 	tokens = tok.Tokenise("d'enfant")
 	if len(tokens) != 2 {
 		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "d'enfant", len(tokens))