[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #61

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-02 00:20:21 +00:00
2 changed files with 77 additions and 0 deletions

View file

@ -662,6 +662,14 @@ func (t *Tokeniser) Tokenise(text string) []Token {
i += consumed - 1
continue
}
if consumed, tok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
tokens = append(tokens, tok)
if punctTok != nil {
tokens = append(tokens, *punctTok)
}
i += consumed - 1
continue
}
raw := parts[i]
if prefix, rest, ok := t.splitFrenchElision(raw); ok {
@ -833,6 +841,50 @@ func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Tok
return 0, Token{}, nil
}
// matchFrenchArticlePhrase tries to recognise the two-word French article
// phrase "de la" beginning at parts[start]. On success it returns the number
// of words consumed (2), the combined article token, and — when the second
// word carried recognised trailing punctuation — a punctuation token to emit
// after the article. A consumed count of 0 means no match.
//
// NOTE(review): the phrase is tagged ArtType "definite" while the sibling
// test calls it a partitive article — confirm which label the RFC intends.
// NOTE(review): unrecognised trailing punctuation on the second word is
// silently dropped — verify that is intentional.
func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
	// Require French mode and at least two remaining words.
	if !t.isFrenchLanguage() || start+1 >= len(parts) {
		return 0, Token{}, nil
	}
	lead, leadPunct := splitTrailingPunct(parts[start])
	// The first word must be clean: non-empty with no trailing punctuation.
	if lead == "" || leadPunct != "" {
		return 0, Token{}, nil
	}
	next, nextPunct := splitTrailingPunct(parts[start+1])
	if next == "" {
		return 0, Token{}, nil
	}
	// Only the phrase "de la" is recognised at present.
	if core.Lower(lead) != "de" || core.Lower(next) != "la" {
		return 0, Token{}, nil
	}
	article := Token{
		Raw:        lead + " " + next,
		Lower:      "de la",
		Type:       TokenArticle,
		ArtType:    "definite",
		Confidence: 1.0,
	}
	if nextPunct == "" {
		return 2, article, nil
	}
	punctType, ok := matchPunctuation(nextPunct)
	if !ok {
		// Unknown trailing punctuation: consume the phrase without it.
		return 2, article, nil
	}
	trailing := Token{
		Raw:        nextPunct,
		Lower:      nextPunct,
		Type:       TokenPunctuation,
		PunctType:  punctType,
		Confidence: 1.0,
	}
	return 2, article, &trailing
}
// resolveAmbiguous iterates all tokens and resolves any marked as
// tokenAmbiguous using the weighted scoring function.
func (t *Tokeniser) resolveAmbiguous(tokens []Token) {

View file

@ -374,6 +374,31 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
}
}
// TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase verifies that a French
// tokeniser folds "de la" into a single article token followed by the noun.
func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
	setup(t)
	const input = "de la branche"
	tokens := NewTokeniserForLang("fr").Tokenise(input)
	if got := len(tokens); got != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", input, got)
	}
	article := tokens[0]
	if article.Type != TokenArticle {
		t.Fatalf("tokens[0].Type = %v, want TokenArticle", article.Type)
	}
	if article.Lower != "de la" {
		t.Fatalf("tokens[0].Lower = %q, want %q", article.Lower, "de la")
	}
	if article.ArtType != "definite" {
		t.Fatalf("tokens[0].ArtType = %q, want %q", article.ArtType, "definite")
	}
	noun := tokens[1]
	if noun.Type != TokenNoun {
		t.Fatalf("tokens[1].Type = %v, want TokenNoun", noun.Type)
	}
	if noun.Lower != "branche" {
		t.Fatalf("tokens[1].Lower = %q, want %q", noun.Lower, "branche")
	}
}
func TestTokeniser_Tokenise(t *testing.T) {
setup(t)
tok := NewTokeniser()