[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #61
2 changed files with 77 additions and 0 deletions
|
|
@ -662,6 +662,14 @@ func (t *Tokeniser) Tokenise(text string) []Token {
|
|||
i += consumed - 1
|
||||
continue
|
||||
}
|
||||
if consumed, tok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
|
||||
tokens = append(tokens, tok)
|
||||
if punctTok != nil {
|
||||
tokens = append(tokens, *punctTok)
|
||||
}
|
||||
i += consumed - 1
|
||||
continue
|
||||
}
|
||||
|
||||
raw := parts[i]
|
||||
if prefix, rest, ok := t.splitFrenchElision(raw); ok {
|
||||
|
|
@ -833,6 +841,50 @@ func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Tok
|
|||
return 0, Token{}, nil
|
||||
}
|
||||
|
||||
func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
|
||||
if !t.isFrenchLanguage() || start+1 >= len(parts) {
|
||||
return 0, Token{}, nil
|
||||
}
|
||||
|
||||
first, firstPunct := splitTrailingPunct(parts[start])
|
||||
if first == "" || firstPunct != "" {
|
||||
return 0, Token{}, nil
|
||||
}
|
||||
second, secondPunct := splitTrailingPunct(parts[start+1])
|
||||
if second == "" {
|
||||
return 0, Token{}, nil
|
||||
}
|
||||
|
||||
switch core.Lower(first) {
|
||||
case "de":
|
||||
if core.Lower(second) != "la" {
|
||||
return 0, Token{}, nil
|
||||
}
|
||||
tok := Token{
|
||||
Raw: first + " " + second,
|
||||
Lower: "de la",
|
||||
Type: TokenArticle,
|
||||
ArtType: "definite",
|
||||
Confidence: 1.0,
|
||||
}
|
||||
if secondPunct != "" {
|
||||
if punctType, ok := matchPunctuation(secondPunct); ok {
|
||||
punctTok := Token{
|
||||
Raw: secondPunct,
|
||||
Lower: secondPunct,
|
||||
Type: TokenPunctuation,
|
||||
PunctType: punctType,
|
||||
Confidence: 1.0,
|
||||
}
|
||||
return 2, tok, &punctTok
|
||||
}
|
||||
}
|
||||
return 2, tok, nil
|
||||
}
|
||||
|
||||
return 0, Token{}, nil
|
||||
}
|
||||
|
||||
// resolveAmbiguous iterates all tokens and resolves any marked as
|
||||
// tokenAmbiguous using the weighted scoring function.
|
||||
func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
|
||||
|
|
|
|||
|
|
@ -374,6 +374,31 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniserForLang("fr")
|
||||
|
||||
tokens := tok.Tokenise("de la branche")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de la branche", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].Lower != "de la" {
|
||||
t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de la")
|
||||
}
|
||||
if tokens[0].ArtType != "definite" {
|
||||
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "branche" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "branche")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue