[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #65
2 changed files with 108 additions and 10 deletions
@@ -617,7 +617,7 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 	}
 	if t.isFrenchLanguage() {
 		switch lower {
-		case "l'", "l’", "d'", "d’", "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’", "les", "au", "aux", "du":
+		case "l'", "l’", "d'", "d’", "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’", "de l'", "de l’", "les", "au", "aux", "du":
 			return "definite", true
 		case "un", "une", "des":
 			return "indefinite", true
@@ -662,8 +662,11 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 			i += consumed - 1
 			continue
 		}
-		if consumed, tok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
+		if consumed, tok, extraTok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
 			tokens = append(tokens, tok)
+			if extraTok != nil {
+				tokens = append(tokens, *extraTok)
+			}
 			if punctTok != nil {
 				tokens = append(tokens, *punctTok)
 			}
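Note: the call site now unpacks four return values, and emission preserves source order: the article token, then the optional elided-word token, then any trailing punctuation token. A sketch of the resulting slice, assuming "." is in the tokeniser's punctuation set (the punctuated input is hypothetical; the added tests only cover the unpunctuated case):

	tokens := tok.Tokenise("de l'enfant.")
	// -> TokenArticle "de l'", TokenNoun "enfant", TokenPunctuation "."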
@@ -841,24 +844,47 @@ func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Token) {
 	return 0, Token{}, nil
 }
 
-func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
+func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token, *Token) {
 	if !t.isFrenchLanguage() || start+1 >= len(parts) {
-		return 0, Token{}, nil
+		return 0, Token{}, nil, nil
 	}
 
 	first, firstPunct := splitTrailingPunct(parts[start])
 	if first == "" || firstPunct != "" {
-		return 0, Token{}, nil
+		return 0, Token{}, nil, nil
 	}
 	second, secondPunct := splitTrailingPunct(parts[start+1])
 	if second == "" {
-		return 0, Token{}, nil
+		return 0, Token{}, nil, nil
 	}
 
 	switch core.Lower(first) {
 	case "de":
 		if core.Lower(second) != "la" {
-			return 0, Token{}, nil
+			if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l’") && rest != "" {
+				tok := Token{
+					Raw:        first + " " + prefix,
+					Lower:      core.Lower(first + " " + prefix),
+					Type:       TokenArticle,
+					ArtType:    "definite",
+					Confidence: 1.0,
+				}
+				extra := t.classifyElidedFrenchWord(rest)
+				var punctTok *Token
+				if secondPunct != "" {
+					if punctType, ok := matchPunctuation(secondPunct); ok {
+						punctTok = &Token{
+							Raw:        secondPunct,
+							Lower:      secondPunct,
+							Type:       TokenPunctuation,
+							PunctType:  punctType,
+							Confidence: 1.0,
+						}
+					}
+				}
+				return 2, tok, &extra, punctTok
+			}
+			return 0, Token{}, nil, nil
 		}
 		tok := Token{
 			Raw: first + " " + second,
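Note: instead of rejecting "de" followed by anything other than "la", the new branch splits de l'&lt;word&gt; into a compound definite-article token ("de l'") plus a separate token for the elided remainder, classified by the new classifyElidedFrenchWord helper below. The expected token stream, mirroring the test added at the end of this change (setup hypothetical as in the earlier sketch):

	tokens := tok.Tokenise("de l'enfant")
	// tokens[0]: Type == TokenArticle, Lower == "de l'"
	// tokens[1]: Type == TokenNoun,    Lower == "enfant"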
@@ -876,13 +902,66 @@ func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token) {
 					PunctType:  punctType,
 					Confidence: 1.0,
 				}
-				return 2, tok, &punctTok
+				return 2, tok, nil, &punctTok
 			}
 		}
-		return 2, tok, nil
+		return 2, tok, nil, nil
 	}
 
-	return 0, Token{}, nil
+	return 0, Token{}, nil, nil
 }
+
+func (t *Tokeniser) classifyElidedFrenchWord(word string) Token {
+	tok := Token{Raw: word, Lower: core.Lower(word)}
+
+	if artType, ok := t.MatchArticle(word); ok {
+		tok.Type = TokenArticle
+		tok.ArtType = artType
+		tok.Confidence = 1.0
+		return tok
+	}
+
+	vm, verbOK := t.MatchVerb(word)
+	nm, nounOK := t.MatchNoun(word)
+	if verbOK && nounOK && t.dualClass[tok.Lower] {
+		if vm.Tense != "base" {
+			tok.Type = TokenVerb
+			tok.VerbInfo = vm
+			tok.NounInfo = nm
+			tok.Confidence = 1.0
+		} else if nm.Plural {
+			tok.Type = TokenNoun
+			tok.VerbInfo = vm
+			tok.NounInfo = nm
+			tok.Confidence = 1.0
+		} else {
+			tok.Type = tokenAmbiguous
+			tok.VerbInfo = vm
+			tok.NounInfo = nm
+		}
+		return tok
+	}
+	if verbOK {
+		tok.Type = TokenVerb
+		tok.VerbInfo = vm
+		tok.Confidence = 1.0
+		return tok
+	}
+	if nounOK {
+		tok.Type = TokenNoun
+		tok.NounInfo = nm
+		tok.Confidence = 1.0
+		return tok
+	}
+	if cat, ok := t.MatchWord(word); ok {
+		tok.Type = TokenWord
+		tok.WordCat = cat
+		tok.Confidence = 1.0
+		return tok
+	}
+
+	tok.Type = TokenUnknown
+	return tok
+}
 
 // resolveAmbiguous iterates all tokens and resolves any marked as
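Note: classifyElidedFrenchWord mirrors the tokeniser's usual classification ladder for the word that follows the elision prefix: article first, then the verb/noun dual-class disambiguation, then generic word categories, falling through to TokenUnknown. In the genuinely ambiguous dual-class case the token is tagged tokenAmbiguous with Confidence left at its zero value, presumably so the resolveAmbiguous pass named in the trailing context line can settle it. A sketch of the three dual-class outcomes, using a hypothetical dictionary word that is both noun and verb:

	// "porte" as a dual-class entry is a hypothetical example.
	tok := t.classifyElidedFrenchWord("porte")
	// vm.Tense != "base" -> TokenVerb, Confidence 1.0
	// else if nm.Plural  -> TokenNoun, Confidence 1.0
	// otherwise          -> tokenAmbiguous, resolved in a later pass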
@@ -220,6 +220,8 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
 		{"la", "definite", true},
 		{"Le", "definite", true},
 		{"La", "definite", true},
+		{"de l'", "definite", true},
+		{"de l’", "definite", true},
 		{"un", "indefinite", true},
 		{"une", "indefinite", true},
 	}
@@ -337,6 +339,23 @@ func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
 		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
 	}
 
+	tokens = tok.Tokenise("de l'enfant")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "de l'enfant", len(tokens))
+	}
+	if tokens[0].Type != TokenArticle {
+		t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
+	}
+	if tokens[0].Lower != "de l'" {
+		t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, "de l'")
+	}
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
+	}
+	if tokens[1].Lower != "enfant" {
+		t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
+	}
+
 	tokens = tok.Tokenise("d'enfant")
 	if len(tokens) != 2 {
 		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "d'enfant", len(tokens))