[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #16
2 changed files with 93 additions and 0 deletions
|
|
@ -598,6 +598,14 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
|
|||
return "definite", true
|
||||
}
|
||||
}
|
||||
if t.isFrenchLanguage() {
|
||||
switch lower {
|
||||
case "l'", "les":
|
||||
return "definite", true
|
||||
case "des":
|
||||
return "indefinite", true
|
||||
}
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
|
@ -629,6 +637,22 @@ func (t *Tokeniser) Tokenise(text string) []Token {
|
|||
|
||||
// --- Pass 1: Classify & Mark ---
|
||||
for _, raw := range parts {
|
||||
if prefix, rest, ok := t.splitFrenchElision(raw); ok {
|
||||
if artType, ok := t.MatchArticle(prefix); ok {
|
||||
tokens = append(tokens, Token{
|
||||
Raw: prefix,
|
||||
Lower: core.Lower(prefix),
|
||||
Type: TokenArticle,
|
||||
ArtType: artType,
|
||||
Confidence: 1.0,
|
||||
})
|
||||
}
|
||||
raw = rest
|
||||
if raw == "" {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Strip trailing punctuation to get the clean word.
|
||||
word, punct := splitTrailingPunct(raw)
|
||||
|
||||
|
|
@ -962,6 +986,24 @@ func splitTrailingPunct(s string) (string, string) {
|
|||
return s, ""
|
||||
}
|
||||
|
||||
func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
|
||||
if !t.isFrenchLanguage() || len(raw) <= 2 {
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
lower := core.Lower(raw)
|
||||
if len(lower) > 2 && lower[0] == 'l' && lower[1] == '\'' {
|
||||
return raw[:2], raw[2:], true
|
||||
}
|
||||
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
func (t *Tokeniser) isFrenchLanguage() bool {
|
||||
lang := core.Lower(t.lang)
|
||||
return lang == "fr" || core.HasPrefix(lang, "fr-")
|
||||
}
|
||||
|
||||
// matchPunctuation detects known punctuation patterns.
|
||||
// Returns the punctuation type and true if recognised.
|
||||
func matchPunctuation(punct string) (string, bool) {
|
||||
|
|
|
|||
|
|
@ -241,6 +241,57 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniserForLang("fr")
|
||||
|
||||
tests := []struct {
|
||||
word string
|
||||
wantType string
|
||||
wantOK bool
|
||||
}{
|
||||
{"l'", "definite", true},
|
||||
{"L'", "definite", true},
|
||||
{"les", "definite", true},
|
||||
{"des", "indefinite", true},
|
||||
{"l'enfant", "", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.word, func(t *testing.T) {
|
||||
artType, ok := tok.MatchArticle(tt.word)
|
||||
if ok != tt.wantOK {
|
||||
t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
|
||||
}
|
||||
if ok && artType != tt.wantType {
|
||||
t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniserForLang("fr")
|
||||
|
||||
tokens := tok.Tokenise("l'enfant")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l'enfant", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle {
|
||||
t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
|
||||
}
|
||||
if tokens[0].ArtType != "definite" {
|
||||
t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "definite")
|
||||
}
|
||||
if tokens[1].Type != TokenNoun {
|
||||
t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
|
||||
}
|
||||
if tokens[1].Lower != "enfant" {
|
||||
t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, "enfant")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue