[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #125
2 changed files with 158 additions and 14 deletions
|
|
@ -707,16 +707,18 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b
|
|||
}
|
||||
lower = normalizeFrenchApostrophes(lower)
|
||||
|
||||
if lower == core.Lower(data.Articles.IndefiniteDefault) ||
|
||||
lower == core.Lower(data.Articles.IndefiniteVowel) {
|
||||
return "indefinite", true
|
||||
if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.IndefiniteDefault, "indefinite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
if lower == core.Lower(data.Articles.Definite) {
|
||||
return "definite", true
|
||||
if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.IndefiniteVowel, "indefinite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
if artType, ok := matchConfiguredArticleCandidate(lower, data.Articles.Definite, "definite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
for _, article := range data.Articles.ByGender {
|
||||
if lower == core.Lower(article) {
|
||||
return "definite", true
|
||||
if artType, ok := matchConfiguredArticleCandidate(lower, article, "definite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -725,16 +727,18 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b
|
|||
if prefix == "" {
|
||||
return "", false
|
||||
}
|
||||
if prefix == core.Lower(data.Articles.IndefiniteDefault) ||
|
||||
prefix == core.Lower(data.Articles.IndefiniteVowel) {
|
||||
return "indefinite", true
|
||||
if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.IndefiniteDefault, "indefinite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
if prefix == core.Lower(data.Articles.Definite) {
|
||||
return "definite", true
|
||||
if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.IndefiniteVowel, "indefinite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
if artType, ok := matchConfiguredArticleCandidate(prefix, data.Articles.Definite, "definite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
for _, article := range data.Articles.ByGender {
|
||||
if prefix == core.Lower(article) {
|
||||
return "definite", true
|
||||
if artType, ok := matchConfiguredArticleCandidate(prefix, article, "definite"); ok {
|
||||
return artType, true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -742,6 +746,34 @@ func matchConfiguredArticleText(lower string, data *i18n.GrammarData) (string, b
|
|||
return "", false
|
||||
}
|
||||
|
||||
func matchConfiguredArticleCandidate(lower, article, kind string) (string, bool) {
|
||||
article = normalizeFrenchApostrophes(core.Lower(article))
|
||||
if article == "" {
|
||||
return "", false
|
||||
}
|
||||
if lower == article {
|
||||
return kind, true
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(lower, article) {
|
||||
return "", false
|
||||
}
|
||||
rest := strings.TrimPrefix(lower, article)
|
||||
if rest == "" {
|
||||
return "", false
|
||||
}
|
||||
if strings.HasSuffix(article, "'") {
|
||||
return kind, true
|
||||
}
|
||||
r, _ := utf8.DecodeRuneInString(rest)
|
||||
switch r {
|
||||
case ' ', '\t', '\'', '’', 'ʼ':
|
||||
return kind, true
|
||||
default:
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
|
||||
func matchFrenchLeadingArticlePhrase(lower string) (string, bool) {
|
||||
lower = normalizeFrenchApostrophes(lower)
|
||||
switch {
|
||||
|
|
@ -911,6 +943,21 @@ func (t *Tokeniser) Tokenise(text string) []Token {
|
|||
continue
|
||||
}
|
||||
}
|
||||
if prefix, rest, ok := t.splitConfiguredElision(raw); ok {
|
||||
if artType, ok := t.MatchArticle(prefix); ok {
|
||||
tokens = append(tokens, Token{
|
||||
Raw: prefix,
|
||||
Lower: normalizeFrenchApostrophes(core.Lower(prefix)),
|
||||
Type: TokenArticle,
|
||||
ArtType: artType,
|
||||
Confidence: 1.0,
|
||||
})
|
||||
}
|
||||
raw = rest
|
||||
if raw == "" {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Strip trailing punctuation to get the clean word.
|
||||
word, punct := splitTrailingPunct(raw)
|
||||
|
|
@ -1561,6 +1608,43 @@ func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
|
|||
return "", raw, false
|
||||
}
|
||||
|
||||
func (t *Tokeniser) splitConfiguredElision(raw string) (string, string, bool) {
|
||||
if len(raw) == 0 {
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
data := i18n.GetGrammarData(t.lang)
|
||||
if data == nil {
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
candidates := []string{data.Articles.IndefiniteDefault, data.Articles.IndefiniteVowel, data.Articles.Definite}
|
||||
for _, article := range data.Articles.ByGender {
|
||||
candidates = append(candidates, article)
|
||||
}
|
||||
|
||||
lower := normalizeFrenchApostrophes(core.Lower(raw))
|
||||
for _, article := range candidates {
|
||||
article = normalizeFrenchApostrophes(core.Lower(article))
|
||||
if article == "" || !strings.Contains(article, "'") {
|
||||
continue
|
||||
}
|
||||
if !strings.HasPrefix(lower, article) {
|
||||
continue
|
||||
}
|
||||
if len(raw) <= len(article) {
|
||||
continue
|
||||
}
|
||||
rest := raw[len(article):]
|
||||
if rest == "" {
|
||||
continue
|
||||
}
|
||||
return raw[:len(article)], rest, true
|
||||
}
|
||||
|
||||
return "", raw, false
|
||||
}
|
||||
|
||||
func (t *Tokeniser) isFrenchLanguage() bool {
|
||||
lang := core.Lower(t.lang)
|
||||
return lang == "fr" || core.HasPrefix(lang, "fr-")
|
||||
|
|
|
|||
|
|
@ -380,6 +380,66 @@ func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_MatchArticle_ConfiguredElisionPrefix(t *testing.T) {
|
||||
setup(t)
|
||||
|
||||
const lang = "xy"
|
||||
prev := i18n.GetGrammarData(lang)
|
||||
t.Cleanup(func() {
|
||||
i18n.SetGrammarData(lang, prev)
|
||||
})
|
||||
|
||||
i18n.SetGrammarData(lang, &i18n.GrammarData{
|
||||
Articles: i18n.ArticleForms{
|
||||
IndefiniteDefault: "a",
|
||||
IndefiniteVowel: "an",
|
||||
Definite: "l'",
|
||||
ByGender: map[string]string{
|
||||
"m": "le",
|
||||
"f": "la",
|
||||
},
|
||||
},
|
||||
Nouns: map[string]i18n.NounForms{
|
||||
"ami": {One: "ami", Other: "amis", Gender: "m"},
|
||||
},
|
||||
})
|
||||
|
||||
tok := NewTokeniserForLang(lang)
|
||||
|
||||
tests := []struct {
|
||||
word string
|
||||
wantType string
|
||||
wantOK bool
|
||||
}{
|
||||
{"l'ami", "definite", true},
|
||||
{"l’ami", "definite", true},
|
||||
{"lʼami", "definite", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.word, func(t *testing.T) {
|
||||
artType, ok := tok.MatchArticle(tt.word)
|
||||
if ok != tt.wantOK {
|
||||
t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
|
||||
}
|
||||
if ok && artType != tt.wantType {
|
||||
t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
tokens := tok.Tokenise("l'ami")
|
||||
if len(tokens) != 2 {
|
||||
t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l'ami", len(tokens))
|
||||
}
|
||||
if tokens[0].Type != TokenArticle || tokens[0].ArtType != "definite" {
|
||||
t.Fatalf("Tokenise(%q)[0] = %#v, want definite article", "l'ami", tokens[0])
|
||||
}
|
||||
if tokens[1].Type != TokenNoun || tokens[1].Lower != "ami" {
|
||||
t.Fatalf("Tokenise(%q)[1] = %#v, want noun ami", "l'ami", tokens[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniserForLang("fr")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue