[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #24
3 changed files with 142 additions and 2 deletions
@@ -34,6 +34,21 @@ func TestNewImprint(t *testing.T) {
 	}
 }
 
+func TestNewImprint_WordPhrase(t *testing.T) {
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+
+	tok := NewTokeniser()
+	imp := NewImprint(tok.Tokenise("up to date"))
+
+	if imp.DomainVocabulary["up_to_date"] != 1 {
+		t.Fatalf("DomainVocabulary[\"up_to_date\"] = %d, want 1", imp.DomainVocabulary["up_to_date"])
+	}
+}
+
 func TestNewImprint_Empty(t *testing.T) {
 	imp := NewImprint(nil)
 	if imp.TokenCount != 0 {
@@ -89,6 +89,7 @@ type Tokeniser struct {
 	pluralToBase map[string]string // "files" → "file"
 	baseNouns    map[string]bool   // "file" → true
 	words        map[string]string // word translations
+	phraseLen    int               // longest multi-word gram.word entry
 	lang         string
 
 	dualClass map[string]bool // words in both verb AND noun tables
@@ -491,7 +492,11 @@ func (t *Tokeniser) buildWordIndex() {
 		// Map the key itself (already lowercase)
 		t.words[core.Lower(key)] = key
 		// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
-		t.words[core.Lower(display)] = key
+		lowerDisplay := core.Lower(display)
+		t.words[lowerDisplay] = key
+		if words := strings.Fields(lowerDisplay); len(words) > 1 && len(words) > t.phraseLen {
+			t.phraseLen = len(words)
+		}
 	}
 }
 
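Reviewer note: the hunk above only records the word count of the longest multi-word display form, so later lookups know how many words to try at most. A minimal standalone sketch of that bookkeeping, with a hypothetical word list and strings.ToLower standing in for core.Lower (not the repository's real data or helpers):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical display forms; the real entries come from the gram.word table.
	displays := []string{"URL", "up to date", "out of the box"}

	phraseLen := 0
	for _, d := range displays {
		// Only multi-word entries contribute; keep the largest word count seen.
		if words := strings.Fields(strings.ToLower(d)); len(words) > 1 && len(words) > phraseLen {
			phraseLen = len(words)
		}
	}
	fmt.Println(phraseLen) // 4: "out of the box" is the longest multi-word entry
}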
@@ -636,7 +641,17 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 	var tokens []Token
 
 	// --- Pass 1: Classify & Mark ---
-	for _, raw := range parts {
+	for i := 0; i < len(parts); i++ {
+		if consumed, tok, punctTok := t.matchWordPhrase(parts, i); consumed > 0 {
+			tokens = append(tokens, tok)
+			if punctTok != nil {
+				tokens = append(tokens, *punctTok)
+			}
+			i += consumed - 1
+			continue
+		}
+
+		raw := parts[i]
 		if prefix, rest, ok := t.splitFrenchElision(raw); ok {
 			if artType, ok := t.MatchArticle(prefix); ok {
 				tokens = append(tokens, Token{
@@ -729,6 +744,83 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 	return tokens
 }
 
+func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Token) {
+	if t.phraseLen < 2 || start >= len(parts) {
+		return 0, Token{}, nil
+	}
+
+	maxLen := t.phraseLen
+	if remaining := len(parts) - start; remaining < maxLen {
+		maxLen = remaining
+	}
+
+	for n := maxLen; n >= 2; n-- {
+		phraseWords := make([]string, 0, n)
+		rawParts := make([]string, 0, n)
+		var punct string
+		valid := true
+
+		for j := 0; j < n; j++ {
+			part := parts[start+j]
+			if prefix, _, ok := t.splitFrenchElision(part); ok && prefix != part {
+				valid = false
+				break
+			}
+
+			word, partPunct := splitTrailingPunct(part)
+			if word == "" {
+				valid = false
+				break
+			}
+			if partPunct != "" && j != n-1 {
+				valid = false
+				break
+			}
+
+			rawParts = append(rawParts, word)
+			phraseWords = append(phraseWords, core.Lower(word))
+			if j == n-1 {
+				punct = partPunct
+			}
+		}
+
+		if !valid {
+			continue
+		}
+
+		phrase := strings.Join(phraseWords, " ")
+		cat, ok := t.words[phrase]
+		if !ok {
+			continue
+		}
+
+		tok := Token{
+			Raw:        strings.Join(rawParts, " "),
+			Lower:      phrase,
+			Type:       TokenWord,
+			WordCat:    cat,
+			Confidence: 1.0,
+		}
+
+		if punct != "" {
+			if punctType, ok := matchPunctuation(punct); ok {
+				punctTok := Token{
+					Raw:        punct,
+					Lower:      punct,
+					Type:       TokenPunctuation,
+					PunctType:  punctType,
+					Confidence: 1.0,
+				}
+				return n, tok, &punctTok
+			}
+		}
+
+		return n, tok, nil
+	}
+
+	return 0, Token{}, nil
+}
+
 // resolveAmbiguous iterates all tokens and resolves any marked as
 // tokenAmbiguous using the weighted scoring function.
 func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
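Reviewer note: a simplified, self-contained sketch of the longest-match-first scan that matchWordPhrase performs, using plain strings and a map instead of the tokeniser's Token type, elision and punctuation handling. All names below are illustrative only, not part of the repository:

package main

import (
	"fmt"
	"strings"
)

// matchPhrase tries the longest candidate first, mirroring the n := maxLen; n >= 2; n-- loop above.
func matchPhrase(words map[string]string, phraseLen int, parts []string, start int) (consumed int, key string) {
	maxLen := phraseLen
	if remaining := len(parts) - start; remaining < maxLen {
		maxLen = remaining
	}
	for n := maxLen; n >= 2; n-- {
		phrase := strings.ToLower(strings.Join(parts[start:start+n], " "))
		if key, ok := words[phrase]; ok {
			return n, key
		}
	}
	return 0, ""
}

func main() {
	// Hypothetical index; the real one is built by buildWordIndex.
	words := map[string]string{"up to date": "up_to_date"}
	parts := strings.Fields("everything is up to date")

	for i := 0; i < len(parts); i++ {
		if consumed, key := matchPhrase(words, 3, parts, i); consumed > 0 {
			fmt.Println("phrase:", key) // phrase: up_to_date
			i += consumed - 1           // skip the words the phrase consumed
			continue
		}
		fmt.Println("word:", parts[i])
	}
}

The single-word fallback here is just a Println; in the real Tokenise loop it is the existing per-word classification pass.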
@@ -159,6 +159,7 @@ func TestTokeniser_MatchWord(t *testing.T) {
 		{"url", "url", true},
 		{"ID", "id", true},
 		{"SSH", "ssh", true},
+		{"up to date", "up_to_date", true},
 		{"PHP", "php", true},
 		{"xyzzy", "", false},
 	}
@@ -249,6 +250,38 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
 	}
 }
 
+func TestTokeniser_Tokenise_WordPhrase(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("up to date")
+	if len(tokens) != 1 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 1", "up to date", len(tokens))
+	}
+	if tokens[0].Type != TokenWord {
+		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date", tokens[0].Type)
+	}
+	if tokens[0].WordCat != "up_to_date" {
+		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", "up to date", tokens[0].WordCat, "up_to_date")
+	}
+}
+
+func TestTokeniser_Tokenise_WordPhraseWithPunctuation(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("up to date.")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "up to date.", len(tokens))
+	}
+	if tokens[0].Type != TokenWord {
+		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date.", tokens[0].Type)
+	}
+	if tokens[1].Type != TokenPunctuation {
+		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenPunctuation", "up to date.", tokens[1].Type)
+	}
+}
+
 func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
 	setup(t)
 	tok := NewTokeniserForLang("fr")