[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #24

Merged
Virgil merged 1 commit from agent/read---spec-code-core-go-i18n-rfc-md-ful into dev 2026-04-01 07:01:32 +00:00
3 changed files with 142 additions and 2 deletions

View file

@ -34,6 +34,21 @@ func TestNewImprint(t *testing.T) {
}
}
// TestNewImprint_WordPhrase verifies that a multi-word gram entry
// ("up to date") tokenised into an imprint is counted once in the
// imprint's DomainVocabulary under its canonical key "up_to_date".
func TestNewImprint_WordPhrase(t *testing.T) {
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)
	imprint := NewImprint(NewTokeniser().Tokenise("up to date"))
	if count := imprint.DomainVocabulary["up_to_date"]; count != 1 {
		t.Fatalf("DomainVocabulary[\"up_to_date\"] = %d, want 1", count)
	}
}
func TestNewImprint_Empty(t *testing.T) {
imp := NewImprint(nil)
if imp.TokenCount != 0 {

View file

@ -89,6 +89,7 @@ type Tokeniser struct {
pluralToBase map[string]string // "files" → "file"
baseNouns map[string]bool // "file" → true
words map[string]string // word translations
phraseLen int // longest multi-word gram.word entry
lang string
dualClass map[string]bool // words in both verb AND noun tables
@ -491,7 +492,11 @@ func (t *Tokeniser) buildWordIndex() {
// Map the key itself (already lowercase)
t.words[core.Lower(key)] = key
// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
t.words[core.Lower(display)] = key
lowerDisplay := core.Lower(display)
t.words[lowerDisplay] = key
if words := strings.Fields(lowerDisplay); len(words) > 1 && len(words) > t.phraseLen {
t.phraseLen = len(words)
}
}
}
@ -636,7 +641,17 @@ func (t *Tokeniser) Tokenise(text string) []Token {
var tokens []Token
// --- Pass 1: Classify & Mark ---
for _, raw := range parts {
for i := 0; i < len(parts); i++ {
if consumed, tok, punctTok := t.matchWordPhrase(parts, i); consumed > 0 {
tokens = append(tokens, tok)
if punctTok != nil {
tokens = append(tokens, *punctTok)
}
i += consumed - 1
continue
}
raw := parts[i]
if prefix, rest, ok := t.splitFrenchElision(raw); ok {
if artType, ok := t.MatchArticle(prefix); ok {
tokens = append(tokens, Token{
@ -729,6 +744,83 @@ func (t *Tokeniser) Tokenise(text string) []Token {
return tokens
}
// matchWordPhrase tries to match a multi-word dictionary entry (e.g.
// "up to date") beginning at parts[start], longest candidate first,
// shrinking down to two words. On success it returns the number of parts
// consumed, the resulting word token, and — when the final part carried
// recognised trailing punctuation — a separate punctuation token.
// On failure it returns (0, Token{}, nil).
func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Token) {
	// Fast exit: no multi-word entries in the dictionary, or start out of range.
	if t.phraseLen < 2 || start >= len(parts) {
		return 0, Token{}, nil
	}
	// Longest phrase we could possibly match from this position.
	limit := t.phraseLen
	if avail := len(parts) - start; avail < limit {
		limit = avail
	}
candidates:
	for size := limit; size >= 2; size-- {
		display := make([]string, 0, size)
		lowered := make([]string, 0, size)
		var trailing string
		for idx := 0; idx < size; idx++ {
			piece := parts[start+idx]
			// An elided form (e.g. French "l'…") must never be swallowed
			// into a phrase — let the single-word path handle it.
			if head, _, ok := t.splitFrenchElision(piece); ok && head != piece {
				continue candidates
			}
			bare, punct := splitTrailingPunct(piece)
			if bare == "" {
				continue candidates
			}
			// Interior punctuation breaks the phrase; only the last
			// word may carry trailing punctuation.
			if punct != "" && idx != size-1 {
				continue candidates
			}
			display = append(display, bare)
			lowered = append(lowered, core.Lower(bare))
			if idx == size-1 {
				trailing = punct
			}
		}
		key := strings.Join(lowered, " ")
		cat, found := t.words[key]
		if !found {
			continue
		}
		tok := Token{
			Raw:        strings.Join(display, " "),
			Lower:      key,
			Type:       TokenWord,
			WordCat:    cat,
			Confidence: 1.0,
		}
		if trailing == "" {
			return size, tok, nil
		}
		punctType, recognised := matchPunctuation(trailing)
		if !recognised {
			// Unrecognised trailing punctuation: still consume the
			// phrase, but emit no punctuation token (matches the
			// original behaviour).
			return size, tok, nil
		}
		return size, tok, &Token{
			Raw:        trailing,
			Lower:      trailing,
			Type:       TokenPunctuation,
			PunctType:  punctType,
			Confidence: 1.0,
		}
	}
	return 0, Token{}, nil
}
// resolveAmbiguous iterates all tokens and resolves any marked as
// tokenAmbiguous using the weighted scoring function.
func (t *Tokeniser) resolveAmbiguous(tokens []Token) {

View file

@ -159,6 +159,7 @@ func TestTokeniser_MatchWord(t *testing.T) {
{"url", "url", true},
{"ID", "id", true},
{"SSH", "ssh", true},
{"up to date", "up_to_date", true},
{"PHP", "php", true},
{"xyzzy", "", false},
}
@ -249,6 +250,38 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
}
}
// TestTokeniser_Tokenise_WordPhrase checks that a known multi-word entry
// tokenises to exactly one TokenWord carrying its canonical category.
func TestTokeniser_Tokenise_WordPhrase(t *testing.T) {
	setup(t)
	const input = "up to date"
	got := NewTokeniser().Tokenise(input)
	if len(got) != 1 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 1", input, len(got))
	}
	first := got[0]
	if first.Type != TokenWord {
		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", input, first.Type)
	}
	if first.WordCat != "up_to_date" {
		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", input, first.WordCat, "up_to_date")
	}
}
// TestTokeniser_Tokenise_WordPhraseWithPunctuation checks that a
// multi-word entry followed by trailing punctuation splits into exactly
// two tokens: the phrase word token and a punctuation token.
//
// Beyond the token types, it also pins the word token's category and the
// punctuation token's raw text, so a regression that matches the phrase
// but mislabels it (or emits the wrong punctuation) cannot slip through.
func TestTokeniser_Tokenise_WordPhraseWithPunctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("up to date.")
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "up to date.", len(tokens))
	}
	if tokens[0].Type != TokenWord {
		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date.", tokens[0].Type)
	}
	// Verify the phrase resolved to its canonical category, not merely
	// to some word token.
	if tokens[0].WordCat != "up_to_date" {
		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", "up to date.", tokens[0].WordCat, "up_to_date")
	}
	if tokens[1].Type != TokenPunctuation {
		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenPunctuation", "up to date.", tokens[1].Type)
	}
	// The punctuation token must carry the stripped trailing period.
	if tokens[1].Raw != "." {
		t.Fatalf("Tokenise(%q)[1].Raw = %q, want %q", "up to date.", tokens[1].Raw, ".")
	}
}
func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
setup(t)
tok := NewTokeniserForLang("fr")