fix: handle both string and array merge formats in tokenizer

Gemma 3's tokenizer.json stores merges as an array of pairs, [["a","b"], ...],
instead of an array of space-joined strings, ["a b", ...]. Support both formats.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-02-16 02:02:55 +00:00 committed by Snider
parent 4d3e54c81a
commit 7fc1571f93

View file

@ -31,7 +31,7 @@ type tokenizerJSON struct {
Model struct { Model struct {
Type string `json:"type"` Type string `json:"type"`
Vocab json.RawMessage `json:"vocab"` Vocab json.RawMessage `json:"vocab"`
Merges []string `json:"merges"` Merges json.RawMessage `json:"merges"`
ByteFallback bool `json:"byte_fallback"` ByteFallback bool `json:"byte_fallback"`
} `json:"model"` } `json:"model"`
AddedTokens []struct { AddedTokens []struct {
@ -69,13 +69,29 @@ func Load(path string) (*Tokenizer, error) {
t.invVocab[v] = k t.invVocab[v] = k
} }
// Parse merges // Parse merges — supports both ["a b", ...] and [["a","b"], ...] formats
for rank, merge := range tj.Model.Merges { if len(tj.Model.Merges) > 0 {
// Try array-of-strings first
var stringMerges []string
if err := json.Unmarshal(tj.Model.Merges, &stringMerges); err == nil {
for rank, merge := range stringMerges {
parts := strings.SplitN(merge, " ", 2) parts := strings.SplitN(merge, " ", 2)
if len(parts) == 2 { if len(parts) == 2 {
t.merges = append(t.merges, mergePair{a: parts[0], b: parts[1], rank: rank}) t.merges = append(t.merges, mergePair{a: parts[0], b: parts[1], rank: rank})
} }
} }
} else {
// Try array-of-arrays: [["a","b"], ...]
var arrayMerges [][]string
if err := json.Unmarshal(tj.Model.Merges, &arrayMerges); err == nil {
for rank, pair := range arrayMerges {
if len(pair) == 2 {
t.merges = append(t.merges, mergePair{a: pair[0], b: pair[1], rank: rank})
}
}
}
}
}
// Parse special tokens // Parse special tokens
for _, tok := range tj.AddedTokens { for _, tok := range tj.AddedTokens {