From a4fde16998f739fd5e22d7261473e7be04f4f269 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 16 Feb 2026 02:02:55 +0000 Subject: [PATCH] fix: handle both string and array merge formats in tokenizer Gemma 3 tokenizer.json uses [["a","b"],...] format for merges instead of the ["a b",...] format. Support both. Co-Authored-By: Claude Opus 4.6 --- pkg/mlx/tokenizer/tokenizer.go | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/pkg/mlx/tokenizer/tokenizer.go b/pkg/mlx/tokenizer/tokenizer.go index 4a1258a9..9dd9450c 100644 --- a/pkg/mlx/tokenizer/tokenizer.go +++ b/pkg/mlx/tokenizer/tokenizer.go @@ -29,10 +29,10 @@ type mergePair struct { // tokenizerJSON is the HuggingFace tokenizer.json format. type tokenizerJSON struct { Model struct { - Type string `json:"type"` - Vocab json.RawMessage `json:"vocab"` - Merges []string `json:"merges"` - ByteFallback bool `json:"byte_fallback"` + Type string `json:"type"` + Vocab json.RawMessage `json:"vocab"` + Merges json.RawMessage `json:"merges"` + ByteFallback bool `json:"byte_fallback"` } `json:"model"` AddedTokens []struct { ID int32 `json:"id"` @@ -69,11 +69,27 @@ func Load(path string) (*Tokenizer, error) { t.invVocab[v] = k } - // Parse merges - for rank, merge := range tj.Model.Merges { - parts := strings.SplitN(merge, " ", 2) - if len(parts) == 2 { - t.merges = append(t.merges, mergePair{a: parts[0], b: parts[1], rank: rank}) + // Parse merges — supports both ["a b", ...] and [["a","b"], ...] formats + if len(tj.Model.Merges) > 0 { + // Try array-of-strings first + var stringMerges []string + if err := json.Unmarshal(tj.Model.Merges, &stringMerges); err == nil { + for rank, merge := range stringMerges { + parts := strings.SplitN(merge, " ", 2) + if len(parts) == 2 { + t.merges = append(t.merges, mergePair{a: parts[0], b: parts[1], rank: rank}) + } + } + } else { + // Try array-of-arrays: [["a","b"], ...] + var arrayMerges [][]string + if err := json.Unmarshal(tj.Model.Merges, &arrayMerges); err == nil { + for rank, pair := range arrayMerges { + if len(pair) == 2 { + t.merges = append(t.merges, mergePair{a: pair[0], b: pair[1], rank: rank}) + } + } + } } }