fix: handle both string and array merge formats in tokenizer
Gemma 3's tokenizer.json stores BPE merges as an array of two-element arrays ([["a","b"], ...]) rather than as an array of space-separated strings (["a b", ...]). Support both formats when loading. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4d3e54c81a
commit
7fc1571f93
1 changed file with 25 additions and 9 deletions
|
|
@@ -31,7 +31,7 @@ type tokenizerJSON struct {
|
||||||
Model struct {
|
Model struct {
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Vocab json.RawMessage `json:"vocab"`
|
Vocab json.RawMessage `json:"vocab"`
|
||||||
Merges []string `json:"merges"`
|
Merges json.RawMessage `json:"merges"`
|
||||||
ByteFallback bool `json:"byte_fallback"`
|
ByteFallback bool `json:"byte_fallback"`
|
||||||
} `json:"model"`
|
} `json:"model"`
|
||||||
AddedTokens []struct {
|
AddedTokens []struct {
|
||||||
|
|
@@ -69,13 +69,29 @@ func Load(path string) (*Tokenizer, error) {
|
||||||
t.invVocab[v] = k
|
t.invVocab[v] = k
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse merges
|
// Parse merges — supports both ["a b", ...] and [["a","b"], ...] formats
|
||||||
for rank, merge := range tj.Model.Merges {
|
if len(tj.Model.Merges) > 0 {
|
||||||
|
// Try array-of-strings first
|
||||||
|
var stringMerges []string
|
||||||
|
if err := json.Unmarshal(tj.Model.Merges, &stringMerges); err == nil {
|
||||||
|
for rank, merge := range stringMerges {
|
||||||
parts := strings.SplitN(merge, " ", 2)
|
parts := strings.SplitN(merge, " ", 2)
|
||||||
if len(parts) == 2 {
|
if len(parts) == 2 {
|
||||||
t.merges = append(t.merges, mergePair{a: parts[0], b: parts[1], rank: rank})
|
t.merges = append(t.merges, mergePair{a: parts[0], b: parts[1], rank: rank})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Try array-of-arrays: [["a","b"], ...]
|
||||||
|
var arrayMerges [][]string
|
||||||
|
if err := json.Unmarshal(tj.Model.Merges, &arrayMerges); err == nil {
|
||||||
|
for rank, pair := range arrayMerges {
|
||||||
|
if len(pair) == 2 {
|
||||||
|
t.merges = append(t.merges, mergePair{a: pair[0], b: pair[1], rank: rank})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Parse special tokens
|
// Parse special tokens
|
||||||
for _, tok := range tj.AddedTokens {
|
for _, tok := range tj.AddedTokens {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue