Automated fixes: interface{} → any, range-over-int, t.Context(),
wg.Go(), strings.SplitSeq, strings.Builder, slices.Contains,
maps helpers, min/max builtins.
Co-Authored-By: Virgil <virgil@lethean.io>
121 lines
3.8 KiB
Go
121 lines
3.8 KiB
Go
package inference
|
|
|
|
// GenerateConfig holds generation parameters.
//
// The zero value disables every optional feature; DefaultGenerateConfig
// provides the recommended starting point. Fields are normally set via
// GenerateOption functions rather than directly.
type GenerateConfig struct {
	MaxTokens     int     // Maximum number of tokens to generate.
	Temperature   float32 // Sampling temperature; 0 = greedy decoding.
	TopK          int     // Top-k sampling cutoff; 0 = disabled.
	TopP          float32 // Nucleus (top-p) sampling threshold; 0 = disabled.
	StopTokens    []int32 // Token IDs that terminate generation when emitted.
	RepeatPenalty float32 // Repetition penalty; 0 = disabled, 1.0 = no penalty.
	ReturnLogits  bool    // Return raw logits in ClassifyResult (default false).
}
|
|
|
|
// DefaultGenerateConfig returns sensible defaults.
|
|
func DefaultGenerateConfig() GenerateConfig {
|
|
return GenerateConfig{
|
|
MaxTokens: 256,
|
|
Temperature: 0.0, // greedy
|
|
}
|
|
}
|
|
|
|
// GenerateOption configures text generation.
//
// Each option mutates a GenerateConfig; options are applied in order
// by ApplyGenerateOpts, so later options override earlier ones.
type GenerateOption func(*GenerateConfig)
|
|
|
|
// WithMaxTokens sets the maximum number of tokens to generate.
|
|
func WithMaxTokens(n int) GenerateOption {
|
|
return func(c *GenerateConfig) { c.MaxTokens = n }
|
|
}
|
|
|
|
// WithTemperature sets the sampling temperature. 0 = greedy.
|
|
func WithTemperature(t float32) GenerateOption {
|
|
return func(c *GenerateConfig) { c.Temperature = t }
|
|
}
|
|
|
|
// WithTopK sets top-k sampling. 0 = disabled.
|
|
func WithTopK(k int) GenerateOption {
|
|
return func(c *GenerateConfig) { c.TopK = k }
|
|
}
|
|
|
|
// WithTopP sets nucleus sampling threshold. 0 = disabled.
|
|
func WithTopP(p float32) GenerateOption {
|
|
return func(c *GenerateConfig) { c.TopP = p }
|
|
}
|
|
|
|
// WithStopTokens sets token IDs that stop generation.
|
|
func WithStopTokens(ids ...int32) GenerateOption {
|
|
return func(c *GenerateConfig) { c.StopTokens = ids }
|
|
}
|
|
|
|
// WithRepeatPenalty sets the repetition penalty. 0 = disabled, 1.0 = no penalty.
|
|
func WithRepeatPenalty(p float32) GenerateOption {
|
|
return func(c *GenerateConfig) { c.RepeatPenalty = p }
|
|
}
|
|
|
|
// WithLogits requests raw logits in ClassifyResult. Off by default to save memory.
|
|
func WithLogits() GenerateOption {
|
|
return func(c *GenerateConfig) { c.ReturnLogits = true }
|
|
}
|
|
|
|
// ApplyGenerateOpts builds a GenerateConfig from options.
|
|
func ApplyGenerateOpts(opts []GenerateOption) GenerateConfig {
|
|
cfg := DefaultGenerateConfig()
|
|
for _, o := range opts {
|
|
o(&cfg)
|
|
}
|
|
return cfg
|
|
}
|
|
|
|
// LoadConfig holds model loading parameters.
//
// Fields are normally set via LoadOption functions and assembled with
// ApplyLoadOpts rather than constructed directly.
type LoadConfig struct {
	Backend       string // "metal", "rocm", "llama_cpp" (empty = auto-detect)
	ContextLen    int    // Context window size (0 = model default)
	GPULayers     int    // Number of layers to offload to GPU (-1 = all, 0 = none)
	ParallelSlots int    // Number of concurrent inference slots (0 = server default)
	AdapterPath   string // Path to LoRA adapter directory (empty = no adapter)
}
|
|
|
|
// LoadOption configures model loading.
//
// Each option mutates a LoadConfig; options are applied in order by
// ApplyLoadOpts, so later options override earlier ones.
type LoadOption func(*LoadConfig)
|
|
|
|
// WithBackend selects a specific inference backend by name.
|
|
func WithBackend(name string) LoadOption {
|
|
return func(c *LoadConfig) { c.Backend = name }
|
|
}
|
|
|
|
// WithContextLen sets the context window size.
|
|
func WithContextLen(n int) LoadOption {
|
|
return func(c *LoadConfig) { c.ContextLen = n }
|
|
}
|
|
|
|
// WithGPULayers sets how many layers to offload to GPU.
|
|
// -1 means all layers (full GPU offload).
|
|
func WithGPULayers(n int) LoadOption {
|
|
return func(c *LoadConfig) { c.GPULayers = n }
|
|
}
|
|
|
|
// WithParallelSlots sets the number of concurrent inference slots.
|
|
// Higher values allow parallel Generate/Chat calls but increase VRAM usage.
|
|
// 0 or unset uses the server default (typically 1).
|
|
func WithParallelSlots(n int) LoadOption {
|
|
return func(c *LoadConfig) { c.ParallelSlots = n }
|
|
}
|
|
|
|
// WithAdapterPath sets the path to a LoRA adapter directory.
|
|
// The directory should contain adapter_config.json and adapter safetensors files.
|
|
// The adapter weights are loaded and injected into the model at load time,
|
|
// enabling inference with a fine-tuned adapter without fusing/merging first.
|
|
func WithAdapterPath(path string) LoadOption {
|
|
return func(c *LoadConfig) { c.AdapterPath = path }
|
|
}
|
|
|
|
// ApplyLoadOpts builds a LoadConfig from options.
|
|
func ApplyLoadOpts(opts []LoadOption) LoadConfig {
|
|
cfg := LoadConfig{
|
|
GPULayers: -1, // default: full GPU offload
|
|
}
|
|
for _, o := range opts {
|
|
o(&cfg)
|
|
}
|
|
return cfg
|
|
}
|