feat: add ParallelSlots to LoadConfig for concurrent inference

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Claude 2026-02-19 23:12:29 +00:00
parent 07cd917259
commit 3719734f56
No known key found for this signature in database
GPG key ID: AF404715446AEB41

View file

@ -62,9 +62,10 @@ func ApplyGenerateOpts(opts []GenerateOption) GenerateConfig {
// LoadConfig holds model loading parameters.
type LoadConfig struct {
	Backend       string // "metal", "rocm", "llama_cpp" (empty = auto-detect)
	ContextLen    int    // Context window size (0 = model default)
	GPULayers     int    // Number of layers to offload to GPU (-1 = all, 0 = none)
	ParallelSlots int    // Number of concurrent inference slots (0 = server default)
}
// LoadOption configures model loading.
@ -86,6 +87,13 @@ func WithGPULayers(n int) LoadOption {
return func(c *LoadConfig) { c.GPULayers = n }
}
// WithParallelSlots sets the number of concurrent inference slots.
// Raising this permits simultaneous Generate/Chat calls at the cost of
// additional VRAM; leaving it at 0 keeps the server default (typically 1).
func WithParallelSlots(n int) LoadOption {
	return func(cfg *LoadConfig) {
		cfg.ParallelSlots = n
	}
}
// ApplyLoadOpts builds a LoadConfig from options.
func ApplyLoadOpts(opts []LoadOption) LoadConfig {
cfg := LoadConfig{