feat: pass --parallel N to llama-server for concurrent inference slots

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Claude 2026-02-19 23:13:19 +00:00
parent 4b6cffb9c4
commit 72120bb200
No known key found for this signature in database
GPG key ID: AF404715446AEB41
3 changed files with 6 additions and 3 deletions

View file

@ -49,7 +49,7 @@ func (b *rocmBackend) LoadModel(path string, opts ...inference.LoadOption) (infe
ctxLen = int(min(meta.ContextLength, 4096))
}
srv, err := startServer(binary, path, cfg.GPULayers, ctxLen)
srv, err := startServer(binary, path, cfg.GPULayers, ctxLen, cfg.ParallelSlots)
if err != nil {
return nil, err
}

View file

@ -81,7 +81,7 @@ func serverEnv() []string {
// startServer spawns llama-server and waits for it to become ready.
// It selects a free port automatically, retrying up to 3 times if the
// process exits during startup (e.g. port conflict).
func startServer(binary, modelPath string, gpuLayers, ctxSize int) (*server, error) {
func startServer(binary, modelPath string, gpuLayers, ctxSize, parallelSlots int) (*server, error) {
if gpuLayers < 0 {
gpuLayers = 999
}
@ -104,6 +104,9 @@ func startServer(binary, modelPath string, gpuLayers, ctxSize int) (*server, err
if ctxSize > 0 {
args = append(args, "--ctx-size", strconv.Itoa(ctxSize))
}
if parallelSlots > 0 {
args = append(args, "--parallel", strconv.Itoa(parallelSlots))
}
cmd := exec.Command(binary, args...)
cmd.Env = serverEnv()

View file

@ -114,7 +114,7 @@ func TestGenerate_ServerDead(t *testing.T) {
// TestStartServer_RetriesOnProcessExit verifies that startServer gives up
// with an error containing "failed after 3 attempts" when the launched
// binary exits during startup.
func TestStartServer_RetriesOnProcessExit(t *testing.T) {
// /bin/false starts successfully but exits immediately with code 1.
// startServer should retry up to 3 times, then fail.
//
// Arguments after the model path are gpuLayers, ctxSize and (in the updated
// signature) parallelSlots. Zero for ctxSize and parallelSlots means the
// corresponding --ctx-size / --parallel flags are not passed at all.
// NOTE(review): the two calls below appear to be the before/after lines of
// this diff hunk; only the five-argument form matches the new signature.
_, err := startServer("/bin/false", "/nonexistent/model.gguf", 999, 0)
_, err := startServer("/bin/false", "/nonexistent/model.gguf", 999, 0, 0)
// The call must fail, and the message must report that retries ran out.
require.Error(t, err)
assert.Contains(t, err.Error(), "failed after 3 attempts")
}