feat: pass --parallel N to llama-server for concurrent inference slots

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-19 23:13:19 +00:00 · 2026-02-19 23:13:19 +00:00 · 72120bb200
commit 72120bb200
parent 4b6cffb9c4
3 changed files with 6 additions and 3 deletions
--- a/backend.go
+++ b/backend.go
@ -49,7 +49,7 @@ func (b *rocmBackend) LoadModel(path string, opts ...inference.LoadOption) (infe
 		ctxLen = int(min(meta.ContextLength, 4096))
 	}

-	srv, err := startServer(binary, path, cfg.GPULayers, ctxLen)
+	srv, err := startServer(binary, path, cfg.GPULayers, ctxLen, cfg.ParallelSlots)
 	if err != nil {
 		return nil, err
 	}
--- a/server.go
+++ b/server.go
@ -81,7 +81,7 @@ func serverEnv() []string {
 // startServer spawns llama-server and waits for it to become ready.
 // It selects a free port automatically, retrying up to 3 times if the
 // process exits during startup (e.g. port conflict).
-func startServer(binary, modelPath string, gpuLayers, ctxSize int) (*server, error) {
+func startServer(binary, modelPath string, gpuLayers, ctxSize, parallelSlots int) (*server, error) {
 	if gpuLayers < 0 {
 		gpuLayers = 999
 	}
@ -104,6 +104,9 @@ func startServer(binary, modelPath string, gpuLayers, ctxSize int) (*server, err
 		if ctxSize > 0 {
 			args = append(args, "--ctx-size", strconv.Itoa(ctxSize))
 		}
+		if parallelSlots > 0 {
+			args = append(args, "--parallel", strconv.Itoa(parallelSlots))
+		}

 		cmd := exec.Command(binary, args...)
 		cmd.Env = serverEnv()
--- a/server_test.go
+++ b/server_test.go
@ -114,7 +114,7 @@ func TestGenerate_ServerDead(t *testing.T) {
 func TestStartServer_RetriesOnProcessExit(t *testing.T) {
 	// /bin/false starts successfully but exits immediately with code 1.
 	// startServer should retry up to 3 times, then fail.
-	_, err := startServer("/bin/false", "/nonexistent/model.gguf", 999, 0)
+	_, err := startServer("/bin/false", "/nonexistent/model.gguf", 999, 0, 0)
 	require.Error(t, err)
 	assert.Contains(t, err.Error(), "failed after 3 attempts")
 }