fix: only retry startServer on process exit, not timeout

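Distinguishes retryable failures (the process exited, e.g. because of a
port conflict) from non-retryable ones (the 60s readiness timeout expired,
e.g. because the server is stuck). This avoids paying the timeout penalty
three times over for a server that will never become ready.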

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Claude 2026-02-19 21:43:06 +00:00
parent c50a8e9e9b
commit b7342ec819
No known key found for this signature in database
GPG key ID: AF404715446AEB41

View file

@@ -131,8 +131,17 @@ func startServer(binary, modelPath string, gpuLayers, ctxSize int) (*server, err
return s, nil
}
_ = s.stop()
lastErr = fmt.Errorf("attempt %d: %w", attempt+1, err)
// Only retry if the process actually exited (e.g. port conflict).
// A timeout means the server is stuck, not a port issue.
select {
case <-s.exited:
_ = s.stop()
lastErr = fmt.Errorf("attempt %d: %w", attempt+1, err)
continue
default:
_ = s.stop()
return nil, fmt.Errorf("rocm: llama-server not ready: %w", err)
}
}
return nil, fmt.Errorf("rocm: server failed after %d attempts: %w", maxAttempts, lastErr)