fix: only retry startServer on process exit, not timeout
Distinguishes retryable failures (the process exited, e.g. because of a port conflict) from non-retryable ones (the 60s timeout elapsed, e.g. because the server is stuck). This avoids paying the timeout penalty up to 3x across retries. Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
c50a8e9e9b
commit
b7342ec819
1 changed file with 11 additions and 2 deletions
13
server.go
13
server.go
|
|
@@ -131,8 +131,17 @@ func startServer(binary, modelPath string, gpuLayers, ctxSize int) (*server, err
|
|||
return s, nil
|
||||
}
|
||||
|
||||
_ = s.stop()
|
||||
lastErr = fmt.Errorf("attempt %d: %w", attempt+1, err)
|
||||
// Only retry if the process actually exited (e.g. port conflict).
|
||||
// A timeout means the server is stuck, not a port issue.
|
||||
select {
|
||||
case <-s.exited:
|
||||
_ = s.stop()
|
||||
lastErr = fmt.Errorf("attempt %d: %w", attempt+1, err)
|
||||
continue
|
||||
default:
|
||||
_ = s.stop()
|
||||
return nil, fmt.Errorf("rocm: llama-server not ready: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("rocm: server failed after %d attempts: %w", maxAttempts, lastErr)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue