feat(process): expose health probe failure reasons
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
5142114e89
commit
214cf4cfa8
3 changed files with 70 additions and 5 deletions
27
health.go
27
health.go
|
|
@ -3,8 +3,10 @@ package process
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
|
@ -116,21 +118,40 @@ func (h *HealthServer) Addr() string {
|
|||
// WaitForHealth polls a health endpoint until it responds 200 or the timeout
|
||||
// (in milliseconds) expires. Returns true if healthy, false on timeout.
|
||||
func WaitForHealth(addr string, timeoutMs int) bool {
|
||||
ok, _ := ProbeHealth(addr, timeoutMs)
|
||||
return ok
|
||||
}
|
||||
|
||||
// ProbeHealth polls http://<addr>/health until it answers 200 OK or the
// timeout (in milliseconds) expires.
//
// It returns (true, "") as soon as a probe succeeds. Otherwise it returns
// false together with the most recently observed failure reason: the trimmed
// response body of a non-200 reply, the HTTP status line when that body is
// empty, or the transport error text when the request itself failed. If no
// probe ran before the deadline the reason is "health check timed out".
func ProbeHealth(addr string, timeoutMs int) (bool, string) {
	const (
		// pollInterval is the pause between two consecutive probes.
		pollInterval = 200 * time.Millisecond
		// maxReasonBytes caps how much of an unhealthy response body is read,
		// so a misbehaving daemon cannot make the prober buffer unbounded data.
		maxReasonBytes = 4 << 10
	)

	deadline := time.Now().Add(time.Duration(timeoutMs) * time.Millisecond)
	url := fmt.Sprintf("http://%s/health", addr)

	client := &http.Client{Timeout: 2 * time.Second}
	var lastReason string

	for time.Now().Before(deadline) {
		resp, err := client.Get(url)
		if err != nil {
			lastReason = err.Error()
		} else {
			// Read before closing; a read error is tolerated because the body
			// is only used as a best-effort diagnostic.
			body, _ := io.ReadAll(io.LimitReader(resp.Body, maxReasonBytes))
			_ = resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return true, ""
			}
			lastReason = strings.TrimSpace(string(body))
			if lastReason == "" {
				lastReason = resp.Status
			}
		}

		// Never sleep past the deadline: wait for the poll interval or for
		// whatever time is left, whichever is shorter.
		wait := time.Until(deadline)
		if wait <= 0 {
			break
		}
		if wait > pollInterval {
			wait = pollInterval
		}
		time.Sleep(wait)
	}

	if lastReason == "" {
		lastReason = "health check timed out"
	}
	return false, lastReason
}
|
||||
|
|
|
|||
|
|
@ -140,13 +140,14 @@ func (p *ProcessProvider) Describe() []api.RouteDescription {
|
|||
Method: "GET",
|
||||
Path: "/daemons/:code/:daemon/health",
|
||||
Summary: "Check daemon health",
|
||||
Description: "Probes the daemon's health endpoint and returns the result.",
|
||||
Description: "Probes the daemon's health endpoint and returns the result, including a failure reason when unhealthy.",
|
||||
Tags: []string{"process"},
|
||||
Response: map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"healthy": map[string]any{"type": "boolean"},
|
||||
"address": map[string]any{"type": "string"},
|
||||
"reason": map[string]any{"type": "string"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
|
@ -232,18 +233,22 @@ func (p *ProcessProvider) healthCheck(c *gin.Context) {
|
|||
return
|
||||
}
|
||||
|
||||
healthy := process.WaitForHealth(entry.Health, 2000)
|
||||
healthy, reason := process.ProbeHealth(entry.Health, 2000)
|
||||
|
||||
result := map[string]any{
|
||||
"healthy": healthy,
|
||||
"address": entry.Health,
|
||||
}
|
||||
if !healthy && reason != "" {
|
||||
result["reason"] = reason
|
||||
}
|
||||
|
||||
// Emit health event
|
||||
p.emitEvent("process.daemon.health", map[string]any{
|
||||
"code": code,
|
||||
"daemon": daemon,
|
||||
"healthy": healthy,
|
||||
"reason": reason,
|
||||
})
|
||||
|
||||
statusCode := http.StatusOK
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@ import (
|
|||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
goapi "dappco.re/go/core/api"
|
||||
|
|
@ -84,6 +86,43 @@ func TestProcessProvider_GetDaemon_Bad(t *testing.T) {
|
|||
assert.Equal(t, http.StatusNotFound, w.Code)
|
||||
}
|
||||
|
||||
func TestProcessProvider_HealthCheck_Bad(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
registry := newTestRegistry(dir)
|
||||
|
||||
healthSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
_, _ = w.Write([]byte("upstream health check failed"))
|
||||
}))
|
||||
defer healthSrv.Close()
|
||||
|
||||
hostPort := strings.TrimPrefix(healthSrv.URL, "http://")
|
||||
require.NoError(t, registry.Register(process.DaemonEntry{
|
||||
Code: "test",
|
||||
Daemon: "broken",
|
||||
PID: os.Getpid(),
|
||||
Health: hostPort,
|
||||
}))
|
||||
|
||||
p := processapi.NewProvider(registry, nil)
|
||||
|
||||
r := setupRouter(p)
|
||||
w := httptest.NewRecorder()
|
||||
req, _ := http.NewRequest("GET", "/api/process/daemons/test/broken/health", nil)
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
assert.Equal(t, http.StatusServiceUnavailable, w.Code)
|
||||
|
||||
var resp goapi.Response[map[string]any]
|
||||
err := json.Unmarshal(w.Body.Bytes(), &resp)
|
||||
require.NoError(t, err)
|
||||
require.True(t, resp.Success)
|
||||
|
||||
assert.Equal(t, false, resp.Data["healthy"])
|
||||
assert.Equal(t, hostPort, resp.Data["address"])
|
||||
assert.Equal(t, "upstream health check failed", resp.Data["reason"])
|
||||
}
|
||||
|
||||
func TestProcessProvider_RegistersAsRouteGroup_Good(t *testing.T) {
|
||||
p := processapi.NewProvider(nil, nil)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue