diff --git a/health.go b/health.go index fd6adfe..923fbed 100644 --- a/health.go +++ b/health.go @@ -3,8 +3,10 @@ package process import ( "context" "fmt" + "io" "net" "net/http" + "strings" "sync" "time" @@ -116,21 +118,40 @@ func (h *HealthServer) Addr() string { // WaitForHealth polls a health endpoint until it responds 200 or the timeout // (in milliseconds) expires. Returns true if healthy, false on timeout. func WaitForHealth(addr string, timeoutMs int) bool { + ok, _ := ProbeHealth(addr, timeoutMs) + return ok +} + +// ProbeHealth polls a health endpoint until it responds 200 or the timeout +// (in milliseconds) expires. Returns the health status and the last observed +// failure reason if the endpoint never becomes healthy. +func ProbeHealth(addr string, timeoutMs int) (bool, string) { deadline := time.Now().Add(time.Duration(timeoutMs) * time.Millisecond) url := fmt.Sprintf("http://%s/health", addr) client := &http.Client{Timeout: 2 * time.Second} + var lastReason string for time.Now().Before(deadline) { resp, err := client.Get(url) if err == nil { - resp.Body.Close() + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) // cap the read: the body is only used as a short failure reason + _ = resp.Body.Close() if resp.StatusCode == http.StatusOK { - return true + return true, "" } + lastReason = strings.TrimSpace(string(body)) + if lastReason == "" { + lastReason = resp.Status + } + } else { + lastReason = err.Error() } time.Sleep(200 * time.Millisecond) } - return false + if lastReason == "" { + lastReason = "health check timed out" + } + return false, lastReason } diff --git a/pkg/api/provider.go b/pkg/api/provider.go index 4397912..e2a9183 100644 --- a/pkg/api/provider.go +++ b/pkg/api/provider.go @@ -140,13 +140,14 @@ func (p *ProcessProvider) Describe() []api.RouteDescription { Method: "GET", Path: "/daemons/:code/:daemon/health", Summary: "Check daemon health", - Description: "Probes the daemon's health endpoint and returns the result.", + Description: "Probes the daemon's health endpoint and returns the result, including a failure 
reason when unhealthy.", Tags: []string{"process"}, Response: map[string]any{ "type": "object", "properties": map[string]any{ "healthy": map[string]any{"type": "boolean"}, "address": map[string]any{"type": "string"}, + "reason": map[string]any{"type": "string"}, }, }, }, @@ -232,18 +233,22 @@ func (p *ProcessProvider) healthCheck(c *gin.Context) { return } - healthy := process.WaitForHealth(entry.Health, 2000) + healthy, reason := process.ProbeHealth(entry.Health, 2000) result := map[string]any{ "healthy": healthy, "address": entry.Health, } + if !healthy && reason != "" { + result["reason"] = reason + } // Emit health event p.emitEvent("process.daemon.health", map[string]any{ "code": code, "daemon": daemon, "healthy": healthy, + "reason": reason, }) statusCode := http.StatusOK diff --git a/pkg/api/provider_test.go b/pkg/api/provider_test.go index eac3592..6a6f42c 100644 --- a/pkg/api/provider_test.go +++ b/pkg/api/provider_test.go @@ -6,6 +6,8 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "os" + "strings" "testing" goapi "dappco.re/go/core/api" @@ -84,6 +86,43 @@ func TestProcessProvider_GetDaemon_Bad(t *testing.T) { assert.Equal(t, http.StatusNotFound, w.Code) } +func TestProcessProvider_HealthCheck_Bad(t *testing.T) { + dir := t.TempDir() + registry := newTestRegistry(dir) + + healthSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + _, _ = w.Write([]byte("upstream health check failed")) + })) + defer healthSrv.Close() + + hostPort := strings.TrimPrefix(healthSrv.URL, "http://") + require.NoError(t, registry.Register(process.DaemonEntry{ + Code: "test", + Daemon: "broken", + PID: os.Getpid(), + Health: hostPort, + })) + + p := processapi.NewProvider(registry, nil) + + r := setupRouter(p) + w := httptest.NewRecorder() + req, _ := http.NewRequest("GET", "/api/process/daemons/test/broken/health", nil) + r.ServeHTTP(w, req) + + assert.Equal(t, 
http.StatusServiceUnavailable, w.Code) + + var resp goapi.Response[map[string]any] + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + require.True(t, resp.Success) + + assert.Equal(t, false, resp.Data["healthy"]) + assert.Equal(t, hostPort, resp.Data["address"]) + assert.Equal(t, "upstream health check failed", resp.Data["reason"]) +} + func TestProcessProvider_RegistersAsRouteGroup_Good(t *testing.T) { p := processapi.NewProvider(nil, nil)