go/pkg/process/supervisor_test.go
Snider 8410093400 feat(process): add Supervisor for managed service lifecycle
Adds a Supervisor layer to pkg/process that manages long-running
processes and goroutines with automatic restart, panic recovery,
and graceful shutdown. Supports both external processes (DaemonSpec)
and Go functions (GoSpec) with configurable restart policies.

Also exposes AddHealthCheck on the Daemon struct so supervised
services can wire their status into the daemon health endpoint.

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-17 16:14:49 +00:00

335 lines
7 KiB
Go

package process
import (
"context"
"fmt"
"sync/atomic"
"testing"
"time"
)
// TestSupervisor_GoFunc_Good registers one goroutine unit that blocks until
// its context is cancelled, then checks that the supervisor reports the unit
// as running with type "goroutine" and that the function was invoked.
func TestSupervisor_GoFunc_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the unit is stopped even if t.Fatal aborts the test early.
	defer sup.Stop()

	var count atomic.Int32
	sup.RegisterFunc(GoSpec{
		Name: "counter",
		Func: func(ctx context.Context) error {
			count.Add(1)
			<-ctx.Done()
			return nil
		},
		// NOTE(review): MaxRestarts -1 presumably means "no limit" — confirm
		// against RestartPolicy.
		Restart: RestartPolicy{Delay: 10 * time.Millisecond, MaxRestarts: -1},
	})

	sup.Start()
	// Give the supervisor time to launch the unit.
	time.Sleep(50 * time.Millisecond)

	status, err := sup.Status("counter")
	if err != nil {
		t.Fatal(err)
	}
	if !status.Running {
		t.Error("expected counter to be running")
	}
	if status.Type != "goroutine" {
		t.Errorf("expected type goroutine, got %s", status.Type)
	}

	// The counter is bumped before the function blocks on ctx.Done(), so it
	// can be checked while the unit is still running.
	if c := count.Load(); c < 1 {
		t.Errorf("expected counter >= 1, got %d", c)
	}
}
// TestSupervisor_GoFunc_Restart_Good registers a unit whose function returns
// an error on its first three invocations and then blocks until cancelled.
// It checks that the supervisor records at least three restarts and that the
// unit ends up running.
func TestSupervisor_GoFunc_Restart_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the unit is stopped even if t.Fatal aborts the test early.
	defer sup.Stop()

	var runs atomic.Int32
	sup.RegisterFunc(GoSpec{
		Name: "crasher",
		Func: func(ctx context.Context) error {
			n := runs.Add(1)
			if n <= 3 {
				return fmt.Errorf("crash #%d", n)
			}
			// After 3 crashes, stay running.
			<-ctx.Done()
			return nil
		},
		Restart: RestartPolicy{Delay: 5 * time.Millisecond, MaxRestarts: -1},
	})

	sup.Start()
	// Wait for the three failing runs and their restart delays to elapse.
	time.Sleep(200 * time.Millisecond)

	// A lookup failure must fail the test rather than be silently ignored.
	status, err := sup.Status("crasher")
	if err != nil {
		t.Fatal(err)
	}
	if status.RestartCount < 3 {
		t.Errorf("expected at least 3 restarts, got %d", status.RestartCount)
	}
	if !status.Running {
		t.Error("expected crasher to be running after recovering")
	}
}
// TestSupervisor_GoFunc_MaxRestarts_Good registers a unit whose function
// always returns an error, with MaxRestarts set to 2, and checks that the
// unit is no longer running and that its restart counter stays bounded.
func TestSupervisor_GoFunc_MaxRestarts_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the supervisor is stopped even if t.Fatal aborts the test.
	defer sup.Stop()

	sup.RegisterFunc(GoSpec{
		Name: "limited",
		Func: func(ctx context.Context) error {
			return fmt.Errorf("always fail")
		},
		Restart: RestartPolicy{Delay: 5 * time.Millisecond, MaxRestarts: 2},
	})

	sup.Start()
	// Leave enough time for the initial run and every permitted restart.
	time.Sleep(200 * time.Millisecond)

	// A lookup failure must fail the test rather than be silently ignored.
	status, err := sup.Status("limited")
	if err != nil {
		t.Fatal(err)
	}
	if status.Running {
		t.Error("expected limited to have stopped after max restarts")
	}
	// Only an upper bound is asserted: one initial run plus two restarts gives
	// at most three exits.
	// NOTE(review): this presumes restartCount increments each time the
	// function exits — confirm against the Supervisor implementation.
	if status.RestartCount > 3 {
		t.Errorf("expected restartCount <= 3, got %d", status.RestartCount)
	}
}
// TestSupervisor_GoFunc_Panic_Good registers a unit whose function panics on
// its first invocation and blocks until cancelled on later ones. It checks
// that the unit is reported as running afterwards and that the function was
// invoked at least twice.
func TestSupervisor_GoFunc_Panic_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the unit is stopped even if t.Fatal aborts the test early.
	defer sup.Stop()

	var runs atomic.Int32
	sup.RegisterFunc(GoSpec{
		Name: "panicker",
		Func: func(ctx context.Context) error {
			// Only the very first invocation panics.
			if n := runs.Add(1); n == 1 {
				panic("boom")
			}
			<-ctx.Done()
			return nil
		},
		Restart: RestartPolicy{Delay: 5 * time.Millisecond, MaxRestarts: 3},
	})

	sup.Start()
	// Allow time for the panic, the restart delay and the second run.
	time.Sleep(100 * time.Millisecond)

	// A lookup failure must fail the test rather than be silently ignored.
	status, err := sup.Status("panicker")
	if err != nil {
		t.Fatal(err)
	}
	if !status.Running {
		t.Error("expected panicker to recover and be running")
	}
	if runs.Load() < 2 {
		t.Error("expected at least 2 runs (1 panic + 1 recovery)")
	}
}
// TestSupervisor_Statuses_Good starts two blocking goroutine units, "a" and
// "b", and checks that Statuses returns an entry for each and that both are
// reported as running.
func TestSupervisor_Statuses_Good(t *testing.T) {
	// blockUntilCancelled parks until the supplied context is done.
	blockUntilCancelled := func(ctx context.Context) error {
		<-ctx.Done()
		return nil
	}

	supervisor := NewSupervisor(nil)
	for _, unit := range []string{"a", "b"} {
		supervisor.RegisterFunc(GoSpec{
			Name:    unit,
			Func:    blockUntilCancelled,
			Restart: RestartPolicy{MaxRestarts: -1},
		})
	}

	supervisor.Start()
	time.Sleep(50 * time.Millisecond)

	all := supervisor.Statuses()
	if got := len(all); got != 2 {
		t.Errorf("expected 2 statuses, got %d", got)
	}
	if !(all["a"].Running && all["b"].Running) {
		t.Error("expected both units running")
	}

	supervisor.Stop()
}
// TestSupervisor_UnitNames_Good registers two units without starting the
// supervisor and checks that UnitNames reports two entries.
func TestSupervisor_UnitNames_Good(t *testing.T) {
	// blockUntilCancelled parks until the supplied context is done.
	blockUntilCancelled := func(ctx context.Context) error {
		<-ctx.Done()
		return nil
	}

	supervisor := NewSupervisor(nil)
	for _, unit := range []string{"alpha", "beta"} {
		supervisor.RegisterFunc(GoSpec{
			Name: unit,
			Func: blockUntilCancelled,
		})
	}

	if got := len(supervisor.UnitNames()); got != 2 {
		t.Errorf("expected 2 names, got %d", got)
	}
}
// TestSupervisor_Status_Bad checks that asking an empty supervisor for the
// status of an unregistered unit yields an error.
func TestSupervisor_Status_Bad(t *testing.T) {
	supervisor := NewSupervisor(nil)
	if _, lookupErr := supervisor.Status("nonexistent"); lookupErr == nil {
		t.Error("expected error for nonexistent unit")
	}
}
// TestSupervisor_Restart_Good starts a blocking unit, asks the supervisor to
// restart it by name, and checks that the unit's function has been invoked
// at least twice afterwards.
func TestSupervisor_Restart_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the unit is stopped even if t.Fatal aborts the test early.
	defer sup.Stop()

	var runs atomic.Int32
	sup.RegisterFunc(GoSpec{
		Name: "restartable",
		Func: func(ctx context.Context) error {
			runs.Add(1)
			<-ctx.Done()
			return nil
		},
		Restart: RestartPolicy{Delay: 5 * time.Millisecond, MaxRestarts: -1},
	})

	sup.Start()
	// Let the first run begin before requesting a restart.
	time.Sleep(50 * time.Millisecond)

	if err := sup.Restart("restartable"); err != nil {
		t.Fatal(err)
	}

	// Allow time for the restart delay and the second run to begin.
	time.Sleep(100 * time.Millisecond)
	if n := runs.Load(); n < 2 {
		t.Errorf("expected at least 2 runs after restart, got %d", n)
	}
}
// TestSupervisor_Restart_Bad checks that restarting an unregistered unit on
// an empty supervisor yields an error.
func TestSupervisor_Restart_Bad(t *testing.T) {
	supervisor := NewSupervisor(nil)
	if restartErr := supervisor.Restart("nonexistent"); restartErr == nil {
		t.Error("expected error for nonexistent unit")
	}
}
// TestSupervisor_StopUnit_Good starts a blocking unit, stops it by name, and
// checks that the supervisor no longer reports it as running.
func TestSupervisor_StopUnit_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the supervisor is stopped even if t.Fatal aborts the test.
	defer sup.Stop()

	sup.RegisterFunc(GoSpec{
		Name: "stoppable",
		Func: func(ctx context.Context) error {
			<-ctx.Done()
			return nil
		},
		Restart: RestartPolicy{Delay: 5 * time.Millisecond, MaxRestarts: -1},
	})

	sup.Start()
	// Let the unit start before stopping it.
	time.Sleep(50 * time.Millisecond)

	if err := sup.StopUnit("stoppable"); err != nil {
		t.Fatal(err)
	}

	// Give the unit time to wind down before inspecting its status.
	time.Sleep(100 * time.Millisecond)

	// A lookup failure must fail the test rather than be silently ignored.
	status, err := sup.Status("stoppable")
	if err != nil {
		t.Fatal(err)
	}
	if status.Running {
		t.Error("expected unit to be stopped")
	}
}
// TestSupervisor_StopUnit_Bad checks that stopping an unregistered unit on an
// empty supervisor yields an error.
func TestSupervisor_StopUnit_Bad(t *testing.T) {
	supervisor := NewSupervisor(nil)
	if stopErr := supervisor.StopUnit("nonexistent"); stopErr == nil {
		t.Error("expected error for nonexistent unit")
	}
}
// TestSupervisor_StartIdempotent_Good calls Start three times in a row and
// checks that the registered function was invoked exactly once, i.e. the
// repeated calls did not launch the unit again.
func TestSupervisor_StartIdempotent_Good(t *testing.T) {
	var invocations atomic.Int32
	countThenBlock := func(ctx context.Context) error {
		invocations.Add(1)
		<-ctx.Done()
		return nil
	}

	supervisor := NewSupervisor(nil)
	supervisor.RegisterFunc(GoSpec{
		Name: "once",
		Func: countThenBlock,
	})

	// Only the first call should have any effect; the rest should be no-ops.
	for attempt := 0; attempt < 3; attempt++ {
		supervisor.Start()
	}
	time.Sleep(50 * time.Millisecond)

	if got := invocations.Load(); got != 1 {
		t.Errorf("expected exactly 1 run, got %d", got)
	}

	supervisor.Stop()
}
// TestSupervisor_NoRestart_Good registers a unit that returns immediately
// with MaxRestarts set to 0 and checks that it is not running afterwards and
// that its function was invoked exactly once.
func TestSupervisor_NoRestart_Good(t *testing.T) {
	sup := NewSupervisor(nil)
	// Deferred so the supervisor is stopped even if t.Fatal aborts the test.
	defer sup.Stop()

	var runs atomic.Int32
	sup.RegisterFunc(GoSpec{
		Name: "oneshot",
		Func: func(ctx context.Context) error {
			runs.Add(1)
			return nil // Exit immediately.
		},
		Restart: RestartPolicy{Delay: 5 * time.Millisecond, MaxRestarts: 0},
	})

	sup.Start()
	// Long enough that any unwanted restart (5ms delay) would have happened.
	time.Sleep(100 * time.Millisecond)

	// A lookup failure must fail the test rather than be silently ignored.
	status, err := sup.Status("oneshot")
	if err != nil {
		t.Fatal(err)
	}
	if status.Running {
		t.Error("expected oneshot to not be running")
	}

	// The function should run once (the initial run) and never be restarted.
	// NOTE(review): restartCount is presumably 1 at this point (incremented
	// when the initial run exits); it is not asserted here — confirm.
	if n := runs.Load(); n != 1 {
		t.Errorf("expected exactly 1 run, got %d", n)
	}
}
// TestSupervisor_Register_Ugly checks that registering a DaemonSpec on a
// supervisor created with a nil argument panics.
func TestSupervisor_Register_Ugly(t *testing.T) {
	supervisor := NewSupervisor(nil)

	spec := DaemonSpec{
		Name:       "will-panic",
		RunOptions: RunOptions{Command: "echo"},
	}

	// The test passes only if Register panics; a normal return is a failure.
	defer func() {
		if recovered := recover(); recovered != nil {
			return
		}
		t.Error("expected panic when registering process daemon without service")
	}()

	supervisor.Register(spec)
}