agent/tests/cli/restart/Taskfile.yaml
Snider c6415aa53a feat(agent/state): OnStartup queue+registry restore from .core/db.duckdb (#537)
Per RFC §15.3: restart was losing in-flight queue + workspace registry.
"Ghost agents" and "lost queue" pain now fixed.

Lands:
* pkg/agentic/persist.go (NEW):
  - OnStartup(ctx, c): opens .core/db.duckdb via go-store, restores
    registry/queue/concurrency groups
  - Dead-PID detection: registry entries with status=running but
    !pidAlive(PID) → marked failed with question="dead worker on
    restart"; status.json files re-written to disk
  - Orphaned workspace cleanup: walk .core/workspace/, dir-exists +
    registry-says-completed → delete
  - OnShutdown(ctx): flushes in-memory registry + queue back to store
    before close
* pkg/agentic/prep.go — PrepSubsystem.OnStartup/OnShutdown wired
* pkg/agentic/persist_test.go — AX-10 covering queue restore,
  dead-worker reaping, shutdown persistence, invalid-store-payload,
  orphan cleanup
* tests/cli/restart/Taskfile.yaml — extended smoke seeds DuckDB state
  for queued workspace + dead running worker, asserts status.json
  reflects restore correctly

The sandbox could not run `go test` because go.work has conflicting
dappco.re/go/api replacements (pre-existing); gofmt is clean. The
supervisor's clean workspace is expected to catch any test failures.

Co-authored-by: Codex <noreply@openai.com>
Closes tasks.lthn.sh/view.php?id=537
2026-04-26 00:04:56 +01:00

102 lines
4.3 KiB
YAML

version: "3"
# RFC §15.7 — `tests/cli/restart/` validates that dispatch state survives a
# kill+restart cycle without leaving ghost agents in the registry. The
# Taskfile path mirrors the surface under test (`core-agent restart` is not a
# command — restart is the lifecycle, validated by simulating a stale
# workspace and confirming `status` reports it as failed/queued, not running).
tasks:
  # test builds core-agent, seeds a db.duckdb store with the state a killed
  # process would leave behind (one queued workspace, one "running" worker
  # whose PID is dead), runs `core-agent status` in a fresh process, and
  # asserts that the queued workspace is restored and the dead worker is
  # reported and persisted as failed.
  test:
    cmds:
      - |
        bash <<'EOF'
        set -euo pipefail
        source ../_lib/run.sh

        go build -trimpath -ldflags="-s -w" -o bin/core-agent ../../../cmd/core-agent

        # Use an isolated CORE_WORKSPACE so the test cannot disturb the
        # operator's own .core/ directory.
        workspace="$(mktemp -d)"
        export CORE_WORKSPACE="$workspace"
        export CORE_HOME="$workspace"
        export DIR_HOME="$workspace"

        # Simulate the persisted state a killed process leaves behind:
        # one queued workspace with only db.duckdb state and one dead
        # running worker that must be reaped to failed on restart.
        queued_dir="$workspace/workspace/core/go-io/task-queued"
        dead_dir="$workspace/workspace/core/go-io/task-dead"
        mkdir -p "$queued_dir" "$dead_dir"

        # Spawn and reap a throwaway child so the seeded "running" worker
        # carries a PID that has definitely exited. A fixed number such as
        # 99999 can belong to a live process on hosts with a large pid_max,
        # which would make the reaping assertions below flaky.
        true &
        dead_pid=$!
        wait "$dead_pid"

        seed="$workspace/seed-restart-state.go"
        cat >"$seed" <<'GO'
        package main

        import (
            "fmt"
            "os"
            "strconv"

            store "dappco.re/go/store"
        )

        // main writes the pre-restart fixture into the store at os.Args[1].
        // os.Args[2] is the PID recorded for the dead "running" worker.
        func main() {
            if len(os.Args) != 3 {
                panic("usage: seed-restart-state <db-path> <dead-pid>")
            }
            deadPID, err := strconv.Atoi(os.Args[2])
            if err != nil {
                panic(err)
            }
            st, err := store.New(os.Args[1])
            if err != nil {
                panic(err)
            }
            defer st.Close()
            must := func(err error) {
                if err != nil {
                    panic(err)
                }
            }
            must(st.Set("queue", "core/go-io/task-queued", `{"repo":"go-io","org":"core","task":"restart-queued","branch":"agent/restart-queued","agent":"codex:gpt-5.4","status":"queued","queued_at":"2026-04-14T12:00:00Z"}`))
            must(st.Set("registry", "core/go-io/task-queued", `{"status":"queued","agent":"codex:gpt-5.4","repo":"go-io","org":"core","task":"restart-queued","branch":"agent/restart-queued","started_at":"2026-04-14T12:00:00Z","updated_at":"2026-04-14T12:00:00Z","runs":0}`))
            must(st.Set("registry", "core/go-io/task-dead", fmt.Sprintf(`{"status":"running","agent":"codex:gpt-5.4","repo":"go-io","org":"core","task":"restart-dead","branch":"agent/restart-dead","pid":%d,"process_id":"dead-proc","started_at":"2026-04-14T12:00:00Z","updated_at":"2026-04-14T12:00:00Z","runs":1}`, deadPID)))
            must(st.Set("concurrency", "codex", `{"running":1,"tracked":2,"snapshot_at":"2026-04-14T12:00:00Z"}`))
        }
        GO
        go run "$seed" "$workspace/db.duckdb" "$dead_pid"

        # Restart into a fresh process. Startup must restore the queued
        # workspace from db.duckdb and reap the dead worker to failed.
        output="$(mktemp)"
        run_capture_all 0 "$output" ./bin/core-agent status
        assert_contains "core/go-io/task-queued" "$output"
        assert_contains "core/go-io/task-dead" "$output"

        # The dot in the agent name is escaped so it only matches a literal ".".
        if ! grep -E 'queued[[:space:]]+codex:gpt-5\.4[[:space:]]+go-io[[:space:]]+core/go-io/task-queued' "$output"; then
          printf 'expected queued workspace restored from db.duckdb\n' >&2
          cat "$output" >&2
          exit 1
        fi

        # The queued-row pattern above shows the state column is printed
        # before the workspace name, so look for "running" anywhere on the
        # dead worker's row rather than only after the workspace name.
        if grep -F 'core/go-io/task-dead' "$output" | grep -E '\brunning\b'; then
          printf 'expected dead worker reaped on restart, still reports running\n' >&2
          cat "$output" >&2
          exit 1
        fi

        dead_status="$dead_dir/status.json"
        queued_status="$queued_dir/status.json"
        if ! grep -q '"status":[[:space:]]*"failed"' "$dead_status"; then
          printf 'expected failed status persisted to %s\n' "$dead_status" >&2
          cat "$dead_status" >&2
          exit 1
        fi
        if ! grep -q '"question":[[:space:]]*"dead worker on restart"' "$dead_status"; then
          printf 'expected dead-worker question persisted to %s\n' "$dead_status" >&2
          cat "$dead_status" >&2
          exit 1
        fi
        if ! grep -q '"status":[[:space:]]*"queued"' "$queued_status"; then
          printf 'expected queued status restored to %s\n' "$queued_status" >&2
          cat "$queued_status" >&2
          exit 1
        fi

        # Every assertion passed: drop the scratch state. Failure paths exit
        # before this line, leaving the workspace behind for inspection.
        rm -rf "$workspace" "$output"
        EOF