From a1db312c7b28bb763b0a99d0f02b580fad538861 Mon Sep 17 00:00:00 2001 From: Virgil Date: Mon, 30 Mar 2026 18:59:41 +0000 Subject: [PATCH] fix(ax): surface atomic write failures Co-Authored-By: Virgil --- pkg/agentic/review_queue.go | 12 ++++- pkg/agentic/review_queue_extra_test.go | 72 ++++++++++++++++---------- pkg/agentic/status.go | 2 + pkg/monitor/harvest.go | 13 ++++- pkg/monitor/harvest_test.go | 6 +++ 5 files changed, 74 insertions(+), 31 deletions(-) diff --git a/pkg/agentic/review_queue.go b/pkg/agentic/review_queue.go index dee21ef..34b782e 100644 --- a/pkg/agentic/review_queue.go +++ b/pkg/agentic/review_queue.go @@ -369,10 +369,18 @@ func (s *PrepSubsystem) storeReviewOutput(repoDir, repo, reviewer, output string core.WriteAll(r.Value, core.Concat(jsonLine, "\n")) } -// saveRateLimitState persists rate limit info for cross-run awareness. +// saveRateLimitState writes the current rate-limit snapshot. +// +// s.saveRateLimitState(&RateLimitInfo{Limited: true, RetryAt: time.Now().Add(30 * time.Minute)}) func (s *PrepSubsystem) saveRateLimitState(info *RateLimitInfo) { path := core.JoinPath(HomeDir(), ".core", "coderabbit-ratelimit.json") - fs.WriteAtomic(path, core.JSONMarshalString(info)) + if r := fs.WriteAtomic(path, core.JSONMarshalString(info)); !r.OK { + if err, ok := r.Value.(error); ok { + core.Warn("reviewQueue: failed to persist rate limit state", "path", path, "reason", err) + return + } + core.Warn("reviewQueue: failed to persist rate limit state", "path", path) + } } // loadRateLimitState reads persisted rate limit info. diff --git a/pkg/agentic/review_queue_extra_test.go b/pkg/agentic/review_queue_extra_test.go index 101cd64..948f517 100644 --- a/pkg/agentic/review_queue_extra_test.go +++ b/pkg/agentic/review_queue_extra_test.go @@ -53,8 +53,8 @@ func TestReviewqueue_SaveLoadRateLimitState_Good_Roundtrip(t *testing.T) { // but save/load use DIR_HOME. Skip if not writable. s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } info := &RateLimitInfo{ @@ -79,8 +79,8 @@ func TestReviewqueue_StoreReviewOutput_Good(t *testing.T) { // but we can verify it doesn't panic s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } assert.NotPanics(t, func() { s.storeReviewOutput(t.TempDir(), "test-repo", "coderabbit", "No findings — LGTM") @@ -99,9 +99,9 @@ func TestReviewqueue_NoCandidates_Good(t *testing.T) { s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - codePath: root, - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + codePath: root, + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } _, out, err := s.reviewQueue(context.Background(), nil, ReviewQueueInput{DryRun: true}) @@ -135,8 +135,8 @@ func TestReviewqueue_StatusFiltered_Good(t *testing.T) { s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } _, out, err := s.status(context.Background(), nil, StatusInput{}) @@ -229,8 +229,8 @@ func TestReviewqueue_LoadRateLimitState_Ugly(t *testing.T) { s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } result := s.loadRateLimitState() @@ -243,8 +243,8 @@ func TestReviewqueue_BuildReviewCommand_Bad(t *testing.T) { // Empty reviewer string — defaults to coderabbit s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } cmd, args := s.buildReviewCommand("/tmp/repo", "") assert.Equal(t, "coderabbit", cmd) @@ -289,8 +289,8 @@ func TestReviewqueue_StoreReviewOutput_Bad(t *testing.T) { // Empty output s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } assert.NotPanics(t, func() { s.storeReviewOutput(t.TempDir(), "test-repo", "coderabbit", "") @@ -301,8 +301,8 @@ func TestReviewqueue_StoreReviewOutput_Ugly(t *testing.T) { // Very large output s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } largeOutput := strings.Repeat("Finding: something is wrong on this line\n", 10000) assert.NotPanics(t, func() { @@ -315,8 +315,8 @@ func TestReviewqueue_StoreReviewOutput_Ugly(t *testing.T) { func TestReviewqueue_SaveRateLimitState_Good(t *testing.T) { s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } info := &RateLimitInfo{ @@ -333,20 +333,38 @@ func TestReviewqueue_SaveRateLimitState_Bad(t *testing.T) { // Save nil info s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } assert.NotPanics(t, func() { s.saveRateLimitState(nil) }) } +func TestReviewqueue_SaveRateLimitState_Bad_WriteFailure(t *testing.T) { + t.Setenv("CORE_HOME", "/dev/null") + + s := &PrepSubsystem{ + ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), + } + info := &RateLimitInfo{ + Limited: true, + RetryAt: time.Now().Add(5 * time.Minute).Truncate(time.Second), + Message: "write failure", + } + assert.NotPanics(t, func() { + s.saveRateLimitState(info) + }) +} + func TestReviewqueue_SaveRateLimitState_Ugly(t *testing.T) { // Save with far-future RetryAt s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } info := &RateLimitInfo{ Limited: true, @@ -364,8 +382,8 @@ func TestReviewqueue_LoadRateLimitState_Good(t *testing.T) { // Write then load valid state s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } info := &RateLimitInfo{ @@ -389,8 +407,8 @@ func TestReviewqueue_LoadRateLimitState_Bad(t *testing.T) { // File doesn't exist — should return nil s := &PrepSubsystem{ ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}), - backoff: make(map[string]time.Time), - failCount: make(map[string]int), + backoff: make(map[string]time.Time), + failCount: make(map[string]int), } // loadRateLimitState reads from DIR_HOME/.core/coderabbit-ratelimit.json. diff --git a/pkg/agentic/status.go b/pkg/agentic/status.go index df8ac7b..30df84f 100644 --- a/pkg/agentic/status.go +++ b/pkg/agentic/status.go @@ -81,8 +81,10 @@ func writeStatusResult(wsDir string, status *WorkspaceStatus) core.Result { if r := fs.WriteAtomic(statusPath, core.JSONMarshalString(status)); !r.OK { err, _ := r.Value.(error) if err == nil { + core.Warn("agentic.writeStatus: failed to write status", "path", statusPath) return core.Result{Value: core.E("writeStatus", "failed to write status", nil), OK: false} } + core.Warn("agentic.writeStatus: failed to write status", "path", statusPath, "reason", err) return core.Result{Value: core.E("writeStatus", "failed to write status", err), OK: false} } return core.Result{OK: true} diff --git a/pkg/monitor/harvest.go b/pkg/monitor/harvest.go index 05de6d3..7d3ea11 100644 --- a/pkg/monitor/harvest.go +++ b/pkg/monitor/harvest.go @@ -245,7 +245,9 @@ func (m *Subsystem) pushBranch(srcDir, branch string) error { return nil } -// updateStatus updates the workspace status.json. +// updateStatus rewrites status.json after a harvest decision. +// +// updateStatus(wsDir, "ready-for-review", "") func updateStatus(wsDir, status, question string) { r := fs.Read(agentic.WorkspaceStatusPath(wsDir)) if !r.OK { @@ -265,5 +267,12 @@ func updateStatus(wsDir, status, question string) { } else { delete(st, "question") // clear stale question from previous state } - fs.WriteAtomic(agentic.WorkspaceStatusPath(wsDir), core.JSONMarshalString(st)) + statusPath := agentic.WorkspaceStatusPath(wsDir) + if r := fs.WriteAtomic(statusPath, core.JSONMarshalString(st)); !r.OK { + if err, ok := r.Value.(error); ok { + core.Warn("monitor.updateStatus: failed to write status", "path", statusPath, "reason", err) + return + } + core.Warn("monitor.updateStatus: failed to write status", "path", statusPath) + } } diff --git a/pkg/monitor/harvest_test.go b/pkg/monitor/harvest_test.go index f6ddd2a..9162314 100644 --- a/pkg/monitor/harvest_test.go +++ b/pkg/monitor/harvest_test.go @@ -127,6 +127,12 @@ func TestHarvest_CheckSafety_Bad_LargeFile(t *testing.T) { assert.Contains(t, reason, "huge.txt") } +func TestHarvest_UpdateStatus_Bad_WriteFailure(t *testing.T) { + assert.NotPanics(t, func() { + updateStatus("/dev/null/impossible", "ready-for-review", "") + }) +} + func TestHarvest_HarvestWorkspace_Good(t *testing.T) { _, wsDir := initTestRepo(t) writeStatus(t, wsDir, "completed", "test-repo", "agent/test-task")