From a1db312c7b28bb763b0a99d0f02b580fad538861 Mon Sep 17 00:00:00 2001
From: Virgil <virgil@lethean.io>
Date: Mon, 30 Mar 2026 18:59:41 +0000
Subject: [PATCH] fix(ax): surface atomic write failures

Co-Authored-By: Virgil <virgil@lethean.io>
---
 pkg/agentic/review_queue.go            | 12 ++++-
 pkg/agentic/review_queue_extra_test.go | 72 ++++++++++++++++----------
 pkg/agentic/status.go                  |  2 +
 pkg/monitor/harvest.go                 | 13 ++++-
 pkg/monitor/harvest_test.go            |  6 +++
 5 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/pkg/agentic/review_queue.go b/pkg/agentic/review_queue.go
index dee21ef..34b782e 100644
--- a/pkg/agentic/review_queue.go
+++ b/pkg/agentic/review_queue.go
@@ -369,10 +369,18 @@ func (s *PrepSubsystem) storeReviewOutput(repoDir, repo, reviewer, output string
 	core.WriteAll(r.Value, core.Concat(jsonLine, "\n"))
 }
 
-// saveRateLimitState persists rate limit info for cross-run awareness.
+// saveRateLimitState writes the current rate-limit snapshot.
+//
+//	s.saveRateLimitState(&RateLimitInfo{Limited: true, RetryAt: time.Now().Add(30 * time.Minute)})
 func (s *PrepSubsystem) saveRateLimitState(info *RateLimitInfo) {
 	path := core.JoinPath(HomeDir(), ".core", "coderabbit-ratelimit.json")
-	fs.WriteAtomic(path, core.JSONMarshalString(info))
+	if r := fs.WriteAtomic(path, core.JSONMarshalString(info)); !r.OK {
+		if err, ok := r.Value.(error); ok {
+			core.Warn("reviewQueue: failed to persist rate limit state", "path", path, "reason", err)
+			return
+		}
+		core.Warn("reviewQueue: failed to persist rate limit state", "path", path)
+	}
 }
 
 // loadRateLimitState reads persisted rate limit info.
diff --git a/pkg/agentic/review_queue_extra_test.go b/pkg/agentic/review_queue_extra_test.go
index 101cd64..948f517 100644
--- a/pkg/agentic/review_queue_extra_test.go
+++ b/pkg/agentic/review_queue_extra_test.go
@@ -53,8 +53,8 @@ func TestReviewqueue_SaveLoadRateLimitState_Good_Roundtrip(t *testing.T) {
 	// but save/load use DIR_HOME. Skip if not writable.
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	info := &RateLimitInfo{
@@ -79,8 +79,8 @@ func TestReviewqueue_StoreReviewOutput_Good(t *testing.T) {
 	// but we can verify it doesn't panic
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 	assert.NotPanics(t, func() {
 		s.storeReviewOutput(t.TempDir(), "test-repo", "coderabbit", "No findings — LGTM")
@@ -99,9 +99,9 @@ func TestReviewqueue_NoCandidates_Good(t *testing.T) {
 
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		codePath:  root,
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		codePath:       root,
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	_, out, err := s.reviewQueue(context.Background(), nil, ReviewQueueInput{DryRun: true})
@@ -135,8 +135,8 @@ func TestReviewqueue_StatusFiltered_Good(t *testing.T) {
 
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	_, out, err := s.status(context.Background(), nil, StatusInput{})
@@ -229,8 +229,8 @@ func TestReviewqueue_LoadRateLimitState_Ugly(t *testing.T) {
 
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	result := s.loadRateLimitState()
@@ -243,8 +243,8 @@ func TestReviewqueue_BuildReviewCommand_Bad(t *testing.T) {
 	// Empty reviewer string — defaults to coderabbit
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 	cmd, args := s.buildReviewCommand("/tmp/repo", "")
 	assert.Equal(t, "coderabbit", cmd)
@@ -289,8 +289,8 @@ func TestReviewqueue_StoreReviewOutput_Bad(t *testing.T) {
 	// Empty output
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 	assert.NotPanics(t, func() {
 		s.storeReviewOutput(t.TempDir(), "test-repo", "coderabbit", "")
@@ -301,8 +301,8 @@ func TestReviewqueue_StoreReviewOutput_Ugly(t *testing.T) {
 	// Very large output
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 	largeOutput := strings.Repeat("Finding: something is wrong on this line\n", 10000)
 	assert.NotPanics(t, func() {
@@ -315,8 +315,8 @@ func TestReviewqueue_StoreReviewOutput_Ugly(t *testing.T) {
 func TestReviewqueue_SaveRateLimitState_Good(t *testing.T) {
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	info := &RateLimitInfo{
@@ -333,20 +333,38 @@ func TestReviewqueue_SaveRateLimitState_Bad(t *testing.T) {
 	// Save nil info
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 	assert.NotPanics(t, func() {
 		s.saveRateLimitState(nil)
 	})
 }
 
+func TestReviewqueue_SaveRateLimitState_Bad_WriteFailure(t *testing.T) {
+	t.Setenv("CORE_HOME", "/dev/null")
+
+	s := &PrepSubsystem{
+		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
+	}
+	info := &RateLimitInfo{
+		Limited: true,
+		RetryAt: time.Now().Add(5 * time.Minute).Truncate(time.Second),
+		Message: "write failure",
+	}
+	assert.NotPanics(t, func() {
+		s.saveRateLimitState(info)
+	})
+}
+
 func TestReviewqueue_SaveRateLimitState_Ugly(t *testing.T) {
 	// Save with far-future RetryAt
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 	info := &RateLimitInfo{
 		Limited: true,
@@ -364,8 +382,8 @@ func TestReviewqueue_LoadRateLimitState_Good(t *testing.T) {
 	// Write then load valid state
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	info := &RateLimitInfo{
@@ -389,8 +407,8 @@ func TestReviewqueue_LoadRateLimitState_Bad(t *testing.T) {
 	// File doesn't exist — should return nil
 	s := &PrepSubsystem{
 		ServiceRuntime: core.NewServiceRuntime(testCore, AgentOptions{}),
-		backoff:   make(map[string]time.Time),
-		failCount: make(map[string]int),
+		backoff:        make(map[string]time.Time),
+		failCount:      make(map[string]int),
 	}
 
 	// loadRateLimitState reads from DIR_HOME/.core/coderabbit-ratelimit.json.
diff --git a/pkg/agentic/status.go b/pkg/agentic/status.go
index df8ac7b..30df84f 100644
--- a/pkg/agentic/status.go
+++ b/pkg/agentic/status.go
@@ -81,8 +81,10 @@ func writeStatusResult(wsDir string, status *WorkspaceStatus) core.Result {
 	if r := fs.WriteAtomic(statusPath, core.JSONMarshalString(status)); !r.OK {
 		err, _ := r.Value.(error)
 		if err == nil {
+			core.Warn("agentic.writeStatus: failed to write status", "path", statusPath)
 			return core.Result{Value: core.E("writeStatus", "failed to write status", nil), OK: false}
 		}
+		core.Warn("agentic.writeStatus: failed to write status", "path", statusPath, "reason", err)
 		return core.Result{Value: core.E("writeStatus", "failed to write status", err), OK: false}
 	}
 	return core.Result{OK: true}
diff --git a/pkg/monitor/harvest.go b/pkg/monitor/harvest.go
index 05de6d3..7d3ea11 100644
--- a/pkg/monitor/harvest.go
+++ b/pkg/monitor/harvest.go
@@ -245,7 +245,9 @@ func (m *Subsystem) pushBranch(srcDir, branch string) error {
 	return nil
 }
 
-// updateStatus updates the workspace status.json.
+// updateStatus rewrites status.json after a harvest decision.
+//
+//	updateStatus(wsDir, "ready-for-review", "")
 func updateStatus(wsDir, status, question string) {
 	r := fs.Read(agentic.WorkspaceStatusPath(wsDir))
 	if !r.OK {
@@ -265,5 +267,12 @@ func updateStatus(wsDir, status, question string) {
 	} else {
 		delete(st, "question") // clear stale question from previous state
 	}
-	fs.WriteAtomic(agentic.WorkspaceStatusPath(wsDir), core.JSONMarshalString(st))
+	statusPath := agentic.WorkspaceStatusPath(wsDir)
+	if r := fs.WriteAtomic(statusPath, core.JSONMarshalString(st)); !r.OK {
+		if err, ok := r.Value.(error); ok {
+			core.Warn("monitor.updateStatus: failed to write status", "path", statusPath, "reason", err)
+			return
+		}
+		core.Warn("monitor.updateStatus: failed to write status", "path", statusPath)
+	}
 }
diff --git a/pkg/monitor/harvest_test.go b/pkg/monitor/harvest_test.go
index f6ddd2a..9162314 100644
--- a/pkg/monitor/harvest_test.go
+++ b/pkg/monitor/harvest_test.go
@@ -127,6 +127,12 @@ func TestHarvest_CheckSafety_Bad_LargeFile(t *testing.T) {
 	assert.Contains(t, reason, "huge.txt")
 }
 
+func TestHarvest_UpdateStatus_Bad_WriteFailure(t *testing.T) {
+	assert.NotPanics(t, func() {
+		updateStatus("/dev/null/impossible", "ready-for-review", "")
+	})
+}
+
 func TestHarvest_HarvestWorkspace_Good(t *testing.T) {
 	_, wsDir := initTestRepo(t)
 	writeStatus(t, wsDir, "completed", "test-repo", "agent/test-task")