From 63c2ac96cdc2e2b61bb23813a5cc0276eada0af5 Mon Sep 17 00:00:00 2001 From: Curtis 'Fjord' Hawthorne Date: Tue, 24 Feb 2026 17:12:02 -0800 Subject: [PATCH] fix(js_repl): surface uncaught kernel errors and reset cleanly (#12636) ## Summary Improve `js_repl` behavior when the Node kernel hits a process-level failure (for example, an uncaught exception or unhandled Promise rejection). Instead of only surfacing a generic `js_repl kernel exited unexpectedly` after stdout EOF, `js_repl` now returns a clearer exec error for the active request, then resets the kernel cleanly. ## Why Some sandbox-denied operations can trigger Node errors that become process-level failures (for example, an unhandled EventEmitter `'error'` event). In that case: - the kernel process exits, - the host sees stdout EOF, - the user gets a generic kernel-exit error, - and the next request can briefly race with stale kernel state. This change improves that failure mode without monkeypatching Node APIs. ## Changes ### Kernel-side (`js_repl` Node process) - Add process-level handlers for: - `uncaughtException` - `unhandledRejection` - When one of these fires: - best-effort emit a normal `exec_result` error for the active exec - include actionable guidance to catch/handle async errors (including Promise rejections and EventEmitter `'error'` events) - exit intentionally so the host can reset/restart the kernel ### Host-side (`JsReplManager`) - Clear dead kernel state as soon as the stdout reader observes unexpected kernel exit/EOF. - This lets the next `js_repl` exec start a fresh kernel instead of hitting a stale broken-pipe path. ### Tests - Add regression coverage for: - uncaught async exception -> exec error + kernel recovery on next exec - Update forced-kernel-exit test to validate recovery behavior (next exec restarts cleanly) ## Impact - Better user-facing error for kernel crashes caused by uncaught/unhandled async failures. - Cleaner recovery behavior after kernel exit. ## Validation - `cargo test -p codex-core --lib tools::js_repl::tests::js_repl_uncaught_exception_returns_exec_error_and_recovers -- --exact` - `cargo test -p codex-core --lib tools::js_repl::tests::js_repl_forced_kernel_exit_recovers_on_next_exec -- --exact` - `just fmt` --- codex-rs/core/src/tools/js_repl/kernel.js | 63 ++++++++++ codex-rs/core/src/tools/js_repl/mod.rs | 141 ++++++++++++++++++++-- 2 files changed, 194 insertions(+), 10 deletions(-) diff --git a/codex-rs/core/src/tools/js_repl/kernel.js b/codex-rs/core/src/tools/js_repl/kernel.js index 283a9f451..f5459fde7 100644 --- a/codex-rs/core/src/tools/js_repl/kernel.js +++ b/codex-rs/core/src/tools/js_repl/kernel.js @@ -84,6 +84,8 @@ let previousModule = null; /** @type {Binding[]} */ let previousBindings = []; let cellCounter = 0; +let activeExecId = null; +let fatalExitScheduled = false; const builtinModuleSet = new Set([ ...builtinModules, @@ -395,6 +397,54 @@ function send(message) { process.stdout.write("\n"); } +function formatErrorMessage(error) { + if (error && typeof error === "object" && "message" in error) { + return error.message ? String(error.message) : String(error); + } + return String(error); +} + +function sendFatalExecResultSync(kind, error) { + if (!activeExecId) { + return; + } + const payload = { + type: "exec_result", + id: activeExecId, + ok: false, + output: "", + error: `js_repl kernel ${kind}: ${formatErrorMessage(error)}; kernel reset. Catch or handle async errors (including Promise rejections and EventEmitter 'error' events) to avoid kernel termination.`, + }; + try { + fs.writeSync(process.stdout.fd, `${JSON.stringify(payload)}\n`); + } catch { + // Best effort only; the host will still surface stdout EOF diagnostics. + } +} + +function scheduleFatalExit(kind, error) { + if (fatalExitScheduled) { + process.exitCode = 1; + return; + } + fatalExitScheduled = true; + sendFatalExecResultSync(kind, error); + + try { + fs.writeSync( + process.stderr.fd, + `js_repl kernel ${kind}: ${formatErrorMessage(error)}\n`, + ); + } catch { + // ignore + } + + // The host will observe stdout EOF, reset kernel state, and restart on demand. + setImmediate(() => { + process.exit(1); + }); +} + function formatLog(args) { return args .map((arg) => @@ -431,6 +481,7 @@ function withCapturedConsole(ctx, fn) { } async function handleExec(message) { + activeExecId = message.id; const tool = (toolName, args) => { if (typeof toolName !== "string" || !toolName) { return Promise.reject(new Error("codex.tool expects a tool name string")); @@ -527,6 +578,10 @@ async function handleExec(message) { output: "", error: error && error.message ? error.message : String(error), }); + } finally { + if (activeExecId === message.id) { + activeExecId = null; + } } } @@ -540,6 +595,14 @@ function handleToolResult(message) { let queue = Promise.resolve(); +process.on("uncaughtException", (error) => { + scheduleFatalExit("uncaught exception", error); +}); + +process.on("unhandledRejection", (reason) => { + scheduleFatalExit("unhandled rejection", reason); +}); + const input = createInterface({ input: process.stdin, crlfDelay: Infinity }); input.on("line", (line) => { if (!line.trim()) { diff --git a/codex-rs/core/src/tools/js_repl/mod.rs b/codex-rs/core/src/tools/js_repl/mod.rs index 6e6bb6228..a5bd81049 100644 --- a/codex-rs/core/src/tools/js_repl/mod.rs +++ b/codex-rs/core/src/tools/js_repl/mod.rs @@ -269,7 +269,7 @@ pub struct JsReplManager { node_path: Option, node_module_dirs: Vec, tmp_dir: tempfile::TempDir, - kernel: Mutex>, + kernel: Arc>>, exec_lock: Arc, exec_tool_calls: Arc>>, } @@ -287,7 +287,7 @@ impl JsReplManager { node_path, node_module_dirs, tmp_dir, - kernel: Mutex::new(None), + kernel: Arc::new(Mutex::new(None)), exec_lock: Arc::new(tokio::sync::Semaphore::new(1)), exec_tool_calls: Arc::new(Mutex::new(HashMap::new())), }); @@ -669,6 +669,7 @@ impl JsReplManager { tokio::spawn(Self::read_stdout( stdout, Arc::clone(&child), + Arc::clone(&self.kernel), Arc::clone(&recent_stderr), Arc::clone(&pending_execs), Arc::clone(&exec_contexts), @@ -811,6 +812,7 @@ impl JsReplManager { async fn read_stdout( stdout: tokio::process::ChildStdout, child: Arc>, + manager_kernel: Arc>>, recent_stderr: Arc>>, pending_execs: Arc>>>, exec_contexts: Arc>>, @@ -964,6 +966,16 @@ impl JsReplManager { .clone() .unwrap_or_else(|| "js_repl kernel exited unexpectedly".to_string()); + { + let mut kernel = manager_kernel.lock().await; + let should_clear = kernel + .as_ref() + .is_some_and(|state| Arc::ptr_eq(&state.child, &child)); + if should_clear { + kernel.take(); + } + } + let mut pending = pending_execs.lock().await; let pending_exec_ids = pending.keys().cloned().collect::>(); for (_id, tx) in pending.drain() { @@ -1743,7 +1755,7 @@ mod tests { } #[tokio::test] - async fn js_repl_kernel_failure_includes_model_diagnostics() -> anyhow::Result<()> { + async fn js_repl_forced_kernel_exit_recovers_on_next_exec() -> anyhow::Result<()> { if !can_run_js_repl_runtime_tests().await { return Ok(()); } @@ -1772,9 +1784,24 @@ mod tests { Arc::clone(&state.child) }; JsReplManager::kill_kernel_child(&child, "test_crash").await; - tokio::time::sleep(Duration::from_millis(50)).await; + tokio::time::timeout(Duration::from_secs(1), async { + loop { + let cleared = { + let guard = manager.kernel.lock().await; + guard + .as_ref() + .is_none_or(|state| !Arc::ptr_eq(&state.child, &child)) + }; + if cleared { + return; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + }) + .await + .expect("host should clear dead kernel state promptly"); - let err = manager + let result = manager .execute( session, turn, @@ -1784,13 +1811,107 @@ mod tests { timeout_ms: Some(10_000), }, ) - .await - .expect_err("expected kernel failure after forced kill"); + .await?; + assert!(result.output.contains("after-kill")); + Ok(()) + } + + #[tokio::test] + async fn js_repl_uncaught_exception_returns_exec_error_and_recovers() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = crate::codex::make_session_and_context().await; + let session = Arc::new(session); + let turn = Arc::new(turn); + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + + manager + .execute( + Arc::clone(&session), + Arc::clone(&turn), + Arc::clone(&tracker), + JsReplArgs { + code: "console.log('warmup');".to_string(), + timeout_ms: Some(10_000), + }, + ) + .await?; + + let child = { + let guard = manager.kernel.lock().await; + let state = guard.as_ref().expect("kernel should exist after warmup"); + Arc::clone(&state.child) + }; + + let err = tokio::time::timeout( + Duration::from_secs(3), + manager.execute( + Arc::clone(&session), + Arc::clone(&turn), + Arc::clone(&tracker), + JsReplArgs { + code: "setTimeout(() => { throw new Error('boom'); }, 0);\nawait new Promise(() => {});".to_string(), + timeout_ms: Some(10_000), + }, + ), + ) + .await + .expect("uncaught exception should fail promptly") + .expect_err("expected uncaught exception to fail the exec"); let message = err.to_string(); - assert!(message.contains("js_repl diagnostics:")); - assert!(message.contains("\"reason\":\"write_failed\"")); - assert!(message.contains("\"kernel_status\":\"exited(")); + assert!(message.contains("js_repl kernel uncaught exception: boom")); + assert!(message.contains("kernel reset.")); + assert!(message.contains("Catch or handle async errors")); + assert!(!message.contains("js_repl kernel exited unexpectedly")); + + tokio::time::timeout(Duration::from_secs(1), async { + loop { + let exited = { + let mut child = child.lock().await; + child.try_wait()?.is_some() + }; + if exited { + return Ok::<(), anyhow::Error>(()); + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + }) + .await + .expect("uncaught exception should terminate the previous kernel process")?; + + tokio::time::timeout(Duration::from_secs(1), async { + loop { + let cleared = { + let guard = manager.kernel.lock().await; + guard + .as_ref() + .is_none_or(|state| !Arc::ptr_eq(&state.child, &child)) + }; + if cleared { + return; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + }) + .await + .expect("host should clear dead kernel state promptly"); + + let next = manager + .execute( + session, + turn, + tracker, + JsReplArgs { + code: "console.log('after reset');".to_string(), + timeout_ms: Some(10_000), + }, + ) + .await?; + assert!(next.output.contains("after reset")); Ok(()) }