fix(js_repl): surface uncaught kernel errors and reset cleanly (#12636)

## Summary

Improve `js_repl` behavior when the Node kernel hits a process-level
failure (for example, an uncaught exception or unhandled Promise
rejection).

Instead of only surfacing a generic `js_repl kernel exited unexpectedly`
after stdout EOF, `js_repl` now returns a clearer exec error for the
active request, then resets the kernel cleanly.

## Why

Some sandbox-denied operations can trigger Node errors that become
process-level failures (for example, an unhandled EventEmitter `'error'`
event). In that case:

- the kernel process exits,
- the host sees stdout EOF,
- the user gets a generic kernel-exit error,
- and the next request can briefly race with stale kernel state.

This change improves that failure mode without monkeypatching Node APIs.

## Changes

### Kernel-side (`js_repl` Node process)
- Add process-level handlers for:
  - `uncaughtException`
  - `unhandledRejection`
- When one of these fires:
  - best-effort emit a normal `exec_result` error for the active exec
- include actionable guidance to catch/handle async errors (including
Promise rejections and EventEmitter `'error'` events)
  - exit intentionally so the host can reset/restart the kernel

### Host-side (`JsReplManager`)
- Clear dead kernel state as soon as the stdout reader observes
unexpected kernel exit/EOF.
- This lets the next `js_repl` exec start a fresh kernel instead of
hitting a stale broken-pipe path.

### Tests
- Add regression coverage for:
- uncaught async exception -> exec error + kernel recovery on next exec
- Update forced-kernel-exit test to validate recovery behavior (next
exec restarts cleanly)

## Impact

- Better user-facing error for kernel crashes caused by
uncaught/unhandled async failures.
- Cleaner recovery behavior after kernel exit.

## Validation

- `cargo test -p codex-core --lib
tools::js_repl::tests::js_repl_uncaught_exception_returns_exec_error_and_recovers
-- --exact`
- `cargo test -p codex-core --lib
tools::js_repl::tests::js_repl_forced_kernel_exit_recovers_on_next_exec
-- --exact`
- `just fmt`
This commit is contained in:
Curtis 'Fjord' Hawthorne 2026-02-24 17:12:02 -08:00 committed by GitHub
parent 5163850025
commit 63c2ac96cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 194 additions and 10 deletions

View file

@ -84,6 +84,8 @@ let previousModule = null;
/** @type {Binding[]} */
let previousBindings = [];
let cellCounter = 0;
let activeExecId = null;
let fatalExitScheduled = false;
const builtinModuleSet = new Set([
...builtinModules,
@ -395,6 +397,54 @@ function send(message) {
process.stdout.write("\n");
}
function formatErrorMessage(error) {
if (error && typeof error === "object" && "message" in error) {
return error.message ? String(error.message) : String(error);
}
return String(error);
}
function sendFatalExecResultSync(kind, error) {
if (!activeExecId) {
return;
}
const payload = {
type: "exec_result",
id: activeExecId,
ok: false,
output: "",
error: `js_repl kernel ${kind}: ${formatErrorMessage(error)}; kernel reset. Catch or handle async errors (including Promise rejections and EventEmitter 'error' events) to avoid kernel termination.`,
};
try {
fs.writeSync(process.stdout.fd, `${JSON.stringify(payload)}\n`);
} catch {
// Best effort only; the host will still surface stdout EOF diagnostics.
}
}
function scheduleFatalExit(kind, error) {
if (fatalExitScheduled) {
process.exitCode = 1;
return;
}
fatalExitScheduled = true;
sendFatalExecResultSync(kind, error);
try {
fs.writeSync(
process.stderr.fd,
`js_repl kernel ${kind}: ${formatErrorMessage(error)}\n`,
);
} catch {
// ignore
}
// The host will observe stdout EOF, reset kernel state, and restart on demand.
setImmediate(() => {
process.exit(1);
});
}
function formatLog(args) {
return args
.map((arg) =>
@ -431,6 +481,7 @@ function withCapturedConsole(ctx, fn) {
}
async function handleExec(message) {
activeExecId = message.id;
const tool = (toolName, args) => {
if (typeof toolName !== "string" || !toolName) {
return Promise.reject(new Error("codex.tool expects a tool name string"));
@ -527,6 +578,10 @@ async function handleExec(message) {
output: "",
error: error && error.message ? error.message : String(error),
});
} finally {
if (activeExecId === message.id) {
activeExecId = null;
}
}
}
@ -540,6 +595,14 @@ function handleToolResult(message) {
let queue = Promise.resolve();
process.on("uncaughtException", (error) => {
scheduleFatalExit("uncaught exception", error);
});
process.on("unhandledRejection", (reason) => {
scheduleFatalExit("unhandled rejection", reason);
});
const input = createInterface({ input: process.stdin, crlfDelay: Infinity });
input.on("line", (line) => {
if (!line.trim()) {

View file

@ -269,7 +269,7 @@ pub struct JsReplManager {
node_path: Option<PathBuf>,
node_module_dirs: Vec<PathBuf>,
tmp_dir: tempfile::TempDir,
kernel: Mutex<Option<KernelState>>,
kernel: Arc<Mutex<Option<KernelState>>>,
exec_lock: Arc<tokio::sync::Semaphore>,
exec_tool_calls: Arc<Mutex<HashMap<String, ExecToolCalls>>>,
}
@ -287,7 +287,7 @@ impl JsReplManager {
node_path,
node_module_dirs,
tmp_dir,
kernel: Mutex::new(None),
kernel: Arc::new(Mutex::new(None)),
exec_lock: Arc::new(tokio::sync::Semaphore::new(1)),
exec_tool_calls: Arc::new(Mutex::new(HashMap::new())),
});
@ -669,6 +669,7 @@ impl JsReplManager {
tokio::spawn(Self::read_stdout(
stdout,
Arc::clone(&child),
Arc::clone(&self.kernel),
Arc::clone(&recent_stderr),
Arc::clone(&pending_execs),
Arc::clone(&exec_contexts),
@ -811,6 +812,7 @@ impl JsReplManager {
async fn read_stdout(
stdout: tokio::process::ChildStdout,
child: Arc<Mutex<Child>>,
manager_kernel: Arc<Mutex<Option<KernelState>>>,
recent_stderr: Arc<Mutex<VecDeque<String>>>,
pending_execs: Arc<Mutex<HashMap<String, tokio::sync::oneshot::Sender<ExecResultMessage>>>>,
exec_contexts: Arc<Mutex<HashMap<String, ExecContext>>>,
@ -964,6 +966,16 @@ impl JsReplManager {
.clone()
.unwrap_or_else(|| "js_repl kernel exited unexpectedly".to_string());
{
let mut kernel = manager_kernel.lock().await;
let should_clear = kernel
.as_ref()
.is_some_and(|state| Arc::ptr_eq(&state.child, &child));
if should_clear {
kernel.take();
}
}
let mut pending = pending_execs.lock().await;
let pending_exec_ids = pending.keys().cloned().collect::<Vec<_>>();
for (_id, tx) in pending.drain() {
@ -1743,7 +1755,7 @@ mod tests {
}
#[tokio::test]
async fn js_repl_kernel_failure_includes_model_diagnostics() -> anyhow::Result<()> {
async fn js_repl_forced_kernel_exit_recovers_on_next_exec() -> anyhow::Result<()> {
if !can_run_js_repl_runtime_tests().await {
return Ok(());
}
@ -1772,9 +1784,24 @@ mod tests {
Arc::clone(&state.child)
};
JsReplManager::kill_kernel_child(&child, "test_crash").await;
tokio::time::sleep(Duration::from_millis(50)).await;
tokio::time::timeout(Duration::from_secs(1), async {
loop {
let cleared = {
let guard = manager.kernel.lock().await;
guard
.as_ref()
.is_none_or(|state| !Arc::ptr_eq(&state.child, &child))
};
if cleared {
return;
}
tokio::time::sleep(Duration::from_millis(10)).await;
}
})
.await
.expect("host should clear dead kernel state promptly");
let err = manager
let result = manager
.execute(
session,
turn,
@ -1784,13 +1811,107 @@ mod tests {
timeout_ms: Some(10_000),
},
)
.await
.expect_err("expected kernel failure after forced kill");
.await?;
assert!(result.output.contains("after-kill"));
Ok(())
}
#[tokio::test]
async fn js_repl_uncaught_exception_returns_exec_error_and_recovers() -> anyhow::Result<()> {
if !can_run_js_repl_runtime_tests().await {
return Ok(());
}
let (session, turn) = crate::codex::make_session_and_context().await;
let session = Arc::new(session);
let turn = Arc::new(turn);
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default()));
let manager = turn.js_repl.manager().await?;
manager
.execute(
Arc::clone(&session),
Arc::clone(&turn),
Arc::clone(&tracker),
JsReplArgs {
code: "console.log('warmup');".to_string(),
timeout_ms: Some(10_000),
},
)
.await?;
let child = {
let guard = manager.kernel.lock().await;
let state = guard.as_ref().expect("kernel should exist after warmup");
Arc::clone(&state.child)
};
let err = tokio::time::timeout(
Duration::from_secs(3),
manager.execute(
Arc::clone(&session),
Arc::clone(&turn),
Arc::clone(&tracker),
JsReplArgs {
code: "setTimeout(() => { throw new Error('boom'); }, 0);\nawait new Promise(() => {});".to_string(),
timeout_ms: Some(10_000),
},
),
)
.await
.expect("uncaught exception should fail promptly")
.expect_err("expected uncaught exception to fail the exec");
let message = err.to_string();
assert!(message.contains("js_repl diagnostics:"));
assert!(message.contains("\"reason\":\"write_failed\""));
assert!(message.contains("\"kernel_status\":\"exited("));
assert!(message.contains("js_repl kernel uncaught exception: boom"));
assert!(message.contains("kernel reset."));
assert!(message.contains("Catch or handle async errors"));
assert!(!message.contains("js_repl kernel exited unexpectedly"));
tokio::time::timeout(Duration::from_secs(1), async {
loop {
let exited = {
let mut child = child.lock().await;
child.try_wait()?.is_some()
};
if exited {
return Ok::<(), anyhow::Error>(());
}
tokio::time::sleep(Duration::from_millis(10)).await;
}
})
.await
.expect("uncaught exception should terminate the previous kernel process")?;
tokio::time::timeout(Duration::from_secs(1), async {
loop {
let cleared = {
let guard = manager.kernel.lock().await;
guard
.as_ref()
.is_none_or(|state| !Arc::ptr_eq(&state.child, &child))
};
if cleared {
return;
}
tokio::time::sleep(Duration::from_millis(10)).await;
}
})
.await
.expect("host should clear dead kernel state promptly");
let next = manager
.execute(
session,
turn,
tracker,
JsReplArgs {
code: "console.log('after reset');".to_string(),
timeout_ms: Some(10_000),
},
)
.await?;
assert!(next.output.contains("after reset"));
Ok(())
}