From 8f3f2c3c025e06acf9fd5e149810c2d08e224d84 Mon Sep 17 00:00:00 2001 From: Curtis 'Fjord' Hawthorne Date: Tue, 24 Feb 2026 21:04:34 -0800 Subject: [PATCH] tests(js_repl): stabilize CI runtime test execution (#12407) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Stabilize `js_repl` runtime test setup in CI and move tool-facing `js_repl` behavior coverage into integration tests. This is a test/CI change only. No production `js_repl` behavior change is intended. ## Why - Bazel test sandboxes (especially on macOS) could resolve a different `node` than the one installed by `actions/setup-node`, which caused `js_repl` runtime/version failures. - `js_repl` runtime tests depend on platform-specific sandbox/test-harness behavior, so they need explicit gating in a base-stability commit. - Several tests in the `js_repl` unit test module were actually black-box/tool-level behavior tests and fit better in the integration suite. ## Changes - Add `actions/setup-node` to the Bazel and Rust `Tests` workflows, using the exact version pinned in the repo’s Node version file. - In Bazel (non-Windows), pass `CODEX_JS_REPL_NODE_PATH=$(which node)` into test env so `js_repl` uses the `actions/setup-node` runtime inside Bazel tests. - Add a new integration test suite for `js_repl` tool behavior and register it in the core integration test suite module. - Move black-box `js_repl` behavior tests into the integration suite (persistence/TLA, builtin tool invocation, recursive self-call rejection, `process` isolation, blocked builtin imports). - Keep white-box manager/kernel tests in the `js_repl` unit test module. - Gate `js_repl` runtime tests to run only on macOS and only when a usable Node runtime is available (skip on other platforms / missing Node in this commit). ## Impact - Reduces `js_repl` CI failures caused by Node resolution drift in Bazel. - Improves test organization by separating tool-facing behavior tests from white-box manager/kernel tests. - Keeps the base commit stable while expanding `js_repl` runtime coverage. #### [git stack](https://github.com/magus/git-stack-cli) - ✅ `1` https://github.com/openai/codex/pull/12372 - 👉 `2` https://github.com/openai/codex/pull/12407 - ⏳ `3` https://github.com/openai/codex/pull/12185 - ⏳ `4` https://github.com/openai/codex/pull/10673 --- .github/workflows/bazel.yml | 12 ++ .github/workflows/rust-ci.yml | 4 + codex-rs/core/src/tools/js_repl/mod.rs | 207 +------------------ codex-rs/core/tests/suite/js_repl.rs | 267 +++++++++++++++++++++++++ codex-rs/core/tests/suite/mod.rs | 1 + 5 files changed, 291 insertions(+), 200 deletions(-) create mode 100644 codex-rs/core/tests/suite/js_repl.rs diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml index 1a0a69035..539e9cb8e 100644 --- a/.github/workflows/bazel.yml +++ b/.github/workflows/bazel.yml @@ -47,6 +47,11 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Set up Node.js for js_repl tests + uses: actions/setup-node@v6 + with: + node-version-file: codex-rs/node-version.txt + # Some integration tests rely on DotSlash being installed. # See https://github.com/openai/codex/pull/7617. - name: Install DotSlash @@ -156,6 +161,13 @@ jobs: --build_metadata=VISIBILITY=PUBLIC ) + if [[ "${RUNNER_OS:-}" != "Windows" ]]; then + # Bazel test sandboxes on macOS may resolve an older Homebrew `node` + # before the `actions/setup-node` runtime on PATH. + node_bin="$(which node)" + bazel_args+=("--test_env=CODEX_JS_REPL_NODE_PATH=${node_bin}") + fi + if [[ -n "${BUILDBUDDY_API_KEY:-}" ]]; then echo "BuildBuddy API key is available; using remote Bazel configuration." # Work around Bazel 9 remote repo contents cache / overlay materialization failures diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 8e8825e9f..3122a965a 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -499,6 +499,10 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Set up Node.js for js_repl tests + uses: actions/setup-node@v6 + with: + node-version-file: codex-rs/node-version.txt - name: Install Linux build dependencies if: ${{ runner.os == 'Linux' }} shell: bash diff --git a/codex-rs/core/src/tools/js_repl/mod.rs b/codex-rs/core/src/tools/js_repl/mod.rs index 18695ab88..3ae9cff62 100644 --- a/codex-rs/core/src/tools/js_repl/mod.rs +++ b/codex-rs/core/src/tools/js_repl/mod.rs @@ -1636,6 +1636,12 @@ mod tests { } async fn can_run_js_repl_runtime_tests() -> bool { + // These white-box runtime tests rely on the unit-test harness and are + // only required on macOS. Linux uses the codex-linux-sandbox arg0 + // dispatch path, which is exercised in integration tests instead. + if !cfg!(target_os = "macos") { + return false; + } if std::env::var_os("CODEX_SANDBOX").is_some() { return false; } @@ -1669,47 +1675,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn js_repl_persists_top_level_bindings_and_supports_tla() -> anyhow::Result<()> { - if !can_run_js_repl_runtime_tests().await { - return Ok(()); - } - - let (session, turn) = make_session_and_context().await; - let session = Arc::new(session); - let turn = Arc::new(turn); - let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); - let manager = turn.js_repl.manager().await?; - - let first = manager - .execute( - Arc::clone(&session), - Arc::clone(&turn), - Arc::clone(&tracker), - JsReplArgs { - code: "let x = await Promise.resolve(41); console.log(x);".to_string(), - timeout_ms: Some(10_000), - }, - ) - .await?; - assert!(first.output.contains("41")); - - let second = manager - .execute( - Arc::clone(&session), - Arc::clone(&turn), - Arc::clone(&tracker), - JsReplArgs { - code: "console.log(x + 1);".to_string(), - timeout_ms: Some(10_000), - }, - ) - .await?; - - assert!(second.output.contains("42")); - Ok(()) - } - #[tokio::test] async fn js_repl_timeout_does_not_deadlock() -> anyhow::Result<()> { if !can_run_js_repl_runtime_tests().await { @@ -1965,108 +1930,9 @@ mod tests { Ok(()) } - #[tokio::test] - async fn js_repl_can_call_tools() -> anyhow::Result<()> { - if !can_run_js_repl_runtime_tests().await { - return Ok(()); - } - - let (session, mut turn) = make_session_and_context().await; - turn.approval_policy - .set(AskForApproval::Never) - .expect("test setup should allow updating approval policy"); - turn.sandbox_policy - .set(SandboxPolicy::DangerFullAccess) - .expect("test setup should allow updating sandbox policy"); - - let session = Arc::new(session); - let turn = Arc::new(turn); - let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); - let manager = turn.js_repl.manager().await?; - - let shell = manager - .execute( - Arc::clone(&session), - Arc::clone(&turn), - Arc::clone(&tracker), - JsReplArgs { - code: "const shellOut = await codex.tool(\"shell_command\", { command: \"printf js_repl_shell_ok\" }); console.log(JSON.stringify(shellOut));".to_string(), - timeout_ms: Some(15_000), - }, - ) - .await?; - assert!(shell.output.contains("js_repl_shell_ok")); - - let tool = manager - .execute( - Arc::clone(&session), - Arc::clone(&turn), - Arc::clone(&tracker), - JsReplArgs { - code: "const toolOut = await codex.tool(\"list_mcp_resources\", {}); console.log(toolOut.type);".to_string(), - timeout_ms: Some(15_000), - }, - ) - .await?; - assert!(tool.output.contains("function_call_output")); - Ok(()) - } - - #[tokio::test] - async fn js_repl_tool_call_rejects_recursive_js_repl_invocation() -> anyhow::Result<()> { - if !can_run_js_repl_runtime_tests().await { - return Ok(()); - } - - let (session, mut turn) = make_session_and_context().await; - turn.approval_policy - .set(AskForApproval::Never) - .expect("test setup should allow updating approval policy"); - turn.sandbox_policy - .set(SandboxPolicy::DangerFullAccess) - .expect("test setup should allow updating sandbox policy"); - - let session = Arc::new(session); - let turn = Arc::new(turn); - let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); - let manager = turn.js_repl.manager().await?; - - let result = manager - .execute( - session, - turn, - tracker, - JsReplArgs { - code: r#" -try { - await codex.tool("js_repl", "console.log('recursive')"); - console.log("unexpected-success"); -} catch (err) { - console.log(String(err)); -} -"# - .to_string(), - timeout_ms: Some(15_000), - }, - ) - .await?; - - assert!( - result.output.contains("js_repl cannot invoke itself"), - "expected recursion guard message, got output: {}", - result.output - ); - assert!( - !result.output.contains("unexpected-success"), - "recursive js_repl tool call unexpectedly succeeded: {}", - result.output - ); - Ok(()) - } - #[tokio::test] async fn js_repl_waits_for_unawaited_tool_calls_before_completion() -> anyhow::Result<()> { - if !can_run_js_repl_runtime_tests().await || cfg!(windows) { + if !can_run_js_repl_runtime_tests().await { return Ok(()); } @@ -2277,65 +2143,6 @@ console.log(out.type); Ok(()) } - - #[tokio::test] - async fn js_repl_does_not_expose_process_global() -> anyhow::Result<()> { - if !can_run_js_repl_runtime_tests().await { - return Ok(()); - } - - let (session, turn) = make_session_and_context().await; - let session = Arc::new(session); - let turn = Arc::new(turn); - let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); - let manager = turn.js_repl.manager().await?; - - let result = manager - .execute( - session, - turn, - tracker, - JsReplArgs { - code: "console.log(typeof process);".to_string(), - timeout_ms: Some(10_000), - }, - ) - .await?; - assert!(result.output.contains("undefined")); - Ok(()) - } - - #[tokio::test] - async fn js_repl_blocks_sensitive_builtin_imports() -> anyhow::Result<()> { - if !can_run_js_repl_runtime_tests().await { - return Ok(()); - } - - let (session, turn) = make_session_and_context().await; - let session = Arc::new(session); - let turn = Arc::new(turn); - let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); - let manager = turn.js_repl.manager().await?; - - let err = manager - .execute( - session, - turn, - tracker, - JsReplArgs { - code: "await import(\"node:process\");".to_string(), - timeout_ms: Some(10_000), - }, - ) - .await - .expect_err("node:process import should be blocked"); - assert!( - err.to_string() - .contains("Importing module \"node:process\" is not allowed in js_repl") - ); - Ok(()) - } - #[tokio::test] async fn js_repl_prefers_env_node_module_dirs_over_config() -> anyhow::Result<()> { if !can_run_js_repl_runtime_tests().await { diff --git a/codex-rs/core/tests/suite/js_repl.rs b/codex-rs/core/tests/suite/js_repl.rs new file mode 100644 index 000000000..364b07726 --- /dev/null +++ b/codex-rs/core/tests/suite/js_repl.rs @@ -0,0 +1,267 @@ +#![allow(clippy::expect_used, clippy::unwrap_used)] + +use anyhow::Result; +use codex_core::features::Feature; +use core_test_support::responses; +use core_test_support::responses::ResponseMock; +use core_test_support::responses::ResponsesRequest; +use core_test_support::responses::ev_assistant_message; +use core_test_support::responses::ev_completed; +use core_test_support::responses::ev_custom_tool_call; +use core_test_support::responses::ev_response_created; +use core_test_support::responses::sse; +use core_test_support::skip_if_no_network; +use core_test_support::test_codex::test_codex; +use which::which; +use wiremock::MockServer; + +fn custom_tool_output_text_and_success( + req: &ResponsesRequest, + call_id: &str, +) -> (String, Option) { + let (output, success) = req + .custom_tool_call_output_content_and_success(call_id) + .expect("custom tool output should be present"); + (output.unwrap_or_default(), success) +} + +async fn run_js_repl_turn( + server: &MockServer, + prompt: &str, + calls: &[(&str, &str)], +) -> Result { + let mut builder = test_codex().with_config(|config| { + config.features.enable(Feature::JsRepl); + }); + let test = builder.build(server).await?; + + let mut first_events = vec![ev_response_created("resp-1")]; + for (call_id, js_input) in calls { + first_events.push(ev_custom_tool_call(call_id, "js_repl", js_input)); + } + first_events.push(ev_completed("resp-1")); + responses::mount_sse_once(server, sse(first_events)).await; + + let second_mock = responses::mount_sse_once( + server, + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ) + .await; + + test.submit_turn(prompt).await?; + Ok(second_mock) +} + +fn can_run_js_repl_runtime_tests() -> bool { + // In the base CI-stabilization commit, skip these integration tests only + // when the js_repl Node runtime is not available. A later stacked commit + // removes this automatic skip and requires them everywhere. + if let Some(path) = std::env::var_os("CODEX_JS_REPL_NODE_PATH") + && std::path::Path::new(&path).exists() + { + return true; + } + which("node").is_ok() +} +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn js_repl_persists_top_level_bindings_and_supports_tla() -> Result<()> { + skip_if_no_network!(Ok(())); + + if !can_run_js_repl_runtime_tests() { + return Ok(()); + } + + let server = responses::start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + config.features.enable(Feature::JsRepl); + }); + let test = builder.build(&server).await?; + + responses::mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-1"), + ev_custom_tool_call( + "call-1", + "js_repl", + "let x = await Promise.resolve(41); console.log(x);", + ), + ev_completed("resp-1"), + ]), + ) + .await; + let second_mock = responses::mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-2"), + ev_custom_tool_call("call-2", "js_repl", "console.log(x + 1);"), + ev_completed("resp-2"), + ]), + ) + .await; + let third_mock = responses::mount_sse_once( + &server, + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-3"), + ]), + ) + .await; + + test.submit_turn("run js_repl twice").await?; + + let req2 = second_mock.single_request(); + let (first_output, first_success) = custom_tool_output_text_and_success(&req2, "call-1"); + assert_ne!( + first_success, + Some(false), + "first js_repl call failed unexpectedly: {first_output}" + ); + assert!(first_output.contains("41")); + + let req3 = third_mock.single_request(); + let (second_output, second_success) = custom_tool_output_text_and_success(&req3, "call-2"); + assert_ne!( + second_success, + Some(false), + "second js_repl call failed unexpectedly: {second_output}" + ); + assert!(second_output.contains("42")); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn js_repl_can_invoke_builtin_tools() -> Result<()> { + skip_if_no_network!(Ok(())); + + if !can_run_js_repl_runtime_tests() { + return Ok(()); + } + + let server = responses::start_mock_server().await; + let mock = run_js_repl_turn( + &server, + "use js_repl to call a tool", + &[( + "call-1", + "const toolOut = await codex.tool(\"list_mcp_resources\", {}); console.log(toolOut.type);", + )], + ) + .await?; + + let req = mock.single_request(); + let (output, success) = custom_tool_output_text_and_success(&req, "call-1"); + assert_ne!( + success, + Some(false), + "js_repl call failed unexpectedly: {output}" + ); + assert!(output.contains("function_call_output")); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn js_repl_tool_call_rejects_recursive_js_repl_invocation() -> Result<()> { + skip_if_no_network!(Ok(())); + + if !can_run_js_repl_runtime_tests() { + return Ok(()); + } + + let server = responses::start_mock_server().await; + let mock = run_js_repl_turn( + &server, + "use js_repl recursively", + &[( + "call-1", + r#" +try { + await codex.tool("js_repl", "console.log('recursive')"); + console.log("unexpected-success"); +} catch (err) { + console.log(String(err)); +} +"#, + )], + ) + .await?; + + let req = mock.single_request(); + let (output, success) = custom_tool_output_text_and_success(&req, "call-1"); + assert_ne!( + success, + Some(false), + "js_repl call failed unexpectedly: {output}" + ); + assert!( + output.contains("js_repl cannot invoke itself"), + "expected recursion guard message, got output: {output}" + ); + assert!( + !output.contains("unexpected-success"), + "recursive js_repl call unexpectedly succeeded: {output}" + ); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn js_repl_does_not_expose_process_global() -> Result<()> { + skip_if_no_network!(Ok(())); + + if !can_run_js_repl_runtime_tests() { + return Ok(()); + } + + let server = responses::start_mock_server().await; + let mock = run_js_repl_turn( + &server, + "check process visibility", + &[("call-1", "console.log(typeof process);")], + ) + .await?; + + let req = mock.single_request(); + let (output, success) = custom_tool_output_text_and_success(&req, "call-1"); + assert_ne!( + success, + Some(false), + "js_repl call failed unexpectedly: {output}" + ); + assert!(output.contains("undefined")); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn js_repl_blocks_sensitive_builtin_imports() -> Result<()> { + skip_if_no_network!(Ok(())); + + if !can_run_js_repl_runtime_tests() { + return Ok(()); + } + + let server = responses::start_mock_server().await; + let mock = run_js_repl_turn( + &server, + "import a blocked module", + &[("call-1", "await import(\"node:process\");")], + ) + .await?; + + let req = mock.single_request(); + let (output, success) = custom_tool_output_text_and_success(&req, "call-1"); + assert_ne!( + success, + Some(true), + "blocked import unexpectedly succeeded: {output}" + ); + assert!(output.contains("Importing module \"node:process\" is not allowed in js_repl")); + + Ok(()) +} diff --git a/codex-rs/core/tests/suite/mod.rs b/codex-rs/core/tests/suite/mod.rs index f3b64e1e7..1f20aeee3 100644 --- a/codex-rs/core/tests/suite/mod.rs +++ b/codex-rs/core/tests/suite/mod.rs @@ -78,6 +78,7 @@ mod grep_files; mod hierarchical_agents; mod image_rollout; mod items; +mod js_repl; mod json_result; mod list_dir; mod live_cli;