tests(js_repl): stabilize CI runtime test execution (#12407)
## Summary Stabilize `js_repl` runtime test setup in CI and move tool-facing `js_repl` behavior coverage into integration tests. This is a test/CI change only. No production `js_repl` behavior change is intended. ## Why - Bazel test sandboxes (especially on macOS) could resolve a different `node` than the one installed by `actions/setup-node`, which caused `js_repl` runtime/version failures. - `js_repl` runtime tests depend on platform-specific sandbox/test-harness behavior, so they need explicit gating in a base-stability commit. - Several tests in the `js_repl` unit test module were actually black-box/tool-level behavior tests and fit better in the integration suite. ## Changes - Add `actions/setup-node` to the Bazel and Rust `Tests` workflows, using the exact version pinned in the repo’s Node version file. - In Bazel (non-Windows), pass `CODEX_JS_REPL_NODE_PATH=$(which node)` into test env so `js_repl` uses the `actions/setup-node` runtime inside Bazel tests. - Add a new integration test suite for `js_repl` tool behavior and register it in the core integration test suite module. - Move black-box `js_repl` behavior tests into the integration suite (persistence/TLA, builtin tool invocation, recursive self-call rejection, `process` isolation, blocked builtin imports). - Keep white-box manager/kernel tests in the `js_repl` unit test module. - Gate `js_repl` runtime tests to run only on macOS and only when a usable Node runtime is available (skip on other platforms / missing Node in this commit). ## Impact - Reduces `js_repl` CI failures caused by Node resolution drift in Bazel. - Improves test organization by separating tool-facing behavior tests from white-box manager/kernel tests. - Keeps the base commit stable while expanding `js_repl` runtime coverage. #### [git stack](https://github.com/magus/git-stack-cli) - ✅ `1` https://github.com/openai/codex/pull/12372 - 👉 `2` https://github.com/openai/codex/pull/12407 - ⏳ `3` https://github.com/openai/codex/pull/12185 - ⏳ `4` https://github.com/openai/codex/pull/10673
This commit is contained in:
parent
16ca527c80
commit
8f3f2c3c02
5 changed files with 291 additions and 200 deletions
12
.github/workflows/bazel.yml
vendored
12
.github/workflows/bazel.yml
vendored
|
|
@ -47,6 +47,11 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Node.js for js_repl tests
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version-file: codex-rs/node-version.txt
|
||||
|
||||
# Some integration tests rely on DotSlash being installed.
|
||||
# See https://github.com/openai/codex/pull/7617.
|
||||
- name: Install DotSlash
|
||||
|
|
@ -156,6 +161,13 @@ jobs:
|
|||
--build_metadata=VISIBILITY=PUBLIC
|
||||
)
|
||||
|
||||
if [[ "${RUNNER_OS:-}" != "Windows" ]]; then
|
||||
# Bazel test sandboxes on macOS may resolve an older Homebrew `node`
|
||||
# before the `actions/setup-node` runtime on PATH.
|
||||
node_bin="$(which node)"
|
||||
bazel_args+=("--test_env=CODEX_JS_REPL_NODE_PATH=${node_bin}")
|
||||
fi
|
||||
|
||||
if [[ -n "${BUILDBUDDY_API_KEY:-}" ]]; then
|
||||
echo "BuildBuddy API key is available; using remote Bazel configuration."
|
||||
# Work around Bazel 9 remote repo contents cache / overlay materialization failures
|
||||
|
|
|
|||
4
.github/workflows/rust-ci.yml
vendored
4
.github/workflows/rust-ci.yml
vendored
|
|
@ -499,6 +499,10 @@ jobs:
|
|||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- name: Set up Node.js for js_repl tests
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version-file: codex-rs/node-version.txt
|
||||
- name: Install Linux build dependencies
|
||||
if: ${{ runner.os == 'Linux' }}
|
||||
shell: bash
|
||||
|
|
|
|||
|
|
@ -1636,6 +1636,12 @@ mod tests {
|
|||
}
|
||||
|
||||
async fn can_run_js_repl_runtime_tests() -> bool {
|
||||
// These white-box runtime tests rely on the unit-test harness and are
|
||||
// only required on macOS. Linux uses the codex-linux-sandbox arg0
|
||||
// dispatch path, which is exercised in integration tests instead.
|
||||
if !cfg!(target_os = "macos") {
|
||||
return false;
|
||||
}
|
||||
if std::env::var_os("CODEX_SANDBOX").is_some() {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -1669,47 +1675,6 @@ mod tests {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_persists_top_level_bindings_and_supports_tla() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let (session, turn) = make_session_and_context().await;
|
||||
let session = Arc::new(session);
|
||||
let turn = Arc::new(turn);
|
||||
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default()));
|
||||
let manager = turn.js_repl.manager().await?;
|
||||
|
||||
let first = manager
|
||||
.execute(
|
||||
Arc::clone(&session),
|
||||
Arc::clone(&turn),
|
||||
Arc::clone(&tracker),
|
||||
JsReplArgs {
|
||||
code: "let x = await Promise.resolve(41); console.log(x);".to_string(),
|
||||
timeout_ms: Some(10_000),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
assert!(first.output.contains("41"));
|
||||
|
||||
let second = manager
|
||||
.execute(
|
||||
Arc::clone(&session),
|
||||
Arc::clone(&turn),
|
||||
Arc::clone(&tracker),
|
||||
JsReplArgs {
|
||||
code: "console.log(x + 1);".to_string(),
|
||||
timeout_ms: Some(10_000),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert!(second.output.contains("42"));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_timeout_does_not_deadlock() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
|
|
@ -1965,108 +1930,9 @@ mod tests {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_can_call_tools() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let (session, mut turn) = make_session_and_context().await;
|
||||
turn.approval_policy
|
||||
.set(AskForApproval::Never)
|
||||
.expect("test setup should allow updating approval policy");
|
||||
turn.sandbox_policy
|
||||
.set(SandboxPolicy::DangerFullAccess)
|
||||
.expect("test setup should allow updating sandbox policy");
|
||||
|
||||
let session = Arc::new(session);
|
||||
let turn = Arc::new(turn);
|
||||
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default()));
|
||||
let manager = turn.js_repl.manager().await?;
|
||||
|
||||
let shell = manager
|
||||
.execute(
|
||||
Arc::clone(&session),
|
||||
Arc::clone(&turn),
|
||||
Arc::clone(&tracker),
|
||||
JsReplArgs {
|
||||
code: "const shellOut = await codex.tool(\"shell_command\", { command: \"printf js_repl_shell_ok\" }); console.log(JSON.stringify(shellOut));".to_string(),
|
||||
timeout_ms: Some(15_000),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
assert!(shell.output.contains("js_repl_shell_ok"));
|
||||
|
||||
let tool = manager
|
||||
.execute(
|
||||
Arc::clone(&session),
|
||||
Arc::clone(&turn),
|
||||
Arc::clone(&tracker),
|
||||
JsReplArgs {
|
||||
code: "const toolOut = await codex.tool(\"list_mcp_resources\", {}); console.log(toolOut.type);".to_string(),
|
||||
timeout_ms: Some(15_000),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
assert!(tool.output.contains("function_call_output"));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_tool_call_rejects_recursive_js_repl_invocation() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let (session, mut turn) = make_session_and_context().await;
|
||||
turn.approval_policy
|
||||
.set(AskForApproval::Never)
|
||||
.expect("test setup should allow updating approval policy");
|
||||
turn.sandbox_policy
|
||||
.set(SandboxPolicy::DangerFullAccess)
|
||||
.expect("test setup should allow updating sandbox policy");
|
||||
|
||||
let session = Arc::new(session);
|
||||
let turn = Arc::new(turn);
|
||||
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default()));
|
||||
let manager = turn.js_repl.manager().await?;
|
||||
|
||||
let result = manager
|
||||
.execute(
|
||||
session,
|
||||
turn,
|
||||
tracker,
|
||||
JsReplArgs {
|
||||
code: r#"
|
||||
try {
|
||||
await codex.tool("js_repl", "console.log('recursive')");
|
||||
console.log("unexpected-success");
|
||||
} catch (err) {
|
||||
console.log(String(err));
|
||||
}
|
||||
"#
|
||||
.to_string(),
|
||||
timeout_ms: Some(15_000),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert!(
|
||||
result.output.contains("js_repl cannot invoke itself"),
|
||||
"expected recursion guard message, got output: {}",
|
||||
result.output
|
||||
);
|
||||
assert!(
|
||||
!result.output.contains("unexpected-success"),
|
||||
"recursive js_repl tool call unexpectedly succeeded: {}",
|
||||
result.output
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_waits_for_unawaited_tool_calls_before_completion() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await || cfg!(windows) {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
|
@ -2277,65 +2143,6 @@ console.log(out.type);
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_does_not_expose_process_global() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let (session, turn) = make_session_and_context().await;
|
||||
let session = Arc::new(session);
|
||||
let turn = Arc::new(turn);
|
||||
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default()));
|
||||
let manager = turn.js_repl.manager().await?;
|
||||
|
||||
let result = manager
|
||||
.execute(
|
||||
session,
|
||||
turn,
|
||||
tracker,
|
||||
JsReplArgs {
|
||||
code: "console.log(typeof process);".to_string(),
|
||||
timeout_ms: Some(10_000),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
assert!(result.output.contains("undefined"));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_blocks_sensitive_builtin_imports() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let (session, turn) = make_session_and_context().await;
|
||||
let session = Arc::new(session);
|
||||
let turn = Arc::new(turn);
|
||||
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default()));
|
||||
let manager = turn.js_repl.manager().await?;
|
||||
|
||||
let err = manager
|
||||
.execute(
|
||||
session,
|
||||
turn,
|
||||
tracker,
|
||||
JsReplArgs {
|
||||
code: "await import(\"node:process\");".to_string(),
|
||||
timeout_ms: Some(10_000),
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect_err("node:process import should be blocked");
|
||||
assert!(
|
||||
err.to_string()
|
||||
.contains("Importing module \"node:process\" is not allowed in js_repl")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn js_repl_prefers_env_node_module_dirs_over_config() -> anyhow::Result<()> {
|
||||
if !can_run_js_repl_runtime_tests().await {
|
||||
|
|
|
|||
267
codex-rs/core/tests/suite/js_repl.rs
Normal file
267
codex-rs/core/tests/suite/js_repl.rs
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
#![allow(clippy::expect_used, clippy::unwrap_used)]
|
||||
|
||||
use anyhow::Result;
|
||||
use codex_core::features::Feature;
|
||||
use core_test_support::responses;
|
||||
use core_test_support::responses::ResponseMock;
|
||||
use core_test_support::responses::ResponsesRequest;
|
||||
use core_test_support::responses::ev_assistant_message;
|
||||
use core_test_support::responses::ev_completed;
|
||||
use core_test_support::responses::ev_custom_tool_call;
|
||||
use core_test_support::responses::ev_response_created;
|
||||
use core_test_support::responses::sse;
|
||||
use core_test_support::skip_if_no_network;
|
||||
use core_test_support::test_codex::test_codex;
|
||||
use which::which;
|
||||
use wiremock::MockServer;
|
||||
|
||||
fn custom_tool_output_text_and_success(
|
||||
req: &ResponsesRequest,
|
||||
call_id: &str,
|
||||
) -> (String, Option<bool>) {
|
||||
let (output, success) = req
|
||||
.custom_tool_call_output_content_and_success(call_id)
|
||||
.expect("custom tool output should be present");
|
||||
(output.unwrap_or_default(), success)
|
||||
}
|
||||
|
||||
async fn run_js_repl_turn(
|
||||
server: &MockServer,
|
||||
prompt: &str,
|
||||
calls: &[(&str, &str)],
|
||||
) -> Result<ResponseMock> {
|
||||
let mut builder = test_codex().with_config(|config| {
|
||||
config.features.enable(Feature::JsRepl);
|
||||
});
|
||||
let test = builder.build(server).await?;
|
||||
|
||||
let mut first_events = vec![ev_response_created("resp-1")];
|
||||
for (call_id, js_input) in calls {
|
||||
first_events.push(ev_custom_tool_call(call_id, "js_repl", js_input));
|
||||
}
|
||||
first_events.push(ev_completed("resp-1"));
|
||||
responses::mount_sse_once(server, sse(first_events)).await;
|
||||
|
||||
let second_mock = responses::mount_sse_once(
|
||||
server,
|
||||
sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
|
||||
test.submit_turn(prompt).await?;
|
||||
Ok(second_mock)
|
||||
}
|
||||
|
||||
fn can_run_js_repl_runtime_tests() -> bool {
|
||||
// In the base CI-stabilization commit, skip these integration tests only
|
||||
// when the js_repl Node runtime is not available. A later stacked commit
|
||||
// removes this automatic skip and requires them everywhere.
|
||||
if let Some(path) = std::env::var_os("CODEX_JS_REPL_NODE_PATH")
|
||||
&& std::path::Path::new(&path).exists()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
which("node").is_ok()
|
||||
}
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn js_repl_persists_top_level_bindings_and_supports_tla() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
if !can_run_js_repl_runtime_tests() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let mut builder = test_codex().with_config(|config| {
|
||||
config.features.enable(Feature::JsRepl);
|
||||
});
|
||||
let test = builder.build(&server).await?;
|
||||
|
||||
responses::mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_custom_tool_call(
|
||||
"call-1",
|
||||
"js_repl",
|
||||
"let x = await Promise.resolve(41); console.log(x);",
|
||||
),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
let second_mock = responses::mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-2"),
|
||||
ev_custom_tool_call("call-2", "js_repl", "console.log(x + 1);"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
let third_mock = responses::mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-3"),
|
||||
]),
|
||||
)
|
||||
.await;
|
||||
|
||||
test.submit_turn("run js_repl twice").await?;
|
||||
|
||||
let req2 = second_mock.single_request();
|
||||
let (first_output, first_success) = custom_tool_output_text_and_success(&req2, "call-1");
|
||||
assert_ne!(
|
||||
first_success,
|
||||
Some(false),
|
||||
"first js_repl call failed unexpectedly: {first_output}"
|
||||
);
|
||||
assert!(first_output.contains("41"));
|
||||
|
||||
let req3 = third_mock.single_request();
|
||||
let (second_output, second_success) = custom_tool_output_text_and_success(&req3, "call-2");
|
||||
assert_ne!(
|
||||
second_success,
|
||||
Some(false),
|
||||
"second js_repl call failed unexpectedly: {second_output}"
|
||||
);
|
||||
assert!(second_output.contains("42"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn js_repl_can_invoke_builtin_tools() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
if !can_run_js_repl_runtime_tests() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let mock = run_js_repl_turn(
|
||||
&server,
|
||||
"use js_repl to call a tool",
|
||||
&[(
|
||||
"call-1",
|
||||
"const toolOut = await codex.tool(\"list_mcp_resources\", {}); console.log(toolOut.type);",
|
||||
)],
|
||||
)
|
||||
.await?;
|
||||
|
||||
let req = mock.single_request();
|
||||
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
|
||||
assert_ne!(
|
||||
success,
|
||||
Some(false),
|
||||
"js_repl call failed unexpectedly: {output}"
|
||||
);
|
||||
assert!(output.contains("function_call_output"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn js_repl_tool_call_rejects_recursive_js_repl_invocation() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
if !can_run_js_repl_runtime_tests() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let mock = run_js_repl_turn(
|
||||
&server,
|
||||
"use js_repl recursively",
|
||||
&[(
|
||||
"call-1",
|
||||
r#"
|
||||
try {
|
||||
await codex.tool("js_repl", "console.log('recursive')");
|
||||
console.log("unexpected-success");
|
||||
} catch (err) {
|
||||
console.log(String(err));
|
||||
}
|
||||
"#,
|
||||
)],
|
||||
)
|
||||
.await?;
|
||||
|
||||
let req = mock.single_request();
|
||||
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
|
||||
assert_ne!(
|
||||
success,
|
||||
Some(false),
|
||||
"js_repl call failed unexpectedly: {output}"
|
||||
);
|
||||
assert!(
|
||||
output.contains("js_repl cannot invoke itself"),
|
||||
"expected recursion guard message, got output: {output}"
|
||||
);
|
||||
assert!(
|
||||
!output.contains("unexpected-success"),
|
||||
"recursive js_repl call unexpectedly succeeded: {output}"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn js_repl_does_not_expose_process_global() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
if !can_run_js_repl_runtime_tests() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let mock = run_js_repl_turn(
|
||||
&server,
|
||||
"check process visibility",
|
||||
&[("call-1", "console.log(typeof process);")],
|
||||
)
|
||||
.await?;
|
||||
|
||||
let req = mock.single_request();
|
||||
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
|
||||
assert_ne!(
|
||||
success,
|
||||
Some(false),
|
||||
"js_repl call failed unexpectedly: {output}"
|
||||
);
|
||||
assert!(output.contains("undefined"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn js_repl_blocks_sensitive_builtin_imports() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
if !can_run_js_repl_runtime_tests() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let server = responses::start_mock_server().await;
|
||||
let mock = run_js_repl_turn(
|
||||
&server,
|
||||
"import a blocked module",
|
||||
&[("call-1", "await import(\"node:process\");")],
|
||||
)
|
||||
.await?;
|
||||
|
||||
let req = mock.single_request();
|
||||
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
|
||||
assert_ne!(
|
||||
success,
|
||||
Some(true),
|
||||
"blocked import unexpectedly succeeded: {output}"
|
||||
);
|
||||
assert!(output.contains("Importing module \"node:process\" is not allowed in js_repl"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -78,6 +78,7 @@ mod grep_files;
|
|||
mod hierarchical_agents;
|
||||
mod image_rollout;
|
||||
mod items;
|
||||
mod js_repl;
|
||||
mod json_result;
|
||||
mod list_dir;
|
||||
mod live_cli;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue