From 3c3d3d1adcace87fca7b0e0b9f9b6bef4a8dff72 Mon Sep 17 00:00:00 2001 From: Michael Bolin Date: Sat, 6 Dec 2025 22:39:38 -0800 Subject: [PATCH] fix: add integration tests for codex-exec-mcp-server with execpolicy (#7617) This PR introduces integration tests that run [codex-shell-tool-mcp](https://www.npmjs.com/package/@openai/codex-shell-tool-mcp) as a user would. Note that this requires running our fork of Bash, so we introduce a [DotSlash](https://dotslash-cli.com/) file for `bash` so that we can run the integration tests on multiple platforms without having to check the binaries into the repository. (As noted in the DotSlash file, it is slightly more heavyweight than necessary, which may be worth addressing as disk space in CI is limited: https://github.com/openai/codex/pull/7678.) To start, this PR adds two tests: - `list_tools()` makes the `list_tools` request to the MCP server and verifies we get the expected response - `accept_elicitation_for_prompt_rule()` defines a `prefix_rule()` with `decision="prompt"` and verifies the elicitation flow works as expected Though the `accept_elicitation_for_prompt_rule()` test **only works on Linux**, as this PR reveals that there are currently issues when running the Bash fork in a read-only sandbox on Linux. This will have to be fixed in a follow-up PR. Incidentally, getting this test run to correctly on macOS also requires a recent fix we made to `brew` that hasn't hit a mainline release yet, so getting CI green in this PR required https://github.com/openai/codex/pull/7680. --- .github/workflows/rust-ci.yml | 13 ++ codex-rs/Cargo.lock | 16 ++ codex-rs/Cargo.toml | 1 + codex-rs/exec-server/Cargo.toml | 4 + codex-rs/exec-server/src/lib.rs | 3 + codex-rs/exec-server/src/posix.rs | 2 + codex-rs/exec-server/src/posix/mcp.rs | 2 +- codex-rs/exec-server/tests/all.rs | 3 + codex-rs/exec-server/tests/common/Cargo.toml | 16 ++ codex-rs/exec-server/tests/common/lib.rs | 167 ++++++++++++++++++ .../tests/suite/accept_elicitation.rs | 131 ++++++++++++++ codex-rs/exec-server/tests/suite/bash | 75 ++++++++ .../exec-server/tests/suite/list_tools.rs | 76 ++++++++ codex-rs/exec-server/tests/suite/mod.rs | 8 + 14 files changed, 516 insertions(+), 1 deletion(-) create mode 100644 codex-rs/exec-server/tests/all.rs create mode 100644 codex-rs/exec-server/tests/common/Cargo.toml create mode 100644 codex-rs/exec-server/tests/common/lib.rs create mode 100644 codex-rs/exec-server/tests/suite/accept_elicitation.rs create mode 100755 codex-rs/exec-server/tests/suite/bash create mode 100644 codex-rs/exec-server/tests/suite/list_tools.rs create mode 100644 codex-rs/exec-server/tests/suite/mod.rs diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 354b40392..0be45540c 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -407,6 +407,19 @@ jobs: brew upgrade brew --version + # Some integration tests rely on DotSlash being installed. + # See https://github.com/openai/codex/pull/7617. + - name: Install DotSlash + uses: facebook/install-dotslash@v2 + + - name: Pre-fetch DotSlash artifacts + # The Bash wrapper is not available on Windows. + if: ${{ !startsWith(matrix.runner, 'windows') }} + shell: bash + run: | + set -euo pipefail + dotslash -- fetch exec-server/tests/suite/bash + - uses: dtolnay/rust-toolchain@1.90 with: targets: ${{ matrix.target }} diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index b77cf01b0..ea1eec8f8 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1256,11 +1256,14 @@ name = "codex-exec-server" version = "0.0.0" dependencies = [ "anyhow", + "assert_cmd", "async-trait", "clap", "codex-core", "codex-execpolicy", + "exec_server_test_support", "libc", + "maplit", "path-absolutize", "pretty_assertions", "rmcp", @@ -1273,6 +1276,7 @@ dependencies = [ "tokio-util", "tracing", "tracing-subscriber", + "which", ] [[package]] @@ -2501,6 +2505,18 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "exec_server_test_support" +version = "0.0.0" +dependencies = [ + "anyhow", + "assert_cmd", + "codex-core", + "rmcp", + "serde_json", + "tokio", +] + [[package]] name = "eyre" version = "0.6.12" diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index d3d3c36c3..0587ec1d4 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -96,6 +96,7 @@ codex-utils-readiness = { path = "utils/readiness" } codex-utils-string = { path = "utils/string" } codex-windows-sandbox = { path = "windows-sandbox-rs" } core_test_support = { path = "core/tests/common" } +exec_server_test_support = { path = "exec-server/tests/common" } mcp-types = { path = "mcp-types" } mcp_test_support = { path = "mcp-server/tests/common" } diff --git a/codex-rs/exec-server/Cargo.toml b/codex-rs/exec-server/Cargo.toml index ab6ca80a1..a0bd53493 100644 --- a/codex-rs/exec-server/Cargo.toml +++ b/codex-rs/exec-server/Cargo.toml @@ -56,5 +56,9 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] } [dev-dependencies] +assert_cmd = { workspace = true } +exec_server_test_support = { workspace = true } +maplit = { workspace = true } pretty_assertions = { workspace = true } tempfile = { workspace = true } +which = { workspace = true } diff --git a/codex-rs/exec-server/src/lib.rs b/codex-rs/exec-server/src/lib.rs index adec09d4d..62f7bbccc 100644 --- a/codex-rs/exec-server/src/lib.rs +++ b/codex-rs/exec-server/src/lib.rs @@ -6,3 +6,6 @@ pub use posix::main_execve_wrapper; #[cfg(unix)] pub use posix::main_mcp_server; + +#[cfg(unix)] +pub use posix::ExecResult; diff --git a/codex-rs/exec-server/src/posix.rs b/codex-rs/exec-server/src/posix.rs index 16da5885f..1a4b0a0e1 100644 --- a/codex-rs/exec-server/src/posix.rs +++ b/codex-rs/exec-server/src/posix.rs @@ -82,6 +82,8 @@ mod mcp_escalation_policy; mod socket; mod stopwatch; +pub use mcp::ExecResult; + /// Default value of --execve option relative to the current executable. /// Note this must match the name of the binary as specified in Cargo.toml. const CODEX_EXECVE_WRAPPER_EXE_NAME: &str = "codex-execve-wrapper"; diff --git a/codex-rs/exec-server/src/posix/mcp.rs b/codex-rs/exec-server/src/posix/mcp.rs index bbbddc22e..1376d46b7 100644 --- a/codex-rs/exec-server/src/posix/mcp.rs +++ b/codex-rs/exec-server/src/posix/mcp.rs @@ -54,7 +54,7 @@ pub struct ExecParams { pub login: Option, } -#[derive(Debug, serde::Serialize, schemars::JsonSchema)] +#[derive(Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema)] pub struct ExecResult { pub exit_code: i32, pub output: String, diff --git a/codex-rs/exec-server/tests/all.rs b/codex-rs/exec-server/tests/all.rs new file mode 100644 index 000000000..7e136e4cc --- /dev/null +++ b/codex-rs/exec-server/tests/all.rs @@ -0,0 +1,3 @@ +// Single integration test binary that aggregates all test modules. +// The submodules live in `tests/suite/`. +mod suite; diff --git a/codex-rs/exec-server/tests/common/Cargo.toml b/codex-rs/exec-server/tests/common/Cargo.toml new file mode 100644 index 000000000..ba7d2af0a --- /dev/null +++ b/codex-rs/exec-server/tests/common/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "exec_server_test_support" +version.workspace = true +edition.workspace = true +license.workspace = true + +[lib] +path = "lib.rs" + +[dependencies] +assert_cmd = { workspace = true } +anyhow = { workspace = true } +codex-core = { workspace = true } +rmcp = { workspace = true } +serde_json = { workspace = true } +tokio = { workspace = true } diff --git a/codex-rs/exec-server/tests/common/lib.rs b/codex-rs/exec-server/tests/common/lib.rs new file mode 100644 index 000000000..c6df5c32c --- /dev/null +++ b/codex-rs/exec-server/tests/common/lib.rs @@ -0,0 +1,167 @@ +use codex_core::MCP_SANDBOX_STATE_NOTIFICATION; +use codex_core::SandboxState; +use codex_core::protocol::SandboxPolicy; +use rmcp::ClientHandler; +use rmcp::ErrorData as McpError; +use rmcp::RoleClient; +use rmcp::Service; +use rmcp::model::ClientCapabilities; +use rmcp::model::ClientInfo; +use rmcp::model::CreateElicitationRequestParam; +use rmcp::model::CreateElicitationResult; +use rmcp::model::CustomClientNotification; +use rmcp::model::ElicitationAction; +use rmcp::service::RunningService; +use rmcp::transport::ConfigureCommandExt; +use rmcp::transport::TokioChildProcess; +use serde_json::json; +use std::collections::HashSet; +use std::path::Path; +use std::path::PathBuf; +use std::process::Stdio; +use std::sync::Arc; +use std::sync::Mutex; +use tokio::process::Command; + +pub fn create_transport

(codex_home: P) -> anyhow::Result +where + P: AsRef, +{ + let mcp_executable = assert_cmd::Command::cargo_bin("codex-exec-mcp-server")?; + let execve_wrapper = assert_cmd::Command::cargo_bin("codex-execve-wrapper")?; + let bash = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join("tests") + .join("suite") + .join("bash"); + + let transport = + TokioChildProcess::new(Command::new(mcp_executable.get_program()).configure(|cmd| { + cmd.arg("--bash").arg(bash); + cmd.arg("--execve").arg(execve_wrapper.get_program()); + cmd.env("CODEX_HOME", codex_home.as_ref()); + + // Important: pipe stdio so rmcp can speak JSON-RPC over stdin/stdout + cmd.stdin(Stdio::piped()); + cmd.stdout(Stdio::piped()); + + // Optional but very helpful while debugging: + cmd.stderr(Stdio::inherit()); + }))?; + + Ok(transport) +} + +pub async fn write_default_execpolicy

(policy: &str, codex_home: P) -> anyhow::Result<()> +where + P: AsRef, +{ + let policy_dir = codex_home.as_ref().join("policy"); + tokio::fs::create_dir_all(&policy_dir).await?; + tokio::fs::write(policy_dir.join("default.codexpolicy"), policy).await?; + Ok(()) +} + +pub async fn notify_readable_sandbox( + sandbox_cwd: P, + codex_linux_sandbox_exe: Option, + service: &RunningService, +) -> anyhow::Result<()> +where + P: AsRef, + S: Service + ClientHandler, +{ + let sandbox_state = SandboxState { + sandbox_policy: SandboxPolicy::ReadOnly, + codex_linux_sandbox_exe, + sandbox_cwd: sandbox_cwd.as_ref().to_path_buf(), + }; + send_sandbox_notification(sandbox_state, service).await +} + +pub async fn notify_writable_sandbox_only_one_folder( + writable_folder: P, + codex_linux_sandbox_exe: Option, + service: &RunningService, +) -> anyhow::Result<()> +where + P: AsRef, + S: Service + ClientHandler, +{ + let sandbox_state = SandboxState { + sandbox_policy: SandboxPolicy::WorkspaceWrite { + // Note that sandbox_cwd will already be included as a writable root + // when the sandbox policy is expanded. + writable_roots: vec![], + network_access: false, + // Disable writes to temp dir because this is a test, so + // writable_folder is likely also under /tmp and we want to be + // strict about what is writable. + exclude_tmpdir_env_var: true, + exclude_slash_tmp: true, + }, + codex_linux_sandbox_exe, + sandbox_cwd: writable_folder.as_ref().to_path_buf(), + }; + send_sandbox_notification(sandbox_state, service).await +} + +async fn send_sandbox_notification( + sandbox_state: SandboxState, + service: &RunningService, +) -> anyhow::Result<()> +where + S: Service + ClientHandler, +{ + let sandbox_state_notification = CustomClientNotification::new( + MCP_SANDBOX_STATE_NOTIFICATION, + Some(serde_json::to_value(sandbox_state)?), + ); + service + .send_notification(sandbox_state_notification.into()) + .await?; + Ok(()) +} + +pub struct InteractiveClient { + pub elicitations_to_accept: HashSet, + pub elicitation_requests: Arc>>, +} + +impl ClientHandler for InteractiveClient { + fn get_info(&self) -> ClientInfo { + let capabilities = ClientCapabilities::builder().enable_elicitation().build(); + ClientInfo { + capabilities, + ..Default::default() + } + } + + fn create_elicitation( + &self, + request: CreateElicitationRequestParam, + _context: rmcp::service::RequestContext, + ) -> impl std::future::Future> + Send + '_ + { + self.elicitation_requests + .lock() + .unwrap() + .push(request.clone()); + + let accept = self.elicitations_to_accept.contains(&request.message); + async move { + if accept { + Ok(CreateElicitationResult { + action: ElicitationAction::Accept, + content: Some(json!({ "approve": true })), + }) + } else { + Ok(CreateElicitationResult { + action: ElicitationAction::Decline, + content: None, + }) + } + } + } +} diff --git a/codex-rs/exec-server/tests/suite/accept_elicitation.rs b/codex-rs/exec-server/tests/suite/accept_elicitation.rs new file mode 100644 index 000000000..2093f9a57 --- /dev/null +++ b/codex-rs/exec-server/tests/suite/accept_elicitation.rs @@ -0,0 +1,131 @@ +#![allow(clippy::unwrap_used, clippy::expect_used)] +use std::borrow::Cow; +use std::sync::Arc; +use std::sync::Mutex; + +use anyhow::Result; +use codex_exec_server::ExecResult; +use exec_server_test_support::InteractiveClient; +use exec_server_test_support::create_transport; +use exec_server_test_support::notify_readable_sandbox; +use exec_server_test_support::write_default_execpolicy; +use maplit::hashset; +use pretty_assertions::assert_eq; +use rmcp::ServiceExt; +use rmcp::model::CallToolRequestParam; +use rmcp::model::CallToolResult; +use rmcp::model::CreateElicitationRequestParam; +use rmcp::model::object; +use serde_json::json; +use std::os::unix::fs::symlink; +use tempfile::TempDir; + +/// Verify that when using a read-only sandbox and an execpolicy that prompts, +/// the proper elicitation is sent. Upon auto-approving the elicitation, the +/// command should be run privileged outside the sandbox. +#[tokio::test(flavor = "current_thread")] +async fn accept_elicitation_for_prompt_rule() -> Result<()> { + // Configure a stdio transport that will launch the MCP server using + // $CODEX_HOME with an execpolicy that prompts for `git init` commands. + let codex_home = TempDir::new()?; + write_default_execpolicy( + r#" +# Create a rule with `decision = "prompt"` to exercise the elicitation flow. +prefix_rule( + pattern = ["git", "init"], + decision = "prompt", + match = [ + "git init ." + ], +) +"#, + codex_home.as_ref(), + ) + .await?; + let transport = create_transport(codex_home.as_ref())?; + + // Create an MCP client that approves expected elicitation messages. + let project_root = TempDir::new()?; + let git = which::which("git")?; + let project_root_path = project_root.path().canonicalize().unwrap(); + let expected_elicitation_message = format!( + "Allow agent to run `{} init .` in `{}`?", + git.display(), + project_root_path.display() + ); + let elicitation_requests: Arc>> = Default::default(); + let client = InteractiveClient { + elicitations_to_accept: hashset! { expected_elicitation_message.clone() }, + elicitation_requests: elicitation_requests.clone(), + }; + + // Start the MCP server. + let service: rmcp::service::RunningService = + client.serve(transport).await?; + + // Notify the MCP server about the current sandbox state before making any + // `shell` tool calls. + let linux_sandbox_exe_folder = TempDir::new()?; + let codex_linux_sandbox_exe = if cfg!(target_os = "linux") { + let codex_linux_sandbox_exe = linux_sandbox_exe_folder.path().join("codex-linux-sandbox"); + let codex_cli = assert_cmd::Command::cargo_bin("codex")? + .get_program() + .to_os_string(); + let codex_cli_path = std::path::PathBuf::from(codex_cli); + symlink(&codex_cli_path, &codex_linux_sandbox_exe)?; + Some(codex_linux_sandbox_exe) + } else { + None + }; + notify_readable_sandbox(&project_root_path, codex_linux_sandbox_exe, &service).await?; + + // Call the shell tool and verify that an elicitation was created and + // auto-approved. + let CallToolResult { + content, is_error, .. + } = service + .call_tool(CallToolRequestParam { + name: Cow::Borrowed("shell"), + arguments: Some(object(json!( + { + "command": "git init .", + "workdir": project_root_path.to_string_lossy(), + } + ))), + }) + .await?; + let tool_call_content = content + .first() + .expect("expected non-empty content") + .as_text() + .expect("expected text content"); + let ExecResult { + exit_code, output, .. + } = serde_json::from_str::(&tool_call_content.text)?; + let git_init_succeeded = format!( + "Initialized empty Git repository in {}/.git/\n", + project_root_path.display() + ); + // Normally, this would be an exact match, but it might include extra output + // if `git config set advice.defaultBranchName false` has not been set. + assert!( + output.contains(&git_init_succeeded), + "expected output `{output}` to contain `{git_init_succeeded}`" + ); + assert_eq!(exit_code, 0, "command should succeed"); + assert_eq!(is_error, Some(false), "command should succeed"); + assert!( + project_root_path.join(".git").is_dir(), + "git repo should exist" + ); + + let elicitation_messages = elicitation_requests + .lock() + .unwrap() + .iter() + .map(|r| r.message.clone()) + .collect::>(); + assert_eq!(vec![expected_elicitation_message], elicitation_messages); + + Ok(()) +} diff --git a/codex-rs/exec-server/tests/suite/bash b/codex-rs/exec-server/tests/suite/bash new file mode 100755 index 000000000..5f5d1e559 --- /dev/null +++ b/codex-rs/exec-server/tests/suite/bash @@ -0,0 +1,75 @@ +#!/usr/bin/env dotslash + +// This is an instance of the fork of Bash that we bundle with +// https://www.npmjs.com/package/@openai/codex-shell-tool-mcp. +// Fetching the prebuilt version via DotSlash makes it easier to write +// integration tests for the MCP server. +// +// TODO(mbolin): Currently, we use a .tgz artifact that includes binaries for +// multiple platforms, but we could save a bit of space by making arch-specific +// artifacts available in the GitHub releases and referencing those here. +{ + "name": "codex-bash", + "platforms": { + // macOS 13 builds (and therefore x86_64) were dropped in + // https://github.com/openai/codex/pull/7295, so we only provide an + // Apple Silicon build for now. + "macos-aarch64": { + "size": 37003612, + "hash": "blake3", + "digest": "d9cd5928c993b65c340507931c61c02bd6e9179933f8bf26a548482bb5fa53bb", + "format": "tar.gz", + "path": "package/vendor/aarch64-apple-darwin/bash/macos-15/bash", + "providers": [ + { + "url": "https://github.com/openai/codex/releases/download/rust-v0.65.0/codex-shell-tool-mcp-npm-0.65.0.tgz" + }, + { + "type": "github-release", + "repo": "openai/codex", + "tag": "rust-v0.65.0", + "name": "codex-shell-tool-mcp-npm-0.65.0.tgz" + } + ] + }, + // Note the `musl` parts of the Linux paths are misleading: the Bash + // binaries are actually linked against `glibc`, but the + // `codex-execve-wrapper` that invokes them is linked against `musl`. + "linux-x86_64": { + "size": 37003612, + "hash": "blake3", + "digest": "d9cd5928c993b65c340507931c61c02bd6e9179933f8bf26a548482bb5fa53bb", + "format": "tar.gz", + "path": "package/vendor/x86_64-unknown-linux-musl/bash/ubuntu-24.04/bash", + "providers": [ + { + "url": "https://github.com/openai/codex/releases/download/rust-v0.65.0/codex-shell-tool-mcp-npm-0.65.0.tgz" + }, + { + "type": "github-release", + "repo": "openai/codex", + "tag": "rust-v0.65.0", + "name": "codex-shell-tool-mcp-npm-0.65.0.tgz" + } + ] + }, + "linux-aarch64": { + "size": 37003612, + "hash": "blake3", + "digest": "d9cd5928c993b65c340507931c61c02bd6e9179933f8bf26a548482bb5fa53bb", + "format": "tar.gz", + "path": "package/vendor/aarch64-unknown-linux-musl/bash/ubuntu-24.04/bash", + "providers": [ + { + "url": "https://github.com/openai/codex/releases/download/rust-v0.65.0/codex-shell-tool-mcp-npm-0.65.0.tgz" + }, + { + "type": "github-release", + "repo": "openai/codex", + "tag": "rust-v0.65.0", + "name": "codex-shell-tool-mcp-npm-0.65.0.tgz" + } + ] + }, + } +} diff --git a/codex-rs/exec-server/tests/suite/list_tools.rs b/codex-rs/exec-server/tests/suite/list_tools.rs new file mode 100644 index 000000000..17505c761 --- /dev/null +++ b/codex-rs/exec-server/tests/suite/list_tools.rs @@ -0,0 +1,76 @@ +#![allow(clippy::unwrap_used, clippy::expect_used)] +use std::borrow::Cow; +use std::fs; +use std::sync::Arc; + +use anyhow::Result; +use exec_server_test_support::create_transport; +use pretty_assertions::assert_eq; +use rmcp::ServiceExt; +use rmcp::model::Tool; +use rmcp::model::object; +use serde_json::json; +use tempfile::TempDir; + +/// Verify the list_tools call to the MCP server returns the expected response. +#[tokio::test(flavor = "current_thread")] +async fn list_tools() -> Result<()> { + let codex_home = TempDir::new()?; + let policy_dir = codex_home.path().join("policy"); + fs::create_dir_all(&policy_dir)?; + fs::write( + policy_dir.join("default.codexpolicy"), + r#"prefix_rule(pattern=["ls"], decision="prompt")"#, + )?; + let transport = create_transport(codex_home.path())?; + + let service = ().serve(transport).await?; + let tools = service.list_tools(Default::default()).await?.tools; + assert_eq!( + vec![Tool { + name: Cow::Borrowed("shell"), + title: None, + description: Some(Cow::Borrowed( + "Runs a shell command and returns its output. You MUST provide the workdir as an absolute path." + )), + input_schema: Arc::new(object(json!({ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "properties": { + "command": { + "description": "The bash string to execute.", + "type": "string", + }, + "login": { + "description": "Launch Bash with -lc instead of -c: defaults to true.", + "nullable": true, + "type": "boolean", + }, + "timeout_ms": { + "description": "The timeout for the command in milliseconds.", + "format": "uint64", + "minimum": 0, + "nullable": true, + "type": "integer", + }, + "workdir": { + "description": "The working directory to execute the command in. Must be an absolute path.", + "type": "string", + }, + }, + "required": [ + "command", + "workdir", + ], + "title": "ExecParams", + "type": "object", + }))), + output_schema: None, + annotations: None, + icons: None, + meta: None + }], + tools + ); + + Ok(()) +} diff --git a/codex-rs/exec-server/tests/suite/mod.rs b/codex-rs/exec-server/tests/suite/mod.rs new file mode 100644 index 000000000..3a94f5857 --- /dev/null +++ b/codex-rs/exec-server/tests/suite/mod.rs @@ -0,0 +1,8 @@ +// TODO(mbolin): Get this test working on Linux. Currently, it fails with: +// +// > Error: Mcp error: -32603: sandbox error: sandbox denied exec error, +// > exit code: 1, stdout: , stderr: Error: failed to send handshake datagram +#[cfg(all(target_os = "macos", target_arch = "aarch64"))] +mod accept_elicitation; +#[cfg(any(all(target_os = "macos", target_arch = "aarch64"), target_os = "linux"))] +mod list_tools;