diff --git a/AGENTS.md b/AGENTS.md index aaebd0dfd..f9f04c5b1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -75,6 +75,7 @@ If you don’t have the tool: ### Test assertions - Tests should use pretty_assertions::assert_eq for clearer diffs. Import this at the top of the test module if it isn't already. +- Prefer deep equals comparisons whenever possible. Perform `assert_eq!()` on entire objects, rather than individual fields. ### Integration tests (core) diff --git a/codex-rs/core/src/shell.rs b/codex-rs/core/src/shell.rs index ac115facb..2338f41cd 100644 --- a/codex-rs/core/src/shell.rs +++ b/codex-rs/core/src/shell.rs @@ -408,6 +408,48 @@ mod tests { } } + #[test] + fn derive_exec_args() { + let test_bash_shell = Shell { + shell_type: ShellType::Bash, + shell_path: PathBuf::from("/bin/bash"), + }; + assert_eq!( + test_bash_shell.derive_exec_args("echo hello", false), + vec!["/bin/bash", "-c", "echo hello"] + ); + assert_eq!( + test_bash_shell.derive_exec_args("echo hello", true), + vec!["/bin/bash", "-lc", "echo hello"] + ); + + let test_zsh_shell = Shell { + shell_type: ShellType::Zsh, + shell_path: PathBuf::from("/bin/zsh"), + }; + assert_eq!( + test_zsh_shell.derive_exec_args("echo hello", false), + vec!["/bin/zsh", "-c", "echo hello"] + ); + assert_eq!( + test_zsh_shell.derive_exec_args("echo hello", true), + vec!["/bin/zsh", "-lc", "echo hello"] + ); + + let test_powershell_shell = Shell { + shell_type: ShellType::PowerShell, + shell_path: PathBuf::from("pwsh.exe"), + }; + assert_eq!( + test_powershell_shell.derive_exec_args("echo hello", false), + vec!["pwsh.exe", "-NoProfile", "-Command", "echo hello"] + ); + assert_eq!( + test_powershell_shell.derive_exec_args("echo hello", true), + vec!["pwsh.exe", "-Command", "echo hello"] + ); + } + #[tokio::test] async fn test_current_shell_detects_zsh() { let shell = Command::new("sh") diff --git a/codex-rs/core/src/tools/handlers/shell.rs b/codex-rs/core/src/tools/handlers/shell.rs index cd05d126b..c3ef590e1 100644 --- a/codex-rs/core/src/tools/handlers/shell.rs +++ b/codex-rs/core/src/tools/handlers/shell.rs @@ -49,8 +49,7 @@ impl ShellCommandHandler { turn_context: &TurnContext, ) -> ExecParams { let shell = session.user_shell(); - let use_login_shell = true; - let command = shell.derive_exec_args(¶ms.command, use_login_shell); + let command = shell.derive_exec_args(¶ms.command, params.login.unwrap_or(true)); ExecParams { command, @@ -276,9 +275,15 @@ impl ShellHandler { mod tests { use std::path::PathBuf; + use codex_protocol::models::ShellCommandToolCallParams; + use pretty_assertions::assert_eq; + + use crate::codex::make_session_and_context; + use crate::exec_env::create_env; use crate::is_safe_command::is_known_safe_command; use crate::shell::Shell; use crate::shell::ShellType; + use crate::tools::handlers::ShellCommandHandler; /// The logic for is_known_safe_command() has heuristics for known shells, /// so we must ensure the commands generated by [ShellCommandHandler] can be @@ -312,4 +317,43 @@ mod tests { &shell.derive_exec_args(command, /* use_login_shell */ false) )); } + + #[test] + fn shell_command_handler_to_exec_params_uses_session_shell_and_turn_context() { + let (session, turn_context) = make_session_and_context(); + + let command = "echo hello".to_string(); + let workdir = Some("subdir".to_string()); + let login = None; + let timeout_ms = Some(1234); + let with_escalated_permissions = Some(true); + let justification = Some("because tests".to_string()); + + let expected_command = session.user_shell().derive_exec_args(&command, true); + let expected_cwd = turn_context.resolve_path(workdir.clone()); + let expected_env = create_env(&turn_context.shell_environment_policy); + + let params = ShellCommandToolCallParams { + command, + workdir, + login, + timeout_ms, + with_escalated_permissions, + justification: justification.clone(), + }; + + let exec_params = ShellCommandHandler::to_exec_params(params, &session, &turn_context); + + // ExecParams cannot derive Eq due to the CancellationToken field, so we manually compare the fields. + assert_eq!(exec_params.command, expected_command); + assert_eq!(exec_params.cwd, expected_cwd); + assert_eq!(exec_params.env, expected_env); + assert_eq!(exec_params.expiration.timeout_ms(), timeout_ms); + assert_eq!( + exec_params.with_escalated_permissions, + with_escalated_permissions + ); + assert_eq!(exec_params.justification, justification); + assert_eq!(exec_params.arg0, None); + } } diff --git a/codex-rs/core/src/tools/spec.rs b/codex-rs/core/src/tools/spec.rs index a36f54a6b..e72becd88 100644 --- a/codex-rs/core/src/tools/spec.rs +++ b/codex-rs/core/src/tools/spec.rs @@ -331,6 +331,15 @@ fn create_shell_command_tool() -> ToolSpec { description: Some("The working directory to execute the command in".to_string()), }, ); + properties.insert( + "login".to_string(), + JsonSchema::Boolean { + description: Some( + "Whether to run the shell with login shell semantics. Defaults to true." + .to_string(), + ), + }, + ); properties.insert( "timeout_ms".to_string(), JsonSchema::Number { diff --git a/codex-rs/core/tests/common/lib.rs b/codex-rs/core/tests/common/lib.rs index e7b1e71ef..2c8c28d71 100644 --- a/codex-rs/core/tests/common/lib.rs +++ b/codex-rs/core/tests/common/lib.rs @@ -369,3 +369,15 @@ macro_rules! skip_if_no_network { } }}; } + +#[macro_export] +macro_rules! skip_if_windows { + ($return_value:expr $(,)?) => {{ + if cfg!(target_os = "windows") { + println!( + "Skipping test because it cannot execute when network is disabled in a Codex sandbox." + ); + return $return_value; + } + }}; +} diff --git a/codex-rs/core/tests/suite/mod.rs b/codex-rs/core/tests/suite/mod.rs index 86f417801..e2d78004a 100644 --- a/codex-rs/core/tests/suite/mod.rs +++ b/codex-rs/core/tests/suite/mod.rs @@ -46,6 +46,7 @@ mod review; mod rmcp_client; mod rollout_list_find; mod seatbelt; +mod shell_command; mod shell_serialization; mod stream_error_allows_next_turn; mod stream_no_completed; diff --git a/codex-rs/core/tests/suite/shell_command.rs b/codex-rs/core/tests/suite/shell_command.rs new file mode 100644 index 000000000..10e972b3d --- /dev/null +++ b/codex-rs/core/tests/suite/shell_command.rs @@ -0,0 +1,174 @@ +use anyhow::Result; +use core_test_support::assert_regex_match; +use core_test_support::responses::ev_assistant_message; +use core_test_support::responses::ev_completed; +use core_test_support::responses::ev_function_call; +use core_test_support::responses::ev_response_created; +use core_test_support::responses::mount_sse_sequence; +use core_test_support::responses::sse; +use core_test_support::skip_if_no_network; +use core_test_support::skip_if_windows; +use core_test_support::test_codex::TestCodexBuilder; +use core_test_support::test_codex::TestCodexHarness; +use core_test_support::test_codex::test_codex; +use serde_json::json; + +fn shell_responses(call_id: &str, command: &str, login: Option) -> Vec { + let args = json!({ + "command": command, + "timeout_ms": 2_000, + "login": login, + }); + + #[allow(clippy::expect_used)] + let arguments = serde_json::to_string(&args).expect("serialize shell command arguments"); + + vec![ + sse(vec![ + ev_response_created("resp-1"), + ev_function_call(call_id, "shell_command", &arguments), + ev_completed("resp-1"), + ]), + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ] +} + +async fn shell_command_harness_with( + configure: impl FnOnce(TestCodexBuilder) -> TestCodexBuilder, +) -> Result { + let builder = configure(test_codex()).with_config(|config| { + config.include_apply_patch_tool = true; + }); + TestCodexHarness::with_builder(builder).await +} + +async fn mount_shell_responses( + harness: &TestCodexHarness, + call_id: &str, + command: &str, + login: Option, +) { + mount_sse_sequence(harness.server(), shell_responses(call_id, command, login)).await; +} + +fn assert_shell_command_output(output: &str, expected: &str) -> Result<()> { + let normalized_output = output + .replace("\r\n", "\n") + .replace('\r', "\n") + .trim_end_matches('\n') + .to_string(); + + let expected_pattern = format!( + r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nOutput:\n{expected}\n?$" + ); + + assert_regex_match(&expected_pattern, &normalized_output); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shell_command_works() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let harness = shell_command_harness_with(|builder| builder.with_model("gpt-5.1")).await?; + + let call_id = "shell-command-call"; + mount_shell_responses(&harness, call_id, "echo 'hello, world'", None).await; + harness.submit("run the echo command").await?; + + let output = harness.function_call_stdout(call_id).await; + assert_shell_command_output(&output, "hello, world")?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn output_with_login() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let harness = shell_command_harness_with(|builder| builder.with_model("gpt-5.1")).await?; + + let call_id = "shell-command-call-login-true"; + mount_shell_responses(&harness, call_id, "echo 'hello, world'", Some(true)).await; + harness.submit("run the echo command with login").await?; + + let output = harness.function_call_stdout(call_id).await; + assert_shell_command_output(&output, "hello, world")?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn output_without_login() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let harness = shell_command_harness_with(|builder| builder.with_model("gpt-5.1")).await?; + + let call_id = "shell-command-call-login-false"; + mount_shell_responses(&harness, call_id, "echo 'hello, world'", Some(false)).await; + harness.submit("run the echo command without login").await?; + + let output = harness.function_call_stdout(call_id).await; + assert_shell_command_output(&output, "hello, world")?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn multi_line_output_with_login() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let harness = shell_command_harness_with(|builder| builder.with_model("gpt-5.1")).await?; + + let call_id = "shell-command-call-first-extra-login"; + mount_shell_responses( + &harness, + call_id, + "echo 'first line\nsecond line'", + Some(true), + ) + .await; + harness.submit("run the command with login").await?; + + let output = harness.function_call_stdout(call_id).await; + assert_shell_command_output(&output, "first line\nsecond line")?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn pipe_output_with_login() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + skip_if_windows!(Ok(())); + + let harness = shell_command_harness_with(|builder| builder.with_model("gpt-5.1")).await?; + + let call_id = "shell-command-call-second-extra-no-login"; + mount_shell_responses(&harness, call_id, "echo 'hello, world' | cat", None).await; + harness.submit("run the command without login").await?; + + let output = harness.function_call_stdout(call_id).await; + assert_shell_command_output(&output, "hello, world")?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn pipe_output_without_login() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + skip_if_windows!(Ok(())); + + let harness = shell_command_harness_with(|builder| builder.with_model("gpt-5.1")).await?; + + let call_id = "shell-command-call-third-extra-login-false"; + mount_shell_responses(&harness, call_id, "echo 'hello, world' | cat", Some(false)).await; + harness.submit("run the command without login").await?; + + let output = harness.function_call_stdout(call_id).await; + assert_shell_command_output(&output, "hello, world")?; + + Ok(()) +} diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs index daf98152d..f93c157b7 100644 --- a/codex-rs/protocol/src/models.rs +++ b/codex-rs/protocol/src/models.rs @@ -348,6 +348,9 @@ pub struct ShellCommandToolCallParams { pub command: String, pub workdir: Option, + /// Whether to run the shell with login shell semantics + #[serde(skip_serializing_if = "Option::is_none")] + pub login: Option, /// This is the maximum time in milliseconds that the command is allowed to run. #[serde(alias = "timeout")] pub timeout_ms: Option,