From a8b4b569fbc92e910a9906178c530fdf4aa9a2ad Mon Sep 17 00:00:00 2001 From: Dylan Hurd Date: Fri, 20 Feb 2026 22:02:35 -0800 Subject: [PATCH] fix(core) Filter non-matching prefix rules (#12314) ## Summary `gpt-5.3-codex` really likes to write complicated shell scripts, and suggest a partial prefix_rule that wouldn't actually approve the command. We should only show the `prefix_rule` suggestion from the model if it would actually fully approve the command the user is seeing. This will technically cause more instances of overly-specific suggestions when we fallback, but I think the UX is clearer, particularly when the model doesn't necessarily understand the current limitations of execpolicy parsing. ## Testing - [x] Add unit tests - [x] Add integration tests --- codex-rs/core/src/exec_policy.rs | 113 ++++++++++++++-- codex-rs/core/tests/suite/approvals.rs | 178 +++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 13 deletions(-) diff --git a/codex-rs/core/src/exec_policy.rs b/codex-rs/core/src/exec_policy.rs index f2cedb3f9..52329a255 100644 --- a/codex-rs/core/src/exec_policy.rs +++ b/codex-rs/core/src/exec_policy.rs @@ -224,6 +224,9 @@ impl ExecPolicyManager { let requested_amendment = derive_requested_execpolicy_amendment_from_prefix_rule( prefix_rule.as_ref(), &evaluation.matched_rules, + exec_policy.as_ref(), + &commands, + &exec_policy_fallback, ); match evaluation.decision { @@ -592,6 +595,9 @@ fn try_derive_execpolicy_amendment_for_allow_rules( fn derive_requested_execpolicy_amendment_from_prefix_rule( prefix_rule: Option<&Vec>, matched_rules: &[RuleMatch], + exec_policy: &Policy, + commands: &[Vec], + exec_policy_fallback: &impl Fn(&[String]) -> Decision, ) -> Option { let prefix_rule = prefix_rule?; if prefix_rule.is_empty() { @@ -612,7 +618,39 @@ fn derive_requested_execpolicy_amendment_from_prefix_rule( return None; } - Some(ExecPolicyAmendment::new(prefix_rule.clone())) + let amendment = ExecPolicyAmendment::new(prefix_rule.clone()); + if prefix_rule_would_approve_all_commands( + exec_policy, + &amendment.command, + commands, + exec_policy_fallback, + ) { + Some(amendment) + } else { + None + } +} + +fn prefix_rule_would_approve_all_commands( + exec_policy: &Policy, + prefix_rule: &[String], + commands: &[Vec], + exec_policy_fallback: &impl Fn(&[String]) -> Decision, +) -> bool { + let mut policy_with_prefix_rule = exec_policy.clone(); + if policy_with_prefix_rule + .add_prefix_rule(prefix_rule, Decision::Allow) + .is_err() + { + return false; + } + + commands.iter().all(|command| { + policy_with_prefix_rule + .check(command, exec_policy_fallback) + .decision + == Decision::Allow + }) } /// Only return a reason when a policy rule drove the prompt decision. @@ -1125,7 +1163,7 @@ prefix_rule(pattern=["rm"], decision="forbidden") } #[tokio::test] - async fn keeps_requested_amendment_for_heredoc_fallback_prompts() { + async fn drops_requested_amendment_for_heredoc_fallback_prompts_when_it_wont_match() { let command = vec![ "bash".to_string(), "-lc".to_string(), @@ -1147,7 +1185,7 @@ prefix_rule(pattern=["rm"], decision="forbidden") requirement, ExecApprovalRequirement::NeedsApproval { reason: None, - proposed_execpolicy_amendment: Some(ExecPolicyAmendment::new(requested_prefix)), + proposed_execpolicy_amendment: None, } ); } @@ -1472,6 +1510,38 @@ prefix_rule( ); } + #[tokio::test] + async fn request_rule_falls_back_when_prefix_rule_does_not_approve_all_commands() { + let command = vec![ + "bash".to_string(), + "-lc".to_string(), + "cargo install cargo-insta && rm -rf /tmp/codex".to_string(), + ]; + let manager = ExecPolicyManager::default(); + + let requirement = manager + .create_exec_approval_requirement_for_command(ExecApprovalRequest { + command: &command, + approval_policy: AskForApproval::OnRequest, + sandbox_policy: &SandboxPolicy::DangerFullAccess, + sandbox_permissions: SandboxPermissions::RequireEscalated, + prefix_rule: Some(vec!["cargo".to_string(), "install".to_string()]), + }) + .await; + + assert_eq!( + requirement, + ExecApprovalRequirement::NeedsApproval { + reason: None, + proposed_execpolicy_amendment: Some(ExecPolicyAmendment::new(vec![ + "rm".to_string(), + "-rf".to_string(), + "/tmp/codex".to_string(), + ])), + } + ); + } + #[tokio::test] async fn heuristics_apply_when_other_commands_match_policy() { let policy_src = r#"prefix_rule(pattern=["apple"], decision="allow")"#; @@ -1728,11 +1798,28 @@ prefix_rule( ); } + fn derive_requested_execpolicy_amendment_for_test( + prefix_rule: Option<&Vec>, + matched_rules: &[RuleMatch], + ) -> Option { + let commands = prefix_rule + .cloned() + .map(|prefix_rule| vec![prefix_rule]) + .unwrap_or_else(|| vec![vec!["echo".to_string()]]); + derive_requested_execpolicy_amendment_from_prefix_rule( + prefix_rule, + matched_rules, + &Policy::empty(), + &commands, + &|_: &[String]| Decision::Allow, + ) + } + #[test] fn derive_requested_execpolicy_amendment_returns_none_for_missing_prefix_rule() { assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule(None, &[]) + derive_requested_execpolicy_amendment_for_test(None, &[]) ); } @@ -1740,7 +1827,7 @@ prefix_rule( fn derive_requested_execpolicy_amendment_returns_none_for_empty_prefix_rule() { assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule(Some(&Vec::new()), &[]) + derive_requested_execpolicy_amendment_for_test(Some(&Vec::new()), &[]) ); } @@ -1748,7 +1835,7 @@ prefix_rule( fn derive_requested_execpolicy_amendment_returns_none_for_exact_banned_prefix_rule() { assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule( + derive_requested_execpolicy_amendment_for_test( Some(&vec!["python".to_string(), "-c".to_string()]), &[], ) @@ -1767,7 +1854,7 @@ prefix_rule( ] { assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule(Some(&prefix_rule), &[]) + derive_requested_execpolicy_amendment_for_test(Some(&prefix_rule), &[]) ); } } @@ -1793,7 +1880,7 @@ prefix_rule( ] { assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule(Some(&prefix_rule), &[]) + derive_requested_execpolicy_amendment_for_test(Some(&prefix_rule), &[]) ); } } @@ -1808,7 +1895,7 @@ prefix_rule( assert_eq!( Some(ExecPolicyAmendment::new(prefix_rule.clone())), - derive_requested_execpolicy_amendment_from_prefix_rule(Some(&prefix_rule), &[]) + derive_requested_execpolicy_amendment_for_test(Some(&prefix_rule), &[]) ); } @@ -1823,7 +1910,7 @@ prefix_rule( }]; assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule( + derive_requested_execpolicy_amendment_for_test( Some(&prefix_rule), &matched_rules_prompt ), @@ -1836,7 +1923,7 @@ prefix_rule( }]; assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule( + derive_requested_execpolicy_amendment_for_test( Some(&prefix_rule), &matched_rules_allow ), @@ -1849,9 +1936,9 @@ prefix_rule( }]; assert_eq!( None, - derive_requested_execpolicy_amendment_from_prefix_rule( + derive_requested_execpolicy_amendment_for_test( Some(&prefix_rule), - &matched_rules_forbidden + &matched_rules_forbidden, ), "should return none when prompt policy matches" ); diff --git a/codex-rs/core/tests/suite/approvals.rs b/codex-rs/core/tests/suite/approvals.rs index 14d9c79e5..374cc1ada 100644 --- a/codex-rs/core/tests/suite/approvals.rs +++ b/codex-rs/core/tests/suite/approvals.rs @@ -184,6 +184,16 @@ fn shell_event( command: &str, timeout_ms: u64, sandbox_permissions: SandboxPermissions, +) -> Result { + shell_event_with_prefix_rule(call_id, command, timeout_ms, sandbox_permissions, None) +} + +fn shell_event_with_prefix_rule( + call_id: &str, + command: &str, + timeout_ms: u64, + sandbox_permissions: SandboxPermissions, + prefix_rule: Option>, ) -> Result { let mut args = json!({ "command": command, @@ -192,6 +202,9 @@ fn shell_event( if sandbox_permissions.requires_escalated_permissions() { args["sandbox_permissions"] = json!(sandbox_permissions); } + if let Some(prefix_rule) = prefix_rule { + args["prefix_rule"] = json!(prefix_rule); + } let args_str = serde_json::to_string(&args)?; Ok(ev_function_call(call_id, "shell_command", &args_str)) } @@ -1928,6 +1941,171 @@ async fn approving_execpolicy_amendment_persists_policy_and_skips_future_prompts Ok(()) } +#[tokio::test(flavor = "current_thread")] +#[cfg(unix)] +async fn invalid_requested_prefix_rule_falls_back_for_compound_command() -> Result<()> { + let server = start_mock_server().await; + let approval_policy = AskForApproval::OnRequest; + let sandbox_policy = SandboxPolicy::new_read_only_policy(); + let sandbox_policy_for_config = sandbox_policy.clone(); + let mut builder = test_codex().with_config(move |config| { + config.permissions.approval_policy = Constrained::allow_any(approval_policy); + config.permissions.sandbox_policy = Constrained::allow_any(sandbox_policy_for_config); + }); + let test = builder.build(&server).await?; + + let call_id = "invalid-prefix-rule"; + let command = + "touch /tmp/codex-fallback-rule-test.txt && echo hello > /tmp/codex-fallback-rule-test.txt"; + let event = shell_event_with_prefix_rule( + call_id, + command, + 1_000, + SandboxPermissions::RequireEscalated, + Some(vec!["touch".to_string()]), + )?; + + let _ = mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-invalid-prefix-1"), + event, + ev_completed("resp-invalid-prefix-1"), + ]), + ) + .await; + + submit_turn( + &test, + "invalid-prefix-rule", + approval_policy, + sandbox_policy.clone(), + ) + .await?; + + let approval = expect_exec_approval(&test, command).await; + let amendment = approval + .proposed_execpolicy_amendment + .expect("should have a proposed execpolicy amendment"); + assert!(amendment.command.contains(&command.to_string())); + + Ok(()) +} + +#[tokio::test(flavor = "current_thread")] +#[cfg(unix)] +async fn approving_fallback_rule_for_compound_command_works() -> Result<()> { + let server = start_mock_server().await; + let approval_policy = AskForApproval::OnRequest; + let sandbox_policy = SandboxPolicy::new_read_only_policy(); + let sandbox_policy_for_config = sandbox_policy.clone(); + let mut builder = test_codex().with_config(move |config| { + config.permissions.approval_policy = Constrained::allow_any(approval_policy); + config.permissions.sandbox_policy = Constrained::allow_any(sandbox_policy_for_config); + }); + let test = builder.build(&server).await?; + + let call_id = "invalid-prefix-rule"; + let command = + "touch /tmp/codex-fallback-rule-test.txt && echo hello > /tmp/codex-fallback-rule-test.txt"; + let event = shell_event_with_prefix_rule( + call_id, + command, + 1_000, + SandboxPermissions::RequireEscalated, + Some(vec!["touch".to_string()]), + )?; + + let _ = mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-invalid-prefix-1"), + event, + ev_completed("resp-invalid-prefix-1"), + ]), + ) + .await; + + submit_turn( + &test, + "invalid-prefix-rule", + approval_policy, + sandbox_policy.clone(), + ) + .await?; + + let approval = expect_exec_approval(&test, command).await; + let approval_id = approval.effective_approval_id(); + let amendment = approval + .proposed_execpolicy_amendment + .expect("should have a proposed execpolicy amendment"); + assert!(amendment.command.contains(&command.to_string())); + + test.codex + .submit(Op::ExecApproval { + id: approval_id, + turn_id: None, + decision: ReviewDecision::ApprovedExecpolicyAmendment { + proposed_execpolicy_amendment: amendment.clone(), + }, + }) + .await?; + wait_for_completion(&test).await; + + let call_id = "invalid-prefix-rule-again"; + let command = + "touch /tmp/codex-fallback-rule-test.txt && echo hello > /tmp/codex-fallback-rule-test.txt"; + let event = shell_event_with_prefix_rule( + call_id, + command, + 1_000, + SandboxPermissions::RequireEscalated, + Some(vec!["touch".to_string()]), + )?; + + let _ = mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-invalid-prefix-1"), + event, + ev_completed("resp-invalid-prefix-1"), + ]), + ) + .await; + let second_results = mount_sse_once( + &server, + sse(vec![ + ev_assistant_message("msg-invalid-prefix-1", "done"), + ev_completed("resp-invalid-prefix-2"), + ]), + ) + .await; + + submit_turn( + &test, + "invalid-prefix-rule", + approval_policy, + sandbox_policy.clone(), + ) + .await?; + + wait_for_completion_without_approval(&test).await; + + let second_output = parse_result( + &second_results + .single_request() + .function_call_output(call_id), + ); + assert_eq!(second_output.exit_code.unwrap_or(0), 0); + assert!( + second_output.stdout.is_empty(), + "unexpected stdout: {}", + second_output.stdout + ); + + Ok(()) +} + // todo(dylan) add ScenarioSpec support for rules #[tokio::test(flavor = "current_thread")] #[cfg(unix)]