From 6b68d1ef661263b0fa6bf9b1e1badffebfd64ee9 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 9 Mar 2026 10:24:18 -0700 Subject: [PATCH] Stabilize plan item app-server tests (#14058) ## What changed - run the two plan-mode app-server tests on a multi-thread Tokio runtime instead of the default single-thread test runtime - stop relying on wiremock teardown expectations for `/responses` and explicitly wait for the expected request count after the turn completes ## Why this fixes the flake - this failure was showing up on Windows ARM as a late wiremock panic saying the mock server saw zero `/responses` calls, but the real issue was that the test could stall around app-server startup and only fail during teardown - moving these tests to the same multi-thread runtime used by the other collaboration-mode app-server tests removes that startup scheduling race - asserting the `/responses` count directly makes the test deterministic: we now wait for the real POST instead of depending on a drop-time verification that can hide the underlying timing issue ## Scope - test-only change; no production logic changes --- .../app-server/tests/suite/v2/plan_item.rs | 45 ++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/codex-rs/app-server/tests/suite/v2/plan_item.rs b/codex-rs/app-server/tests/suite/v2/plan_item.rs index 881bbb54c..58471f434 100644 --- a/codex-rs/app-server/tests/suite/v2/plan_item.rs +++ b/codex-rs/app-server/tests/suite/v2/plan_item.rs @@ -1,7 +1,8 @@ use anyhow::Result; use anyhow::anyhow; +use anyhow::bail; use app_test_support::McpProcess; -use app_test_support::create_mock_responses_server_sequence; +use app_test_support::create_mock_responses_server_sequence_unchecked; use app_test_support::to_response; use codex_app_server_protocol::ItemCompletedNotification; use codex_app_server_protocol::ItemStartedNotification; @@ -28,11 +29,13 @@ use pretty_assertions::assert_eq; use std::collections::BTreeMap; use std::path::Path; use tempfile::TempDir; +use tokio::time::sleep; use tokio::time::timeout; +use wiremock::MockServer; const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10); -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> { skip_if_no_network!(Ok(())); @@ -45,7 +48,7 @@ async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> { responses::ev_assistant_message("msg-1", &full_message), responses::ev_completed("resp-1"), ])]; - let server = create_mock_responses_server_sequence(responses).await; + let server = create_mock_responses_server_sequence_unchecked(responses).await; let codex_home = TempDir::new()?; create_config_toml(codex_home.path(), &server.uri())?; @@ -56,6 +59,7 @@ async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> { let turn = start_plan_mode_turn(&mut mcp).await?; let (_, completed_items, plan_deltas, turn_completed) = collect_turn_notifications(&mut mcp).await?; + wait_for_responses_request_count(&server, 1).await?; assert_eq!(turn_completed.turn.id, turn.id); assert_eq!(turn_completed.turn.status, TurnStatus::Completed); @@ -93,7 +97,7 @@ async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> { Ok(()) } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn plan_mode_without_proposed_plan_does_not_emit_plan_item() -> Result<()> { skip_if_no_network!(Ok(())); @@ -102,7 +106,7 @@ async fn plan_mode_without_proposed_plan_does_not_emit_plan_item() -> Result<()> responses::ev_assistant_message("msg-1", "Done"), responses::ev_completed("resp-1"), ])]; - let server = create_mock_responses_server_sequence(responses).await; + let server = create_mock_responses_server_sequence_unchecked(responses).await; let codex_home = TempDir::new()?; create_config_toml(codex_home.path(), &server.uri())?; @@ -112,6 +116,7 @@ async fn plan_mode_without_proposed_plan_does_not_emit_plan_item() -> Result<()> let _turn = start_plan_mode_turn(&mut mcp).await?; let (_, completed_items, plan_deltas, _) = collect_turn_notifications(&mut mcp).await?; + wait_for_responses_request_count(&server, 1).await?; let has_plan_item = completed_items .iter() @@ -214,6 +219,36 @@ async fn collect_turn_notifications( } } +async fn wait_for_responses_request_count( + server: &MockServer, + expected_count: usize, +) -> Result<()> { + timeout(DEFAULT_READ_TIMEOUT, async { + loop { + let Some(requests) = server.received_requests().await else { + bail!("wiremock did not record requests"); + }; + let responses_request_count = requests + .iter() + .filter(|request| { + request.method == "POST" && request.url.path().ends_with("/responses") + }) + .count(); + if responses_request_count == expected_count { + return Ok::<(), anyhow::Error>(()); + } + if responses_request_count > expected_count { + bail!( + "expected exactly {expected_count} /responses requests, got {responses_request_count}" + ); + } + sleep(std::time::Duration::from_millis(10)).await; + } + }) + .await??; + Ok(()) +} + fn create_config_toml(codex_home: &Path, server_uri: &str) -> std::io::Result<()> { let features = BTreeMap::from([(Feature::CollaborationModes, true)]); let feature_entries = features