Stabilize plan item app-server tests (#14058)

## What changed - run the two plan-mode app-server tests on a multi-thread Tokio runtime instead of the default single-thread test runtime - stop relying on wiremock teardown expectations for `/responses` and explicitly wait for the expected request count after the turn completes ## Why this fixes the flake - this failure was showing up on Windows ARM as a late wiremock panic saying the mock server saw zero `/responses` calls, but the real issue was that the test could stall around app-server startup and only fail during teardown - moving these tests to the same multi-thread runtime used by the other collaboration-mode app-server tests removes that startup scheduling race - asserting the `/responses` count directly makes the test deterministic: we now wait for the real POST instead of depending on a drop-time verification that can hide the underlying timing issue ## Scope - test-only change; no production logic changes
2026-03-09 10:24:18 -07:00 · 2026-03-09 10:24:18 -07:00 · 6b68d1ef66
commit 6b68d1ef66
parent 5d9db0f995
1 changed files with 40 additions and 5 deletions
--- a/codex-rs/app-server/tests/suite/v2/plan_item.rs
+++ b/codex-rs/app-server/tests/suite/v2/plan_item.rs
@ -1,7 +1,8 @@
 use anyhow::Result;
 use anyhow::anyhow;
+use anyhow::bail;
 use app_test_support::McpProcess;
-use app_test_support::create_mock_responses_server_sequence;
+use app_test_support::create_mock_responses_server_sequence_unchecked;
 use app_test_support::to_response;
 use codex_app_server_protocol::ItemCompletedNotification;
 use codex_app_server_protocol::ItemStartedNotification;
@ -28,11 +29,13 @@ use pretty_assertions::assert_eq;
 use std::collections::BTreeMap;
 use std::path::Path;
 use tempfile::TempDir;
+use tokio::time::sleep;
 use tokio::time::timeout;
+use wiremock::MockServer;

 const DEFAULT_READ_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);

-#[tokio::test]
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
 async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> {
    skip_if_no_network!(Ok(()));

@ -45,7 +48,7 @@ async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> {
        responses::ev_assistant_message("msg-1", &full_message),
        responses::ev_completed("resp-1"),
    ])];
-    let server = create_mock_responses_server_sequence(responses).await;
+    let server = create_mock_responses_server_sequence_unchecked(responses).await;

    let codex_home = TempDir::new()?;
    create_config_toml(codex_home.path(), &server.uri())?;
@ -56,6 +59,7 @@ async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> {
    let turn = start_plan_mode_turn(&mut mcp).await?;
    let (_, completed_items, plan_deltas, turn_completed) =
        collect_turn_notifications(&mut mcp).await?;
+    wait_for_responses_request_count(&server, 1).await?;

    assert_eq!(turn_completed.turn.id, turn.id);
    assert_eq!(turn_completed.turn.status, TurnStatus::Completed);
@ -93,7 +97,7 @@ async fn plan_mode_uses_proposed_plan_block_for_plan_item() -> Result<()> {
    Ok(())
 }

-#[tokio::test]
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
 async fn plan_mode_without_proposed_plan_does_not_emit_plan_item() -> Result<()> {
    skip_if_no_network!(Ok(()));

@ -102,7 +106,7 @@ async fn plan_mode_without_proposed_plan_does_not_emit_plan_item() -> Result<()>
        responses::ev_assistant_message("msg-1", "Done"),
        responses::ev_completed("resp-1"),
    ])];
-    let server = create_mock_responses_server_sequence(responses).await;
+    let server = create_mock_responses_server_sequence_unchecked(responses).await;

    let codex_home = TempDir::new()?;
    create_config_toml(codex_home.path(), &server.uri())?;
@ -112,6 +116,7 @@ async fn plan_mode_without_proposed_plan_does_not_emit_plan_item() -> Result<()>

    let _turn = start_plan_mode_turn(&mut mcp).await?;
    let (_, completed_items, plan_deltas, _) = collect_turn_notifications(&mut mcp).await?;
+    wait_for_responses_request_count(&server, 1).await?;

    let has_plan_item = completed_items
        .iter()
@ -214,6 +219,36 @@ async fn collect_turn_notifications(
    }
 }

+async fn wait_for_responses_request_count(
+    server: &MockServer,
+    expected_count: usize,
+) -> Result<()> {
+    timeout(DEFAULT_READ_TIMEOUT, async {
+        loop {
+            let Some(requests) = server.received_requests().await else {
+                bail!("wiremock did not record requests");
+            };
+            let responses_request_count = requests
+                .iter()
+                .filter(|request| {
+                    request.method == "POST" && request.url.path().ends_with("/responses")
+                })
+                .count();
+            if responses_request_count == expected_count {
+                return Ok::<(), anyhow::Error>(());
+            }
+            if responses_request_count > expected_count {
+                bail!(
+                    "expected exactly {expected_count} /responses requests, got {responses_request_count}"
+                );
+            }
+            sleep(std::time::Duration::from_millis(10)).await;
+        }
+    })
+    .await??;
+    Ok(())
+}
+
 fn create_config_toml(codex_home: &Path, server_uri: &str) -> std::io::Result<()> {
    let features = BTreeMap::from([(Feature::CollaborationModes, true)]);
    let feature_entries = features