From 9890ceb939948d77b0f004d91163fb7c3cb34bd8 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Thu, 13 Nov 2025 16:59:31 -0800 Subject: [PATCH] Avoid double truncation (#6631) 1. Avoid double truncation by giving 10% above the tool default constant 2. Add tests that fails when const = 1 --- codex-rs/core/src/context_manager/history.rs | 19 +++- .../core/src/context_manager/history_tests.rs | 40 ++++++-- codex-rs/core/src/context_manager/mod.rs | 2 + codex-rs/core/src/context_manager/truncate.rs | 54 ++++++---- codex-rs/core/src/tools/mod.rs | 4 +- codex-rs/core/tests/suite/truncation.rs | 98 +++++++++++++++++-- codex-rs/core/tests/suite/user_shell_cmd.rs | 82 ++++++++++++++++ 7 files changed, 264 insertions(+), 35 deletions(-) diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs index 07ff54ad1..8f8200caa 100644 --- a/codex-rs/core/src/context_manager/history.rs +++ b/codex-rs/core/src/context_manager/history.rs @@ -1,5 +1,6 @@ use crate::codex::TurnContext; use crate::context_manager::normalize; +use crate::context_manager::truncate; use crate::context_manager::truncate::format_output_for_model_body; use crate::context_manager::truncate::globally_truncate_function_output_items; use codex_protocol::models::FunctionCallOutputPayload; @@ -9,6 +10,12 @@ use codex_protocol::protocol::TokenUsageInfo; use codex_utils_tokenizer::Tokenizer; use std::ops::Deref; +const CONTEXT_WINDOW_HARD_LIMIT_FACTOR: f64 = 1.1; +const CONTEXT_WINDOW_HARD_LIMIT_BYTES: usize = + (truncate::MODEL_FORMAT_MAX_BYTES as f64 * CONTEXT_WINDOW_HARD_LIMIT_FACTOR) as usize; +const CONTEXT_WINDOW_HARD_LIMIT_LINES: usize = + (truncate::MODEL_FORMAT_MAX_LINES as f64 * CONTEXT_WINDOW_HARD_LIMIT_FACTOR) as usize; + /// Transcript of conversation history #[derive(Debug, Clone, Default)] pub(crate) struct ContextManager { @@ -146,7 +153,11 @@ impl ContextManager { fn process_item(item: &ResponseItem) -> ResponseItem { match item { ResponseItem::FunctionCallOutput { call_id, output } => { - let truncated = format_output_for_model_body(output.content.as_str()); + let truncated = format_output_for_model_body( + output.content.as_str(), + CONTEXT_WINDOW_HARD_LIMIT_BYTES, + CONTEXT_WINDOW_HARD_LIMIT_LINES, + ); let truncated_items = output .content_items .as_ref() @@ -161,7 +172,11 @@ impl ContextManager { } } ResponseItem::CustomToolCallOutput { call_id, output } => { - let truncated = format_output_for_model_body(output); + let truncated = format_output_for_model_body( + output, + CONTEXT_WINDOW_HARD_LIMIT_BYTES, + CONTEXT_WINDOW_HARD_LIMIT_LINES, + ); ResponseItem::CustomToolCallOutput { call_id: call_id.clone(), output: truncated, diff --git a/codex-rs/core/src/context_manager/history_tests.rs b/codex-rs/core/src/context_manager/history_tests.rs index ae3e0368f..a1aeacd32 100644 --- a/codex-rs/core/src/context_manager/history_tests.rs +++ b/codex-rs/core/src/context_manager/history_tests.rs @@ -1,4 +1,5 @@ use super::*; +use crate::context_manager::MODEL_FORMAT_MAX_LINES; use crate::context_manager::truncate; use codex_git::GhostCommit; use codex_protocol::models::ContentItem; @@ -308,8 +309,10 @@ fn assert_truncated_message_matches(message: &str, line: &str, total_lines: usiz } fn truncated_message_pattern(line: &str, total_lines: usize) -> String { - let head_take = truncate::MODEL_FORMAT_HEAD_LINES.min(total_lines); - let tail_take = truncate::MODEL_FORMAT_TAIL_LINES.min(total_lines.saturating_sub(head_take)); + let head_lines = MODEL_FORMAT_MAX_LINES / 2; + let tail_lines = MODEL_FORMAT_MAX_LINES - head_lines; + let head_take = head_lines.min(total_lines); + let tail_take = tail_lines.min(total_lines.saturating_sub(head_take)); let omitted = total_lines.saturating_sub(head_take + tail_take); let escaped_line = regex_lite::escape(line); if omitted == 0 { @@ -328,7 +331,11 @@ fn format_exec_output_truncates_large_error() { let line = "very long execution error line that should trigger truncation\n"; let large_error = line.repeat(2_500); // way beyond both byte and line limits - let truncated = truncate::format_output_for_model_body(&large_error); + let truncated = truncate::format_output_for_model_body( + &large_error, + truncate::MODEL_FORMAT_MAX_BYTES, + truncate::MODEL_FORMAT_MAX_LINES, + ); let total_lines = large_error.lines().count(); assert_truncated_message_matches(&truncated, line, total_lines); @@ -338,7 +345,11 @@ fn format_exec_output_truncates_large_error() { #[test] fn format_exec_output_marks_byte_truncation_without_omitted_lines() { let long_line = "a".repeat(truncate::MODEL_FORMAT_MAX_BYTES + 50); - let truncated = truncate::format_output_for_model_body(&long_line); + let truncated = truncate::format_output_for_model_body( + &long_line, + truncate::MODEL_FORMAT_MAX_BYTES, + truncate::MODEL_FORMAT_MAX_LINES, + ); assert_ne!(truncated, long_line); let marker_line = format!( @@ -359,7 +370,14 @@ fn format_exec_output_marks_byte_truncation_without_omitted_lines() { fn format_exec_output_returns_original_when_within_limits() { let content = "example output\n".repeat(10); - assert_eq!(truncate::format_output_for_model_body(&content), content); + assert_eq!( + truncate::format_output_for_model_body( + &content, + truncate::MODEL_FORMAT_MAX_BYTES, + truncate::MODEL_FORMAT_MAX_LINES + ), + content + ); } #[test] @@ -369,7 +387,11 @@ fn format_exec_output_reports_omitted_lines_and_keeps_head_and_tail() { .map(|idx| format!("line-{idx}\n")) .collect(); - let truncated = truncate::format_output_for_model_body(&content); + let truncated = truncate::format_output_for_model_body( + &content, + truncate::MODEL_FORMAT_MAX_BYTES, + truncate::MODEL_FORMAT_MAX_LINES, + ); let omitted = total_lines - truncate::MODEL_FORMAT_MAX_LINES; let expected_marker = format!("[... omitted {omitted} of {total_lines} lines ...]"); @@ -397,7 +419,11 @@ fn format_exec_output_prefers_line_marker_when_both_limits_exceeded() { .map(|idx| format!("line-{idx}-{long_line}\n")) .collect(); - let truncated = truncate::format_output_for_model_body(&content); + let truncated = truncate::format_output_for_model_body( + &content, + truncate::MODEL_FORMAT_MAX_BYTES, + truncate::MODEL_FORMAT_MAX_LINES, + ); assert!( truncated.contains("[... omitted 42 of 298 lines ...]"), diff --git a/codex-rs/core/src/context_manager/mod.rs b/codex-rs/core/src/context_manager/mod.rs index 8531e443f..d945cb95b 100644 --- a/codex-rs/core/src/context_manager/mod.rs +++ b/codex-rs/core/src/context_manager/mod.rs @@ -3,4 +3,6 @@ mod normalize; mod truncate; pub(crate) use history::ContextManager; +pub(crate) use truncate::MODEL_FORMAT_MAX_BYTES; +pub(crate) use truncate::MODEL_FORMAT_MAX_LINES; pub(crate) use truncate::format_output_for_model_body; diff --git a/codex-rs/core/src/context_manager/truncate.rs b/codex-rs/core/src/context_manager/truncate.rs index 65e6dc99a..88027ad15 100644 --- a/codex-rs/core/src/context_manager/truncate.rs +++ b/codex-rs/core/src/context_manager/truncate.rs @@ -2,12 +2,11 @@ use codex_protocol::models::FunctionCallOutputContentItem; use codex_utils_string::take_bytes_at_char_boundary; use codex_utils_string::take_last_bytes_at_char_boundary; +use crate::util::error_or_panic; + // Model-formatting limits: clients get full streams; only content sent to the model is truncated. -pub(crate) const MODEL_FORMAT_MAX_BYTES: usize = 10 * 1024; // 10 KiB -pub(crate) const MODEL_FORMAT_MAX_LINES: usize = 256; // lines -pub(crate) const MODEL_FORMAT_HEAD_LINES: usize = MODEL_FORMAT_MAX_LINES / 2; -pub(crate) const MODEL_FORMAT_TAIL_LINES: usize = MODEL_FORMAT_MAX_LINES - MODEL_FORMAT_HEAD_LINES; // 128 -pub(crate) const MODEL_FORMAT_HEAD_BYTES: usize = MODEL_FORMAT_MAX_BYTES / 2; +pub const MODEL_FORMAT_MAX_BYTES: usize = 10 * 1024; // 10 KiB +pub const MODEL_FORMAT_MAX_LINES: usize = 256; // lines pub(crate) fn globally_truncate_function_output_items( items: &[FunctionCallOutputContentItem], @@ -56,21 +55,34 @@ pub(crate) fn globally_truncate_function_output_items( out } -pub(crate) fn format_output_for_model_body(content: &str) -> String { +pub(crate) fn format_output_for_model_body( + content: &str, + limit_bytes: usize, + limit_lines: usize, +) -> String { // Head+tail truncation for the model: show the beginning and end with an elision. // Clients still receive full streams; only this formatted summary is capped. let total_lines = content.lines().count(); - if content.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES { + if content.len() <= limit_bytes && total_lines <= limit_lines { return content.to_string(); } - let output = truncate_formatted_exec_output(content, total_lines); + let output = truncate_formatted_exec_output(content, total_lines, limit_bytes, limit_lines); format!("Total output lines: {total_lines}\n\n{output}") } -fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String { +fn truncate_formatted_exec_output( + content: &str, + total_lines: usize, + limit_bytes: usize, + limit_lines: usize, +) -> String { + debug_panic_on_double_truncation(content); + let head_lines: usize = limit_lines / 2; + let tail_lines: usize = limit_lines - head_lines; // 128 + let head_bytes: usize = limit_bytes / 2; let segments: Vec<&str> = content.split_inclusive('\n').collect(); - let head_take = MODEL_FORMAT_HEAD_LINES.min(segments.len()); - let tail_take = MODEL_FORMAT_TAIL_LINES.min(segments.len().saturating_sub(head_take)); + let head_take = head_lines.min(segments.len()); + let tail_take = tail_lines.min(segments.len().saturating_sub(head_take)); let omitted = segments.len().saturating_sub(head_take + tail_take); let head_slice_end: usize = segments @@ -91,7 +103,7 @@ fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String { }; let head_slice = &content[..head_slice_end]; let tail_slice = &content[tail_slice_start..]; - let truncated_by_bytes = content.len() > MODEL_FORMAT_MAX_BYTES; + let truncated_by_bytes = content.len() > limit_bytes; // this is a bit wrong. We are counting metadata lines and not just shell output lines. let marker = if omitted > 0 { Some(format!( @@ -99,24 +111,24 @@ fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String { )) } else if truncated_by_bytes { Some(format!( - "\n[... output truncated to fit {MODEL_FORMAT_MAX_BYTES} bytes ...]\n\n" + "\n[... output truncated to fit {limit_bytes} bytes ...]\n\n" )) } else { None }; let marker_len = marker.as_ref().map_or(0, String::len); - let base_head_budget = MODEL_FORMAT_HEAD_BYTES.min(MODEL_FORMAT_MAX_BYTES); - let head_budget = base_head_budget.min(MODEL_FORMAT_MAX_BYTES.saturating_sub(marker_len)); + let base_head_budget = head_bytes.min(limit_bytes); + let head_budget = base_head_budget.min(limit_bytes.saturating_sub(marker_len)); let head_part = take_bytes_at_char_boundary(head_slice, head_budget); - let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(content.len())); + let mut result = String::with_capacity(limit_bytes.min(content.len())); result.push_str(head_part); if let Some(marker_text) = marker.as_ref() { result.push_str(marker_text); } - let remaining = MODEL_FORMAT_MAX_BYTES.saturating_sub(result.len()); + let remaining = limit_bytes.saturating_sub(result.len()); if remaining == 0 { return result; } @@ -126,3 +138,11 @@ fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String { result } + +fn debug_panic_on_double_truncation(content: &str) { + if content.contains("Total output lines:") && content.contains("omitted") { + error_or_panic(format!( + "FunctionCallOutput content was already truncated before ContextManager::record_items; this would cause double truncation {content}" + )); + } +} diff --git a/codex-rs/core/src/tools/mod.rs b/codex-rs/core/src/tools/mod.rs index 92509c857..c94a7c28d 100644 --- a/codex-rs/core/src/tools/mod.rs +++ b/codex-rs/core/src/tools/mod.rs @@ -9,6 +9,8 @@ pub mod runtimes; pub mod sandboxing; pub mod spec; +use crate::context_manager::MODEL_FORMAT_MAX_BYTES; +use crate::context_manager::MODEL_FORMAT_MAX_LINES; use crate::context_manager::format_output_for_model_body; use crate::exec::ExecToolCallOutput; pub use router::ToolRouter; @@ -75,5 +77,5 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String { }; // Truncate for model consumption before serialization. - format_output_for_model_body(&body) + format_output_for_model_body(&body, MODEL_FORMAT_MAX_BYTES, MODEL_FORMAT_MAX_LINES) } diff --git a/codex-rs/core/tests/suite/truncation.rs b/codex-rs/core/tests/suite/truncation.rs index aab95fa02..101bdc619 100644 --- a/codex-rs/core/tests/suite/truncation.rs +++ b/codex-rs/core/tests/suite/truncation.rs @@ -19,6 +19,7 @@ use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_completed; use core_test_support::responses::ev_function_call; use core_test_support::responses::ev_response_created; +use core_test_support::responses::mount_sse_once; use core_test_support::responses::mount_sse_once_match; use core_test_support::responses::mount_sse_sequence; use core_test_support::responses::sse; @@ -86,7 +87,7 @@ async fn truncate_function_error_trims_respond_to_model() -> Result<()> { serde_json::from_str::(&output).is_err(), "expected error output to be plain text", ); - let truncated_pattern = r#"(?s)^Total output lines: 1\s+.*\[\.\.\. output truncated to fit 10240 bytes \.\.\.\]\s*$"#; + let truncated_pattern = r#"(?s)^Total output lines: 1\s+.*\[\.\.\. output truncated to fit 11264 bytes \.\.\.\]\s*$"#; assert_regex_match(truncated_pattern, &output); assert!( !output.contains("omitted"), @@ -113,15 +114,25 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> { let fixture = builder.build(&server).await?; let call_id = "shell-too-large"; - let args = serde_json::json!({ - "command": ["/bin/sh", "-c", "seq 1 400"], - "timeout_ms": 5_000, - }); + let args = if cfg!(windows) { + serde_json::json!({ + "command": [ + "powershell", + "-Command", + "for ($i=1; $i -le 400; $i++) { Write-Output $i }" + ], + "timeout_ms": 5_000, + }) + } else { + serde_json::json!({ + "command": ["/bin/sh", "-c", "seq 1 400"], + "timeout_ms": 5_000, + }) + }; // First response: model tells us to run the tool; second: complete the turn. - mount_sse_once_match( + mount_sse_once( &server, - any(), sse(vec![ responses::ev_response_created("resp-1"), responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?), @@ -149,6 +160,7 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> { .single_request() .function_call_output_text(call_id) .context("function_call_output present for shell call")?; + let output = output.replace("\r\n", "\n"); // Expect plain text (not JSON) with truncation markers and line elision. assert!( @@ -180,6 +192,75 @@ $"#; Ok(()) } +// Ensures shell tool outputs that exceed the line limit are truncated only once. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn tool_call_output_truncated_only_once() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + + let mut builder = test_codex().with_config(|config| { + config.model = "gpt-5-codex".to_string(); + config.model_family = + find_family_for_model("gpt-5-codex").expect("gpt-5-codex is a model family"); + }); + let fixture = builder.build(&server).await?; + let call_id = "shell-single-truncation"; + let args = if cfg!(windows) { + serde_json::json!({ + "command": [ + "powershell", + "-Command", + "for ($i=1; $i -le 2000; $i++) { Write-Output $i }" + ], + "timeout_ms": 5_000, + }) + } else { + serde_json::json!({ + "command": ["/bin/sh", "-c", "seq 1 2000"], + "timeout_ms": 5_000, + }) + }; + + mount_sse_once_match( + &server, + any(), + sse(vec![ + responses::ev_response_created("resp-1"), + responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?), + responses::ev_completed("resp-1"), + ]), + ) + .await; + let mock2 = mount_sse_once_match( + &server, + any(), + sse(vec![ + responses::ev_assistant_message("msg-1", "done"), + responses::ev_completed("resp-2"), + ]), + ) + .await; + + fixture + .submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess) + .await?; + + let output = mock2 + .single_request() + .function_call_output_text(call_id) + .context("function_call_output present for shell call")?; + + let total_line_headers = output.matches("Total output lines:").count(); + + assert_eq!( + total_line_headers, 1, + "shell output should carry only one truncation header: {output}" + ); + + Ok(()) +} + // Verifies that an MCP tool call result exceeding the model formatting limits // is truncated before being sent back to the model. #[tokio::test(flavor = "multi_thread", worker_threads = 1)] @@ -269,7 +350,8 @@ async fn mcp_tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> output.starts_with("Total output lines: 1\n\n{"), "expected total line header and JSON head, got: {output}" ); - let byte_marker = Regex::new(r"\[\.\.\. output truncated to fit 10240 bytes \.\.\.\]") + + let byte_marker = Regex::new(r"\[\.\.\. output truncated to fit 11264 bytes \.\.\.\]") .expect("compile regex"); assert!( byte_marker.is_match(&output), diff --git a/codex-rs/core/tests/suite/user_shell_cmd.rs b/codex-rs/core/tests/suite/user_shell_cmd.rs index fa09f0056..8910161f1 100644 --- a/codex-rs/core/tests/suite/user_shell_cmd.rs +++ b/codex-rs/core/tests/suite/user_shell_cmd.rs @@ -1,18 +1,31 @@ +use anyhow::Context; use codex_core::ConversationManager; use codex_core::NewConversation; +use codex_core::model_family::find_family_for_model; use codex_core::protocol::EventMsg; use codex_core::protocol::ExecCommandEndEvent; use codex_core::protocol::ExecOutputStream; use codex_core::protocol::Op; +use codex_core::protocol::SandboxPolicy; use codex_core::protocol::TurnAbortReason; use core_test_support::assert_regex_match; use core_test_support::load_default_config_for_test; use core_test_support::responses; +use core_test_support::responses::ev_assistant_message; +use core_test_support::responses::ev_completed; +use core_test_support::responses::ev_function_call; +use core_test_support::responses::ev_response_created; +use core_test_support::responses::mount_sse_once_match; +use core_test_support::responses::sse; +use core_test_support::responses::start_mock_server; +use core_test_support::skip_if_no_network; +use core_test_support::test_codex::test_codex; use core_test_support::wait_for_event; use core_test_support::wait_for_event_match; use regex_lite::escape; use std::path::PathBuf; use tempfile::TempDir; +use wiremock::matchers::any; #[tokio::test] async fn user_shell_cmd_ls_and_cat_in_temp_dir() { @@ -249,3 +262,72 @@ async fn user_shell_command_output_is_truncated_in_history() -> anyhow::Result<( Ok(()) } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + + let mut builder = test_codex().with_config(|config| { + config.model = "gpt-5-codex".to_string(); + config.model_family = + find_family_for_model("gpt-5-codex").expect("gpt-5-codex is a model family"); + }); + let fixture = builder.build(&server).await?; + + let call_id = "user-shell-double-truncation"; + let args = if cfg!(windows) { + serde_json::json!({ + "command": [ + "powershell", + "-Command", + "for ($i=1; $i -le 2000; $i++) { Write-Output $i }" + ], + "timeout_ms": 5_000, + }) + } else { + serde_json::json!({ + "command": ["/bin/sh", "-c", "seq 1 2000"], + "timeout_ms": 5_000, + }) + }; + + mount_sse_once_match( + &server, + any(), + sse(vec![ + ev_response_created("resp-1"), + ev_function_call(call_id, "shell", &serde_json::to_string(&args)?), + ev_completed("resp-1"), + ]), + ) + .await; + let mock2 = mount_sse_once_match( + &server, + any(), + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ) + .await; + + fixture + .submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess) + .await?; + + let output = mock2 + .single_request() + .function_call_output_text(call_id) + .context("function_call_output present for shell call")?; + + let truncation_headers = output.matches("Total output lines:").count(); + + assert_eq!( + truncation_headers, 1, + "shell output should carry only one truncation header: {output}" + ); + + Ok(()) +}