Avoid double truncation (#6631)

1. Avoid double truncation by giving 10% above the tool default constant
2. Add tests that fails when const = 1
This commit is contained in:
Ahmed Ibrahim 2025-11-13 16:59:31 -08:00 committed by GitHub
parent 7b027e7536
commit 9890ceb939
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 264 additions and 35 deletions

View file

@ -1,5 +1,6 @@
use crate::codex::TurnContext;
use crate::context_manager::normalize;
use crate::context_manager::truncate;
use crate::context_manager::truncate::format_output_for_model_body;
use crate::context_manager::truncate::globally_truncate_function_output_items;
use codex_protocol::models::FunctionCallOutputPayload;
@ -9,6 +10,12 @@ use codex_protocol::protocol::TokenUsageInfo;
use codex_utils_tokenizer::Tokenizer;
use std::ops::Deref;
const CONTEXT_WINDOW_HARD_LIMIT_FACTOR: f64 = 1.1;
const CONTEXT_WINDOW_HARD_LIMIT_BYTES: usize =
(truncate::MODEL_FORMAT_MAX_BYTES as f64 * CONTEXT_WINDOW_HARD_LIMIT_FACTOR) as usize;
const CONTEXT_WINDOW_HARD_LIMIT_LINES: usize =
(truncate::MODEL_FORMAT_MAX_LINES as f64 * CONTEXT_WINDOW_HARD_LIMIT_FACTOR) as usize;
/// Transcript of conversation history
#[derive(Debug, Clone, Default)]
pub(crate) struct ContextManager {
@ -146,7 +153,11 @@ impl ContextManager {
fn process_item(item: &ResponseItem) -> ResponseItem {
match item {
ResponseItem::FunctionCallOutput { call_id, output } => {
let truncated = format_output_for_model_body(output.content.as_str());
let truncated = format_output_for_model_body(
output.content.as_str(),
CONTEXT_WINDOW_HARD_LIMIT_BYTES,
CONTEXT_WINDOW_HARD_LIMIT_LINES,
);
let truncated_items = output
.content_items
.as_ref()
@ -161,7 +172,11 @@ impl ContextManager {
}
}
ResponseItem::CustomToolCallOutput { call_id, output } => {
let truncated = format_output_for_model_body(output);
let truncated = format_output_for_model_body(
output,
CONTEXT_WINDOW_HARD_LIMIT_BYTES,
CONTEXT_WINDOW_HARD_LIMIT_LINES,
);
ResponseItem::CustomToolCallOutput {
call_id: call_id.clone(),
output: truncated,

View file

@ -1,4 +1,5 @@
use super::*;
use crate::context_manager::MODEL_FORMAT_MAX_LINES;
use crate::context_manager::truncate;
use codex_git::GhostCommit;
use codex_protocol::models::ContentItem;
@ -308,8 +309,10 @@ fn assert_truncated_message_matches(message: &str, line: &str, total_lines: usiz
}
fn truncated_message_pattern(line: &str, total_lines: usize) -> String {
let head_take = truncate::MODEL_FORMAT_HEAD_LINES.min(total_lines);
let tail_take = truncate::MODEL_FORMAT_TAIL_LINES.min(total_lines.saturating_sub(head_take));
let head_lines = MODEL_FORMAT_MAX_LINES / 2;
let tail_lines = MODEL_FORMAT_MAX_LINES - head_lines;
let head_take = head_lines.min(total_lines);
let tail_take = tail_lines.min(total_lines.saturating_sub(head_take));
let omitted = total_lines.saturating_sub(head_take + tail_take);
let escaped_line = regex_lite::escape(line);
if omitted == 0 {
@ -328,7 +331,11 @@ fn format_exec_output_truncates_large_error() {
let line = "very long execution error line that should trigger truncation\n";
let large_error = line.repeat(2_500); // way beyond both byte and line limits
let truncated = truncate::format_output_for_model_body(&large_error);
let truncated = truncate::format_output_for_model_body(
&large_error,
truncate::MODEL_FORMAT_MAX_BYTES,
truncate::MODEL_FORMAT_MAX_LINES,
);
let total_lines = large_error.lines().count();
assert_truncated_message_matches(&truncated, line, total_lines);
@ -338,7 +345,11 @@ fn format_exec_output_truncates_large_error() {
#[test]
fn format_exec_output_marks_byte_truncation_without_omitted_lines() {
let long_line = "a".repeat(truncate::MODEL_FORMAT_MAX_BYTES + 50);
let truncated = truncate::format_output_for_model_body(&long_line);
let truncated = truncate::format_output_for_model_body(
&long_line,
truncate::MODEL_FORMAT_MAX_BYTES,
truncate::MODEL_FORMAT_MAX_LINES,
);
assert_ne!(truncated, long_line);
let marker_line = format!(
@ -359,7 +370,14 @@ fn format_exec_output_marks_byte_truncation_without_omitted_lines() {
fn format_exec_output_returns_original_when_within_limits() {
let content = "example output\n".repeat(10);
assert_eq!(truncate::format_output_for_model_body(&content), content);
assert_eq!(
truncate::format_output_for_model_body(
&content,
truncate::MODEL_FORMAT_MAX_BYTES,
truncate::MODEL_FORMAT_MAX_LINES
),
content
);
}
#[test]
@ -369,7 +387,11 @@ fn format_exec_output_reports_omitted_lines_and_keeps_head_and_tail() {
.map(|idx| format!("line-{idx}\n"))
.collect();
let truncated = truncate::format_output_for_model_body(&content);
let truncated = truncate::format_output_for_model_body(
&content,
truncate::MODEL_FORMAT_MAX_BYTES,
truncate::MODEL_FORMAT_MAX_LINES,
);
let omitted = total_lines - truncate::MODEL_FORMAT_MAX_LINES;
let expected_marker = format!("[... omitted {omitted} of {total_lines} lines ...]");
@ -397,7 +419,11 @@ fn format_exec_output_prefers_line_marker_when_both_limits_exceeded() {
.map(|idx| format!("line-{idx}-{long_line}\n"))
.collect();
let truncated = truncate::format_output_for_model_body(&content);
let truncated = truncate::format_output_for_model_body(
&content,
truncate::MODEL_FORMAT_MAX_BYTES,
truncate::MODEL_FORMAT_MAX_LINES,
);
assert!(
truncated.contains("[... omitted 42 of 298 lines ...]"),

View file

@ -3,4 +3,6 @@ mod normalize;
mod truncate;
pub(crate) use history::ContextManager;
pub(crate) use truncate::MODEL_FORMAT_MAX_BYTES;
pub(crate) use truncate::MODEL_FORMAT_MAX_LINES;
pub(crate) use truncate::format_output_for_model_body;

View file

@ -2,12 +2,11 @@ use codex_protocol::models::FunctionCallOutputContentItem;
use codex_utils_string::take_bytes_at_char_boundary;
use codex_utils_string::take_last_bytes_at_char_boundary;
use crate::util::error_or_panic;
// Model-formatting limits: clients get full streams; only content sent to the model is truncated.
pub(crate) const MODEL_FORMAT_MAX_BYTES: usize = 10 * 1024; // 10 KiB
pub(crate) const MODEL_FORMAT_MAX_LINES: usize = 256; // lines
pub(crate) const MODEL_FORMAT_HEAD_LINES: usize = MODEL_FORMAT_MAX_LINES / 2;
pub(crate) const MODEL_FORMAT_TAIL_LINES: usize = MODEL_FORMAT_MAX_LINES - MODEL_FORMAT_HEAD_LINES; // 128
pub(crate) const MODEL_FORMAT_HEAD_BYTES: usize = MODEL_FORMAT_MAX_BYTES / 2;
pub const MODEL_FORMAT_MAX_BYTES: usize = 10 * 1024; // 10 KiB
pub const MODEL_FORMAT_MAX_LINES: usize = 256; // lines
pub(crate) fn globally_truncate_function_output_items(
items: &[FunctionCallOutputContentItem],
@ -56,21 +55,34 @@ pub(crate) fn globally_truncate_function_output_items(
out
}
pub(crate) fn format_output_for_model_body(content: &str) -> String {
pub(crate) fn format_output_for_model_body(
content: &str,
limit_bytes: usize,
limit_lines: usize,
) -> String {
// Head+tail truncation for the model: show the beginning and end with an elision.
// Clients still receive full streams; only this formatted summary is capped.
let total_lines = content.lines().count();
if content.len() <= MODEL_FORMAT_MAX_BYTES && total_lines <= MODEL_FORMAT_MAX_LINES {
if content.len() <= limit_bytes && total_lines <= limit_lines {
return content.to_string();
}
let output = truncate_formatted_exec_output(content, total_lines);
let output = truncate_formatted_exec_output(content, total_lines, limit_bytes, limit_lines);
format!("Total output lines: {total_lines}\n\n{output}")
}
fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String {
fn truncate_formatted_exec_output(
content: &str,
total_lines: usize,
limit_bytes: usize,
limit_lines: usize,
) -> String {
debug_panic_on_double_truncation(content);
let head_lines: usize = limit_lines / 2;
let tail_lines: usize = limit_lines - head_lines; // 128
let head_bytes: usize = limit_bytes / 2;
let segments: Vec<&str> = content.split_inclusive('\n').collect();
let head_take = MODEL_FORMAT_HEAD_LINES.min(segments.len());
let tail_take = MODEL_FORMAT_TAIL_LINES.min(segments.len().saturating_sub(head_take));
let head_take = head_lines.min(segments.len());
let tail_take = tail_lines.min(segments.len().saturating_sub(head_take));
let omitted = segments.len().saturating_sub(head_take + tail_take);
let head_slice_end: usize = segments
@ -91,7 +103,7 @@ fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String {
};
let head_slice = &content[..head_slice_end];
let tail_slice = &content[tail_slice_start..];
let truncated_by_bytes = content.len() > MODEL_FORMAT_MAX_BYTES;
let truncated_by_bytes = content.len() > limit_bytes;
// this is a bit wrong. We are counting metadata lines and not just shell output lines.
let marker = if omitted > 0 {
Some(format!(
@ -99,24 +111,24 @@ fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String {
))
} else if truncated_by_bytes {
Some(format!(
"\n[... output truncated to fit {MODEL_FORMAT_MAX_BYTES} bytes ...]\n\n"
"\n[... output truncated to fit {limit_bytes} bytes ...]\n\n"
))
} else {
None
};
let marker_len = marker.as_ref().map_or(0, String::len);
let base_head_budget = MODEL_FORMAT_HEAD_BYTES.min(MODEL_FORMAT_MAX_BYTES);
let head_budget = base_head_budget.min(MODEL_FORMAT_MAX_BYTES.saturating_sub(marker_len));
let base_head_budget = head_bytes.min(limit_bytes);
let head_budget = base_head_budget.min(limit_bytes.saturating_sub(marker_len));
let head_part = take_bytes_at_char_boundary(head_slice, head_budget);
let mut result = String::with_capacity(MODEL_FORMAT_MAX_BYTES.min(content.len()));
let mut result = String::with_capacity(limit_bytes.min(content.len()));
result.push_str(head_part);
if let Some(marker_text) = marker.as_ref() {
result.push_str(marker_text);
}
let remaining = MODEL_FORMAT_MAX_BYTES.saturating_sub(result.len());
let remaining = limit_bytes.saturating_sub(result.len());
if remaining == 0 {
return result;
}
@ -126,3 +138,11 @@ fn truncate_formatted_exec_output(content: &str, total_lines: usize) -> String {
result
}
fn debug_panic_on_double_truncation(content: &str) {
if content.contains("Total output lines:") && content.contains("omitted") {
error_or_panic(format!(
"FunctionCallOutput content was already truncated before ContextManager::record_items; this would cause double truncation {content}"
));
}
}

View file

@ -9,6 +9,8 @@ pub mod runtimes;
pub mod sandboxing;
pub mod spec;
use crate::context_manager::MODEL_FORMAT_MAX_BYTES;
use crate::context_manager::MODEL_FORMAT_MAX_LINES;
use crate::context_manager::format_output_for_model_body;
use crate::exec::ExecToolCallOutput;
pub use router::ToolRouter;
@ -75,5 +77,5 @@ pub fn format_exec_output_str(exec_output: &ExecToolCallOutput) -> String {
};
// Truncate for model consumption before serialization.
format_output_for_model_body(&body)
format_output_for_model_body(&body, MODEL_FORMAT_MAX_BYTES, MODEL_FORMAT_MAX_LINES)
}

View file

@ -19,6 +19,7 @@ use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::mount_sse_once;
use core_test_support::responses::mount_sse_once_match;
use core_test_support::responses::mount_sse_sequence;
use core_test_support::responses::sse;
@ -86,7 +87,7 @@ async fn truncate_function_error_trims_respond_to_model() -> Result<()> {
serde_json::from_str::<serde_json::Value>(&output).is_err(),
"expected error output to be plain text",
);
let truncated_pattern = r#"(?s)^Total output lines: 1\s+.*\[\.\.\. output truncated to fit 10240 bytes \.\.\.\]\s*$"#;
let truncated_pattern = r#"(?s)^Total output lines: 1\s+.*\[\.\.\. output truncated to fit 11264 bytes \.\.\.\]\s*$"#;
assert_regex_match(truncated_pattern, &output);
assert!(
!output.contains("omitted"),
@ -113,15 +114,25 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
let fixture = builder.build(&server).await?;
let call_id = "shell-too-large";
let args = serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 400"],
"timeout_ms": 5_000,
});
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 400; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 400"],
"timeout_ms": 5_000,
})
};
// First response: model tells us to run the tool; second: complete the turn.
mount_sse_once_match(
mount_sse_once(
&server,
any(),
sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
@ -149,6 +160,7 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
.single_request()
.function_call_output_text(call_id)
.context("function_call_output present for shell call")?;
let output = output.replace("\r\n", "\n");
// Expect plain text (not JSON) with truncation markers and line elision.
assert!(
@ -180,6 +192,75 @@ $"#;
Ok(())
}
// Ensures shell tool outputs that exceed the line limit are truncated only once.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_call_output_truncated_only_once() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
let mut builder = test_codex().with_config(|config| {
config.model = "gpt-5-codex".to_string();
config.model_family =
find_family_for_model("gpt-5-codex").expect("gpt-5-codex is a model family");
});
let fixture = builder.build(&server).await?;
let call_id = "shell-single-truncation";
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 2000; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 2000"],
"timeout_ms": 5_000,
})
};
mount_sse_once_match(
&server,
any(),
sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
responses::ev_completed("resp-1"),
]),
)
.await;
let mock2 = mount_sse_once_match(
&server,
any(),
sse(vec![
responses::ev_assistant_message("msg-1", "done"),
responses::ev_completed("resp-2"),
]),
)
.await;
fixture
.submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess)
.await?;
let output = mock2
.single_request()
.function_call_output_text(call_id)
.context("function_call_output present for shell call")?;
let total_line_headers = output.matches("Total output lines:").count();
assert_eq!(
total_line_headers, 1,
"shell output should carry only one truncation header: {output}"
);
Ok(())
}
// Verifies that an MCP tool call result exceeding the model formatting limits
// is truncated before being sent back to the model.
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
@ -269,7 +350,8 @@ async fn mcp_tool_call_output_exceeds_limit_truncated_for_model() -> Result<()>
output.starts_with("Total output lines: 1\n\n{"),
"expected total line header and JSON head, got: {output}"
);
let byte_marker = Regex::new(r"\[\.\.\. output truncated to fit 10240 bytes \.\.\.\]")
let byte_marker = Regex::new(r"\[\.\.\. output truncated to fit 11264 bytes \.\.\.\]")
.expect("compile regex");
assert!(
byte_marker.is_match(&output),

View file

@ -1,18 +1,31 @@
use anyhow::Context;
use codex_core::ConversationManager;
use codex_core::NewConversation;
use codex_core::model_family::find_family_for_model;
use codex_core::protocol::EventMsg;
use codex_core::protocol::ExecCommandEndEvent;
use codex_core::protocol::ExecOutputStream;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_core::protocol::TurnAbortReason;
use core_test_support::assert_regex_match;
use core_test_support::load_default_config_for_test;
use core_test_support::responses;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::mount_sse_once_match;
use core_test_support::responses::sse;
use core_test_support::responses::start_mock_server;
use core_test_support::skip_if_no_network;
use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event;
use core_test_support::wait_for_event_match;
use regex_lite::escape;
use std::path::PathBuf;
use tempfile::TempDir;
use wiremock::matchers::any;
#[tokio::test]
async fn user_shell_cmd_ls_and_cat_in_temp_dir() {
@ -249,3 +262,72 @@ async fn user_shell_command_output_is_truncated_in_history() -> anyhow::Result<(
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
let mut builder = test_codex().with_config(|config| {
config.model = "gpt-5-codex".to_string();
config.model_family =
find_family_for_model("gpt-5-codex").expect("gpt-5-codex is a model family");
});
let fixture = builder.build(&server).await?;
let call_id = "user-shell-double-truncation";
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 2000; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 2000"],
"timeout_ms": 5_000,
})
};
mount_sse_once_match(
&server,
any(),
sse(vec![
ev_response_created("resp-1"),
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
ev_completed("resp-1"),
]),
)
.await;
let mock2 = mount_sse_once_match(
&server,
any(),
sse(vec![
ev_assistant_message("msg-1", "done"),
ev_completed("resp-2"),
]),
)
.await;
fixture
.submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess)
.await?;
let output = mock2
.single_request()
.function_call_output_text(call_id)
.context("function_call_output present for shell call")?;
let truncation_headers = output.matches("Total output lines:").count();
assert_eq!(
truncation_headers, 1,
"shell output should carry only one truncation header: {output}"
);
Ok(())
}