From 9450cd9ce510ee07fdb80a3bb4782cb303803f4b Mon Sep 17 00:00:00 2001
From: Charley Cunningham <ccunningham@openai.com>
Date: Mon, 9 Feb 2026 12:42:20 -0800
Subject: [PATCH] core: add focused diagnostics for remote compaction overflows
 (#11133)

## Summary
- add targeted remote-compaction failure diagnostics in compact_remote
logging
- log the specific values needed to explain overflow timing:
  - last_api_response_total_tokens
  - estimated_tokens_of_items_added_since_last_successful_api_response
  - estimated_bytes_of_items_added_since_last_successful_api_response
  - failing_compaction_request_body_bytes
- simplify breakdown naming and remove
last_api_response_total_bytes_estimate (it was an approximation and not
useful for debugging)

## Why
When compaction fails with context_length_exceeded, we need concrete,
low-ambiguity numbers that map directly to:
1) what the API most recently reported, and
2) what local history added since then.

This keeps the failure logs actionable without adding broad, noisy
metrics.

## Testing
- just fmt
- cargo test -p codex-core
---
 codex-rs/core/src/codex.rs                    | 13 ++-
 codex-rs/core/src/compact_remote.rs           | 58 +++++++++++++
 codex-rs/core/src/context_manager/history.rs  | 83 +++++++++++++------
 .../core/src/context_manager/history_tests.rs | 15 +++-
 codex-rs/core/src/context_manager/mod.rs      |  2 +
 codex-rs/core/src/truncate.rs                 |  8 ++
 6 files changed, 150 insertions(+), 29 deletions(-)
diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs
index a6ce928b4..ceb8fcf94 100644
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -114,6 +114,7 @@ use crate::config::resolve_web_search_mode_for_turn;
 use crate::config::types::McpServerConfig;
 use crate::config::types::ShellEnvironmentPolicy;
 use crate::context_manager::ContextManager;
+use crate::context_manager::TotalTokenUsageBreakdown;
 use crate::environment_context::EnvironmentContext;
 use crate::error::CodexErr;
 use crate::error::Result as CodexResult;
@@ -1231,12 +1232,20 @@ impl Session {
         format!("auto-compact-{id}")
     }
 
-    async fn get_total_token_usage(&self) -> i64 {
+    pub(crate) async fn get_total_token_usage(&self) -> i64 {
         let state = self.state.lock().await;
         state.get_total_token_usage(state.server_reasoning_included())
     }
 
-    async fn get_estimated_token_count(&self, turn_context: &TurnContext) -> Option<i64> {
+    pub(crate) async fn get_total_token_usage_breakdown(&self) -> TotalTokenUsageBreakdown {
+        let state = self.state.lock().await;
+        state.history.get_total_token_usage_breakdown()
+    }
+
+    pub(crate) async fn get_estimated_token_count(
+        &self,
+        turn_context: &TurnContext,
+    ) -> Option<i64> {
         let state = self.state.lock().await;
         state.history.estimate_token_count(turn_context)
     }
diff --git a/codex-rs/core/src/compact_remote.rs b/codex-rs/core/src/compact_remote.rs
index eac3188cd..8c7808e3f 100644
--- a/codex-rs/core/src/compact_remote.rs
+++ b/codex-rs/core/src/compact_remote.rs
@@ -4,7 +4,10 @@ use crate::Prompt;
 use crate::codex::Session;
 use crate::codex::TurnContext;
 use crate::context_manager::ContextManager;
+use crate::context_manager::TotalTokenUsageBreakdown;
+use crate::context_manager::estimate_response_item_model_visible_bytes;
 use crate::context_manager::is_codex_generated_item;
+use crate::error::CodexErr;
 use crate::error::Result as CodexResult;
 use crate::protocol::CompactedItem;
 use crate::protocol::EventMsg;
@@ -14,6 +17,8 @@ use codex_protocol::items::ContextCompactionItem;
 use codex_protocol::items::TurnItem;
 use codex_protocol::models::BaseInstructions;
 use codex_protocol::models::ResponseItem;
+use futures::TryFutureExt;
+use tracing::error;
 use tracing::info;
 
 pub(crate) async fn run_inline_remote_auto_compact_task(
@@ -98,6 +103,18 @@ async fn run_remote_compact_task_inner_impl(
             &turn_context.model_info,
             &turn_context.otel_manager,
         )
+        .or_else(|err| async {
+            let total_usage_breakdown = sess.get_total_token_usage_breakdown().await;
+            let compact_request_log_data =
+                build_compact_request_log_data(&prompt.input, &prompt.base_instructions.text);
+            log_remote_compact_failure(
+                turn_context,
+                &compact_request_log_data,
+                total_usage_breakdown,
+                &err,
+            );
+            Err(err)
+        })
         .await?;
     new_history = sess
         .process_compacted_history(turn_context, new_history)
@@ -121,6 +138,47 @@ async fn run_remote_compact_task_inner_impl(
     Ok(())
 }
 
+#[derive(Debug)]
+struct CompactRequestLogData {
+    failing_compaction_request_model_visible_bytes: i64,
+}
+
+fn build_compact_request_log_data(
+    input: &[ResponseItem],
+    instructions: &str,
+) -> CompactRequestLogData {
+    let failing_compaction_request_model_visible_bytes = input
+        .iter()
+        .map(estimate_response_item_model_visible_bytes)
+        .fold(
+            i64::try_from(instructions.len()).unwrap_or(i64::MAX),
+            i64::saturating_add,
+        );
+
+    CompactRequestLogData {
+        failing_compaction_request_model_visible_bytes,
+    }
+}
+
+fn log_remote_compact_failure(
+    turn_context: &TurnContext,
+    log_data: &CompactRequestLogData,
+    total_usage_breakdown: TotalTokenUsageBreakdown,
+    err: &CodexErr,
+) {
+    error!(
+        turn_id = %turn_context.sub_id,
+        last_api_response_total_tokens = total_usage_breakdown.last_api_response_total_tokens,
+        all_history_items_model_visible_bytes = total_usage_breakdown.all_history_items_model_visible_bytes,
+        estimated_tokens_of_items_added_since_last_successful_api_response = total_usage_breakdown.estimated_tokens_of_items_added_since_last_successful_api_response,
+        estimated_bytes_of_items_added_since_last_successful_api_response = total_usage_breakdown.estimated_bytes_of_items_added_since_last_successful_api_response,
+        model_context_window_tokens = ?turn_context.model_context_window(),
+        failing_compaction_request_model_visible_bytes = log_data.failing_compaction_request_model_visible_bytes,
+        compact_error = %err,
+        "remote compaction failed"
+    );
+}
+
 fn trim_function_call_history_to_fit_context_window(
     history: &mut ContextManager,
     turn_context: &TurnContext,
diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs
index 4bf62d791..a5b223805 100644
--- a/codex-rs/core/src/context_manager/history.rs
+++ b/codex-rs/core/src/context_manager/history.rs
@@ -5,7 +5,7 @@ use crate::instructions::UserInstructions;
 use crate::session_prefix::is_session_prefix;
 use crate::truncate::TruncationPolicy;
 use crate::truncate::approx_token_count;
-use crate::truncate::approx_tokens_from_byte_count;
+use crate::truncate::approx_tokens_from_byte_count_i64;
 use crate::truncate::truncate_function_output_items_with_policy;
 use crate::truncate::truncate_text;
 use crate::user_shell_command::is_user_shell_command_text;
@@ -27,6 +27,14 @@ pub(crate) struct ContextManager {
     token_info: Option<TokenUsageInfo>,
 }
 
+#[derive(Debug, Clone, Copy, Default)]
+pub(crate) struct TotalTokenUsageBreakdown {
+    pub last_api_response_total_tokens: i64,
+    pub all_history_items_model_visible_bytes: i64,
+    pub estimated_tokens_of_items_added_since_last_successful_api_response: i64,
+    pub estimated_bytes_of_items_added_since_last_successful_api_response: i64,
+}
+
 impl ContextManager {
     pub(crate) fn new() -> Self {
         Self {
@@ -102,9 +110,11 @@ impl ContextManager {
         let base_tokens =
             i64::try_from(approx_token_count(&base_instructions.text)).unwrap_or(i64::MAX);
 
-        let items_tokens = self.items.iter().fold(0i64, |acc, item| {
-            acc.saturating_add(estimate_item_token_count(item))
-        });
+        let items_tokens = self
+            .items
+            .iter()
+            .map(estimate_item_token_count)
+            .fold(0i64, i64::saturating_add);
 
         Some(base_tokens.saturating_add(items_tokens))
     }
@@ -231,9 +241,8 @@ impl ContextManager {
                     }
                 )
             })
-            .fold(0i64, |acc, item| {
-                acc.saturating_add(estimate_item_token_count(item))
-            })
+            .map(estimate_item_token_count)
+            .fold(0i64, i64::saturating_add)
     }
 
     // These are local items added after the most recent model-emitted item.
@@ -247,14 +256,6 @@ impl ContextManager {
         &self.items[start..]
     }
 
-    fn get_items_after_last_model_generated_tokens(&self) -> i64 {
-        self.items_after_last_model_generated_item()
-            .iter()
-            .fold(0i64, |acc, item| {
-                acc.saturating_add(estimate_item_token_count(item))
-            })
-    }
-
     /// When true, the server already accounted for past reasoning tokens and
     /// the client should not re-estimate them.
     pub(crate) fn get_total_token_usage(&self, server_reasoning_included: bool) -> i64 {
@@ -263,8 +264,11 @@ impl ContextManager {
             .as_ref()
             .map(|info| info.last_token_usage.total_tokens)
             .unwrap_or(0);
-        let items_after_last_model_generated_tokens =
-            self.get_items_after_last_model_generated_tokens();
+        let items_after_last_model_generated_tokens = self
+            .items_after_last_model_generated_item()
+            .iter()
+            .map(estimate_item_token_count)
+            .fold(0i64, i64::saturating_add);
         if server_reasoning_included {
             last_tokens.saturating_add(items_after_last_model_generated_tokens)
         } else {
@@ -274,6 +278,34 @@ impl ContextManager {
         }
     }
 
+    pub(crate) fn get_total_token_usage_breakdown(&self) -> TotalTokenUsageBreakdown {
+        let last_usage = self
+            .token_info
+            .as_ref()
+            .map(|info| info.last_token_usage.clone())
+            .unwrap_or_default();
+        let items_after_last_model_generated = self.items_after_last_model_generated_item();
+
+        TotalTokenUsageBreakdown {
+            last_api_response_total_tokens: last_usage.total_tokens,
+            all_history_items_model_visible_bytes: self
+                .items
+                .iter()
+                .map(estimate_response_item_model_visible_bytes)
+                .fold(0i64, i64::saturating_add),
+            estimated_tokens_of_items_added_since_last_successful_api_response:
+                items_after_last_model_generated
+                    .iter()
+                    .map(estimate_item_token_count)
+                    .fold(0i64, i64::saturating_add),
+            estimated_bytes_of_items_added_since_last_successful_api_response:
+                items_after_last_model_generated
+                    .iter()
+                    .map(estimate_response_item_model_visible_bytes)
+                    .fold(0i64, i64::saturating_add),
+        }
+    }
+
     /// This function enforces a couple of invariants on the in-memory history:
     /// 1. every call (function/custom) has a corresponding output entry
     /// 2. every output has a corresponding call entry
@@ -357,6 +389,11 @@ fn estimate_reasoning_length(encoded_len: usize) -> usize {
 }
 
 fn estimate_item_token_count(item: &ResponseItem) -> i64 {
+    let model_visible_bytes = estimate_response_item_model_visible_bytes(item);
+    approx_tokens_from_byte_count_i64(model_visible_bytes)
+}
+
+pub(crate) fn estimate_response_item_model_visible_bytes(item: &ResponseItem) -> i64 {
     match item {
         ResponseItem::GhostSnapshot { .. } => 0,
         ResponseItem::Reasoning {
@@ -365,14 +402,10 @@ fn estimate_item_token_count(item: &ResponseItem) -> i64 {
         }
         | ResponseItem::Compaction {
             encrypted_content: content,
-        } => {
-            let reasoning_bytes = estimate_reasoning_length(content.len());
-            i64::try_from(approx_tokens_from_byte_count(reasoning_bytes)).unwrap_or(i64::MAX)
-        }
-        item => {
-            let serialized = serde_json::to_string(item).unwrap_or_default();
-            i64::try_from(approx_token_count(&serialized)).unwrap_or(i64::MAX)
-        }
+        } => i64::try_from(estimate_reasoning_length(content.len())).unwrap_or(i64::MAX),
+        item => serde_json::to_string(item)
+            .map(|serialized| i64::try_from(serialized.len()).unwrap_or(i64::MAX))
+            .unwrap_or_default(),
     }
 }
 
diff --git a/codex-rs/core/src/context_manager/history_tests.rs b/codex-rs/core/src/context_manager/history_tests.rs
index 26bff9f4b..99670c05e 100644
--- a/codex-rs/core/src/context_manager/history_tests.rs
+++ b/codex-rs/core/src/context_manager/history_tests.rs
@@ -193,7 +193,11 @@ fn items_after_last_model_generated_tokens_include_user_and_tool_output() {
     );
 
     assert_eq!(
-        history.get_items_after_last_model_generated_tokens(),
+        history
+            .items_after_last_model_generated_item()
+            .iter()
+            .map(estimate_item_token_count)
+            .fold(0i64, i64::saturating_add),
         expected_tokens
     );
 }
@@ -202,7 +206,14 @@ fn items_after_last_model_generated_tokens_include_user_and_tool_output() {
 fn items_after_last_model_generated_tokens_are_zero_without_model_generated_items() {
     let history = create_history_with_items(vec![user_msg("no model output yet")]);
 
-    assert_eq!(history.get_items_after_last_model_generated_tokens(), 0);
+    assert_eq!(
+        history
+            .items_after_last_model_generated_item()
+            .iter()
+            .map(estimate_item_token_count)
+            .fold(0i64, i64::saturating_add),
+        0
+    );
 }
 
 #[test]
diff --git a/codex-rs/core/src/context_manager/mod.rs b/codex-rs/core/src/context_manager/mod.rs
index 22e9682fe..dcaf794c4 100644
--- a/codex-rs/core/src/context_manager/mod.rs
+++ b/codex-rs/core/src/context_manager/mod.rs
@@ -2,5 +2,7 @@ mod history;
 mod normalize;
 
 pub(crate) use history::ContextManager;
+pub(crate) use history::TotalTokenUsageBreakdown;
+pub(crate) use history::estimate_response_item_model_visible_bytes;
 pub(crate) use history::is_codex_generated_item;
 pub(crate) use history::is_user_turn_boundary;
diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs
index 441a15737..309ed91b6 100644
--- a/codex-rs/core/src/truncate.rs
+++ b/codex-rs/core/src/truncate.rs
@@ -304,6 +304,14 @@ pub(crate) fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
         / (APPROX_BYTES_PER_TOKEN as u64)
 }
 
+pub(crate) fn approx_tokens_from_byte_count_i64(bytes: i64) -> i64 {
+    if bytes <= 0 {
+        return 0;
+    }
+    let bytes = usize::try_from(bytes).unwrap_or(usize::MAX);
+    i64::try_from(approx_tokens_from_byte_count(bytes)).unwrap_or(i64::MAX)
+}
+
 #[cfg(test)]
 mod tests {