From aa3fe8abf841776d328425ac3d329bc38347fb9d Mon Sep 17 00:00:00 2001 From: Owen Lin Date: Thu, 5 Mar 2026 13:26:48 -0800 Subject: [PATCH] feat(core): persist trace_id for turns in RolloutItem::TurnContext (#13602) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a durable trace linkage for each turn by storing the active trace ID on the TurnContext record persisted in session rollout files. Before this change, we propagated trace context at runtime but didn’t persist a stable per-turn trace key in rollout history. That made after-the-fact debugging harder (for example, mapping a historical turn to the corresponding trace in Datadog). This sets us up for much easier debugging in the future. ### What changed - Added an optional `trace_id` to TurnContextItem (rollout schema). - Added a small OTEL helper to read the current span trace ID. - Captured `trace_id` when creating `TurnContext` and included it in `to_turn_context_item()`. - Updated tests and fixtures that construct TurnContextItem so older/no-trace cases still work. ### Why this approach TurnContext is already the canonical durable per-turn metadata in rollout. This keeps ownership clean: trace linkage lives with other persisted turn metadata. 
--- codex-rs/core/src/codex.rs | 45 +++++++++++++++++++ .../src/codex/rollout_reconstruction_tests.rs | 8 ++++ codex-rs/core/src/rollout/recorder.rs | 1 + codex-rs/core/tests/suite/resume_warning.rs | 1 + codex-rs/otel/src/lib.rs | 1 + codex-rs/otel/src/trace_context.rs | 34 ++++++++++++++ codex-rs/protocol/src/protocol.rs | 4 ++ codex-rs/state/src/extract.rs | 2 + codex-rs/tui/src/lib.rs | 1 + 9 files changed, 97 insertions(+) diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index a9e7883d3..f7f90eeaa 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -64,6 +64,7 @@ use codex_hooks::HooksConfig; use codex_network_proxy::NetworkProxy; use codex_network_proxy::NetworkProxyAuditMetadata; use codex_network_proxy::normalize_host; +use codex_otel::current_span_trace_id; use codex_otel::current_span_w3c_trace_context; use codex_otel::set_parent_from_w3c_trace_context; use codex_protocol::ThreadId; @@ -652,6 +653,7 @@ impl TurnSkillsContext { #[derive(Debug)] pub(crate) struct TurnContext { pub(crate) sub_id: String, + pub(crate) trace_id: Option, pub(crate) realtime_active: bool, pub(crate) config: Arc, pub(crate) auth_manager: Option>, @@ -740,6 +742,7 @@ impl TurnContext { Self { sub_id: self.sub_id.clone(), + trace_id: self.trace_id.clone(), realtime_active: self.realtime_active, config: Arc::new(config), auth_manager: self.auth_manager.clone(), @@ -795,6 +798,7 @@ impl TurnContext { pub(crate) fn to_turn_context_item(&self) -> TurnContextItem { TurnContextItem { turn_id: Some(self.sub_id.clone()), + trace_id: self.trace_id.clone(), cwd: self.cwd.clone(), current_date: self.current_date.clone(), timezone: self.timezone.clone(), @@ -1124,6 +1128,7 @@ impl Session { let (current_date, timezone) = local_time_context(); TurnContext { sub_id, + trace_id: current_span_trace_id(), realtime_active: false, config: per_turn_config.clone(), auth_manager: auth_manager_for_context, @@ -4773,6 +4778,7 @@ async fn spawn_review_thread( let 
review_turn_context = TurnContext { sub_id: review_turn_id, + trace_id: current_span_trace_id(), realtime_active: parent_turn_context.realtime_active, config: per_turn_config, auth_manager: auth_manager_for_context, @@ -6652,6 +6658,7 @@ mod tests { use codex_protocol::ThreadId; use codex_protocol::models::FunctionCallOutputBody; use codex_protocol::models::FunctionCallOutputPayload; + use tracing::Span; use crate::protocol::CompactedItem; use crate::protocol::CreditsSnapshot; @@ -7589,6 +7596,7 @@ mod tests { let previous_model = "forked-rollout-model"; let previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -8563,6 +8571,43 @@ mod tests { assert_eq!(submitted.trace, Some(expected_trace)); } + #[tokio::test] + async fn new_default_turn_captures_current_span_trace_id() { + let (session, _turn_context) = make_session_and_context().await; + + init_test_tracing(); + + let request_parent = W3cTraceContext { + traceparent: Some("00-00000000000000000000000000000011-0000000000000022-01".into()), + tracestate: Some("vendor=value".into()), + }; + let request_span = info_span!("app_server.request"); + assert!(set_parent_from_w3c_trace_context( + &request_span, + &request_parent + )); + + let turn_context_item = async { + let expected_trace_id = Span::current() + .context() + .span() + .span_context() + .trace_id() + .to_string(); + let turn_context = session.new_default_turn().await; + let turn_context_item = turn_context.to_turn_context_item(); + assert_eq!(turn_context_item.trace_id, Some(expected_trace_id)); + turn_context_item + } + .instrument(request_span) + .await; + + assert_eq!( + turn_context_item.trace_id.as_deref(), + Some("00000000000000000000000000000011") + ); + } + #[test] fn submission_dispatch_span_prefers_submission_trace_context() { init_test_tracing(); diff 
--git a/codex-rs/core/src/codex/rollout_reconstruction_tests.rs b/codex-rs/core/src/codex/rollout_reconstruction_tests.rs index 97dbcdd9c..6cc99a290 100644 --- a/codex-rs/core/src/codex/rollout_reconstruction_tests.rs +++ b/codex-rs/core/src/codex/rollout_reconstruction_tests.rs @@ -40,6 +40,7 @@ async fn record_initial_history_resumed_bare_turn_context_does_not_hydrate_previ let previous_model = "previous-rollout-model"; let previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -78,6 +79,7 @@ async fn record_initial_history_resumed_hydrates_previous_turn_settings_from_lif let previous_model = "previous-rollout-model"; let mut previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -739,6 +741,7 @@ async fn record_initial_history_resumed_turn_context_after_compaction_reestablis let previous_model = "previous-rollout-model"; let previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -810,6 +813,7 @@ async fn record_initial_history_resumed_turn_context_after_compaction_reestablis .expect("serialize seeded reference context item"), serde_json::to_value(Some(TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -838,6 +842,7 @@ async fn record_initial_history_resumed_aborted_turn_without_id_clears_active_tu let 
previous_model = "previous-rollout-model"; let previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -943,6 +948,7 @@ async fn record_initial_history_resumed_unmatched_abort_preserves_active_turn_fo let unmatched_abort_turn_id = "other-turn".to_string(); let current_context_item = TurnContextItem { turn_id: Some(current_turn_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -1044,6 +1050,7 @@ async fn record_initial_history_resumed_trailing_incomplete_turn_compaction_clea let previous_model = "previous-rollout-model"; let previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), @@ -1187,6 +1194,7 @@ async fn record_initial_history_resumed_replaced_incomplete_compacted_turn_clear let previous_model = "previous-rollout-model"; let previous_context_item = TurnContextItem { turn_id: Some(turn_context.sub_id.clone()), + trace_id: turn_context.trace_id.clone(), cwd: turn_context.cwd.clone(), current_date: turn_context.current_date.clone(), timezone: turn_context.timezone.clone(), diff --git a/codex-rs/core/src/rollout/recorder.rs b/codex-rs/core/src/rollout/recorder.rs index 23edc57ae..e577abdef 100644 --- a/codex-rs/core/src/rollout/recorder.rs +++ b/codex-rs/core/src/rollout/recorder.rs @@ -1404,6 +1404,7 @@ mod tests { timestamp: "2025-01-03T13:00:01Z".to_string(), item: RolloutItem::TurnContext(TurnContextItem { turn_id: Some("turn-1".to_string()), + trace_id: None, cwd: latest_cwd.clone(), current_date: None, timezone: None, diff --git 
a/codex-rs/core/tests/suite/resume_warning.rs b/codex-rs/core/tests/suite/resume_warning.rs index ca40c0c19..fcf2bf8e0 100644 --- a/codex-rs/core/tests/suite/resume_warning.rs +++ b/codex-rs/core/tests/suite/resume_warning.rs @@ -27,6 +27,7 @@ fn resume_history( let turn_id = "resume-warning-seed-turn".to_string(); let turn_ctx = TurnContextItem { turn_id: Some(turn_id.clone()), + trace_id: None, cwd: config.cwd.clone(), current_date: None, timezone: None, diff --git a/codex-rs/otel/src/lib.rs b/codex-rs/otel/src/lib.rs index 217099524..daae8ce03 100644 --- a/codex-rs/otel/src/lib.rs +++ b/codex-rs/otel/src/lib.rs @@ -26,6 +26,7 @@ pub use crate::metrics::runtime_metrics::RuntimeMetricTotals; pub use crate::metrics::runtime_metrics::RuntimeMetricsSummary; pub use crate::otel_provider::traceparent_context_from_env; pub use crate::trace_context::context_from_w3c_trace_context; +pub use crate::trace_context::current_span_trace_id; pub use crate::trace_context::current_span_w3c_trace_context; pub use crate::trace_context::set_parent_from_context; pub use crate::trace_context::set_parent_from_w3c_trace_context; diff --git a/codex-rs/otel/src/trace_context.rs b/codex-rs/otel/src/trace_context.rs index 3ef38062d..f3ce0dd52 100644 --- a/codex-rs/otel/src/trace_context.rs +++ b/codex-rs/otel/src/trace_context.rs @@ -23,6 +23,17 @@ pub fn current_span_w3c_trace_context() -> Option { }) } +pub fn current_span_trace_id() -> Option { + let context = Span::current().context(); + let span = context.span(); + let span_context = span.span_context(); + if !span_context.is_valid() { + return None; + } + + Some(span_context.trace_id().to_string()) +} + pub fn context_from_w3c_trace_context(trace: &W3cTraceContext) -> Option { context_from_trace_headers(trace.traceparent.as_deref(), trace.tracestate.as_deref()) } @@ -62,11 +73,17 @@ pub(crate) fn context_from_trace_headers( mod tests { use super::context_from_trace_headers; use super::context_from_w3c_trace_context; + use 
super::current_span_trace_id; use codex_protocol::protocol::W3cTraceContext; use opentelemetry::trace::SpanId; use opentelemetry::trace::TraceContextExt; use opentelemetry::trace::TraceId; + use opentelemetry::trace::TracerProvider as _; + use opentelemetry_sdk::trace::SdkTracerProvider; use pretty_assertions::assert_eq; + use tracing::trace_span; + use tracing_subscriber::layer::SubscriberExt; + use tracing_subscriber::util::SubscriberInitExt; #[test] fn parses_valid_w3c_trace_context() { @@ -103,4 +120,21 @@ mod tests { .is_none() ); } + + #[test] + fn current_span_trace_id_returns_hex_trace_id() { + let provider = SdkTracerProvider::builder().build(); + let tracer = provider.tracer("codex-otel-tests"); + let subscriber = + tracing_subscriber::registry().with(tracing_opentelemetry::layer().with_tracer(tracer)); + let _guard = subscriber.set_default(); + + let span = trace_span!("test_span"); + let _entered = span.enter(); + let trace_id = current_span_trace_id().expect("trace id"); + + assert_eq!(trace_id.len(), 32); + assert!(trace_id.chars().all(|ch| ch.is_ascii_hexdigit())); + assert_ne!(trace_id, "00000000000000000000000000000000"); + } } diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 3d3abaf82..8e7e9881e 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -2220,6 +2220,8 @@ pub struct TurnContextNetworkItem { pub struct TurnContextItem { #[serde(default, skip_serializing_if = "Option::is_none")] pub turn_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub trace_id: Option, pub cwd: PathBuf, #[serde(default, skip_serializing_if = "Option::is_none")] pub current_date: Option, @@ -3509,6 +3511,7 @@ mod tests { "summary": "auto", }))?; + assert_eq!(item.trace_id, None); assert_eq!(item.network, None); Ok(()) } @@ -3517,6 +3520,7 @@ mod tests { fn turn_context_item_serializes_network_when_present() -> Result<()> { let item = TurnContextItem { turn_id: None, 
+ trace_id: None, cwd: PathBuf::from("/tmp"), current_date: None, timezone: None, diff --git a/codex-rs/state/src/extract.rs b/codex-rs/state/src/extract.rs index 54ef7ea66..1a7fb3724 100644 --- a/codex-rs/state/src/extract.rs +++ b/codex-rs/state/src/extract.rs @@ -252,6 +252,7 @@ mod tests { &mut metadata, &RolloutItem::TurnContext(TurnContextItem { turn_id: Some("turn-1".to_string()), + trace_id: None, cwd: PathBuf::from("/parent/workspace"), current_date: None, timezone: None, @@ -289,6 +290,7 @@ mod tests { &mut metadata, &RolloutItem::TurnContext(TurnContextItem { turn_id: Some("turn-1".to_string()), + trace_id: None, cwd: PathBuf::from("/fallback/workspace"), current_date: None, timezone: None, diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs index b9a1a5c47..4291881fe 100644 --- a/codex-rs/tui/src/lib.rs +++ b/codex-rs/tui/src/lib.rs @@ -1271,6 +1271,7 @@ mod tests { .unwrap_or_else(|| "gpt-5.1".to_string()); TurnContextItem { turn_id: None, + trace_id: None, cwd, current_date: None, timezone: None,