feat(core): persist trace_id for turns in RolloutItem::TurnContext (#13602)

This PR adds a durable trace linkage for each turn by storing the active
trace ID on the rollout TurnContext record stored in session rollout
files.

Before this change, we propagated trace context at runtime but didn't
persist a stable per-turn trace key in rollout history. That made
after-the-fact debugging harder (for example, mapping a historical turn
to the corresponding trace in Datadog). This sets us up for much easier
debugging in the future.

### What changed
- Added an optional `trace_id` to TurnContextItem (rollout schema).
- Added a small OTEL helper to read the current span trace ID.
- Captured `trace_id` when creating `TurnContext` and included it in
`to_turn_context_item()`.
- Updated tests and fixtures that construct TurnContextItem so
older/no-trace cases still work.

### Why this approach
TurnContext is already the canonical durable per-turn metadata in
rollout. This keeps ownership clean: trace linkage lives with other
persisted turn metadata.
This commit is contained in:
Owen Lin 2026-03-05 13:26:48 -08:00 committed by GitHub
parent cfbbbb1dda
commit aa3fe8abf8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 97 additions and 0 deletions

View file

@ -64,6 +64,7 @@ use codex_hooks::HooksConfig;
use codex_network_proxy::NetworkProxy;
use codex_network_proxy::NetworkProxyAuditMetadata;
use codex_network_proxy::normalize_host;
use codex_otel::current_span_trace_id;
use codex_otel::current_span_w3c_trace_context;
use codex_otel::set_parent_from_w3c_trace_context;
use codex_protocol::ThreadId;
@ -652,6 +653,7 @@ impl TurnSkillsContext {
#[derive(Debug)]
pub(crate) struct TurnContext {
pub(crate) sub_id: String,
pub(crate) trace_id: Option<String>,
pub(crate) realtime_active: bool,
pub(crate) config: Arc<Config>,
pub(crate) auth_manager: Option<Arc<AuthManager>>,
@ -740,6 +742,7 @@ impl TurnContext {
Self {
sub_id: self.sub_id.clone(),
trace_id: self.trace_id.clone(),
realtime_active: self.realtime_active,
config: Arc::new(config),
auth_manager: self.auth_manager.clone(),
@ -795,6 +798,7 @@ impl TurnContext {
pub(crate) fn to_turn_context_item(&self) -> TurnContextItem {
TurnContextItem {
turn_id: Some(self.sub_id.clone()),
trace_id: self.trace_id.clone(),
cwd: self.cwd.clone(),
current_date: self.current_date.clone(),
timezone: self.timezone.clone(),
@ -1124,6 +1128,7 @@ impl Session {
let (current_date, timezone) = local_time_context();
TurnContext {
sub_id,
trace_id: current_span_trace_id(),
realtime_active: false,
config: per_turn_config.clone(),
auth_manager: auth_manager_for_context,
@ -4773,6 +4778,7 @@ async fn spawn_review_thread(
let review_turn_context = TurnContext {
sub_id: review_turn_id,
trace_id: current_span_trace_id(),
realtime_active: parent_turn_context.realtime_active,
config: per_turn_config,
auth_manager: auth_manager_for_context,
@ -6652,6 +6658,7 @@ mod tests {
use codex_protocol::ThreadId;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::FunctionCallOutputPayload;
use tracing::Span;
use crate::protocol::CompactedItem;
use crate::protocol::CreditsSnapshot;
@ -7589,6 +7596,7 @@ mod tests {
let previous_model = "forked-rollout-model";
let previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -8563,6 +8571,43 @@ mod tests {
assert_eq!(submitted.trace, Some(expected_trace));
}
// Verifies that a turn created inside an instrumented span records that span's
// trace ID, and that the ID survives into the `TurnContextItem` produced by
// `to_turn_context_item()`.
#[tokio::test]
async fn new_default_turn_captures_current_span_trace_id() {
let (session, _turn_context) = make_session_and_context().await;
init_test_tracing();
// Synthetic W3C parent: the second dash-separated field of `traceparent`
// is the trace ID that the final assertion below expects to see.
let request_parent = W3cTraceContext {
traceparent: Some("00-00000000000000000000000000000011-0000000000000022-01".into()),
tracestate: Some("vendor=value".into()),
};
let request_span = info_span!("app_server.request");
// Attaching the parent context must report success, otherwise the rest of
// the test would not be exercising a known trace ID.
assert!(set_parent_from_w3c_trace_context(
&request_span,
&request_parent
));
// Everything inside this async block runs instrumented with `request_span`
// (see `.instrument(request_span)` below), so `Span::current()` is read
// while that span is the active one.
let turn_context_item = async {
// Read the trace ID straight from the current span so it can be compared
// with what the new turn captured.
let expected_trace_id = Span::current()
.context()
.span()
.span_context()
.trace_id()
.to_string();
let turn_context = session.new_default_turn().await;
let turn_context_item = turn_context.to_turn_context_item();
assert_eq!(turn_context_item.trace_id, Some(expected_trace_id));
turn_context_item
}
.instrument(request_span)
.await;
// The persisted trace ID must equal the trace ID from the injected
// `traceparent` header.
assert_eq!(
turn_context_item.trace_id.as_deref(),
Some("00000000000000000000000000000011")
);
}
#[test]
fn submission_dispatch_span_prefers_submission_trace_context() {
init_test_tracing();

View file

@ -40,6 +40,7 @@ async fn record_initial_history_resumed_bare_turn_context_does_not_hydrate_previ
let previous_model = "previous-rollout-model";
let previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -78,6 +79,7 @@ async fn record_initial_history_resumed_hydrates_previous_turn_settings_from_lif
let previous_model = "previous-rollout-model";
let mut previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -739,6 +741,7 @@ async fn record_initial_history_resumed_turn_context_after_compaction_reestablis
let previous_model = "previous-rollout-model";
let previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -810,6 +813,7 @@ async fn record_initial_history_resumed_turn_context_after_compaction_reestablis
.expect("serialize seeded reference context item"),
serde_json::to_value(Some(TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -838,6 +842,7 @@ async fn record_initial_history_resumed_aborted_turn_without_id_clears_active_tu
let previous_model = "previous-rollout-model";
let previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -943,6 +948,7 @@ async fn record_initial_history_resumed_unmatched_abort_preserves_active_turn_fo
let unmatched_abort_turn_id = "other-turn".to_string();
let current_context_item = TurnContextItem {
turn_id: Some(current_turn_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -1044,6 +1050,7 @@ async fn record_initial_history_resumed_trailing_incomplete_turn_compaction_clea
let previous_model = "previous-rollout-model";
let previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),
@ -1187,6 +1194,7 @@ async fn record_initial_history_resumed_replaced_incomplete_compacted_turn_clear
let previous_model = "previous-rollout-model";
let previous_context_item = TurnContextItem {
turn_id: Some(turn_context.sub_id.clone()),
trace_id: turn_context.trace_id.clone(),
cwd: turn_context.cwd.clone(),
current_date: turn_context.current_date.clone(),
timezone: turn_context.timezone.clone(),

View file

@ -1404,6 +1404,7 @@ mod tests {
timestamp: "2025-01-03T13:00:01Z".to_string(),
item: RolloutItem::TurnContext(TurnContextItem {
turn_id: Some("turn-1".to_string()),
trace_id: None,
cwd: latest_cwd.clone(),
current_date: None,
timezone: None,

View file

@ -27,6 +27,7 @@ fn resume_history(
let turn_id = "resume-warning-seed-turn".to_string();
let turn_ctx = TurnContextItem {
turn_id: Some(turn_id.clone()),
trace_id: None,
cwd: config.cwd.clone(),
current_date: None,
timezone: None,

View file

@ -26,6 +26,7 @@ pub use crate::metrics::runtime_metrics::RuntimeMetricTotals;
pub use crate::metrics::runtime_metrics::RuntimeMetricsSummary;
pub use crate::otel_provider::traceparent_context_from_env;
pub use crate::trace_context::context_from_w3c_trace_context;
pub use crate::trace_context::current_span_trace_id;
pub use crate::trace_context::current_span_w3c_trace_context;
pub use crate::trace_context::set_parent_from_context;
pub use crate::trace_context::set_parent_from_w3c_trace_context;

View file

@ -23,6 +23,17 @@ pub fn current_span_w3c_trace_context() -> Option<W3cTraceContext> {
})
}
/// Returns the trace ID carried by the current `tracing` span, rendered as a string.
///
/// The ID is read from the span context attached to `Span::current()`.
/// Returns `None` when that span context reports itself as not valid, so
/// callers never receive a placeholder ID.
pub fn current_span_trace_id() -> Option<String> {
    let otel_context = Span::current().context();
    let span_ref = otel_context.span();
    let span_context = span_ref.span_context();
    // Only format the trace ID when the span context is valid.
    span_context
        .is_valid()
        .then(|| span_context.trace_id().to_string())
}
/// Converts a [`W3cTraceContext`] into a `Context`.
///
/// Borrows the optional `traceparent` and `tracestate` strings from `trace`
/// and delegates to `context_from_trace_headers`, returning its result
/// unchanged.
pub fn context_from_w3c_trace_context(trace: &W3cTraceContext) -> Option<Context> {
    let traceparent = trace.traceparent.as_deref();
    let tracestate = trace.tracestate.as_deref();
    context_from_trace_headers(traceparent, tracestate)
}
@ -62,11 +73,17 @@ pub(crate) fn context_from_trace_headers(
mod tests {
use super::context_from_trace_headers;
use super::context_from_w3c_trace_context;
use super::current_span_trace_id;
use codex_protocol::protocol::W3cTraceContext;
use opentelemetry::trace::SpanId;
use opentelemetry::trace::TraceContextExt;
use opentelemetry::trace::TraceId;
use opentelemetry::trace::TracerProvider as _;
use opentelemetry_sdk::trace::SdkTracerProvider;
use pretty_assertions::assert_eq;
use tracing::trace_span;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
#[test]
fn parses_valid_w3c_trace_context() {
@ -103,4 +120,21 @@ mod tests {
.is_none()
);
}
// Verifies that `current_span_trace_id` yields a 32-character, non-zero hex
// string when called inside a span backed by an OpenTelemetry tracer.
#[test]
fn current_span_trace_id_returns_hex_trace_id() {
// Build a tracer from an SDK provider and attach it to a tracing registry
// through the tracing-opentelemetry layer.
let provider = SdkTracerProvider::builder().build();
let tracer = provider.tracer("codex-otel-tests");
let subscriber =
tracing_subscriber::registry().with(tracing_opentelemetry::layer().with_tracer(tracer));
// Bound to a named variable so the guard lives until the end of the test
// (NOTE(review): presumably the subscriber is uninstalled when the guard
// drops — confirm against tracing-subscriber docs).
let _guard = subscriber.set_default();
let span = trace_span!("test_span");
// Likewise held for the rest of the test so the span stays entered while
// the trace ID is read.
let _entered = span.enter();
let trace_id = current_span_trace_id().expect("trace id");
// Expected shape: exactly 32 ASCII hex digits, and not all zeros.
assert_eq!(trace_id.len(), 32);
assert!(trace_id.chars().all(|ch| ch.is_ascii_hexdigit()));
assert_ne!(trace_id, "00000000000000000000000000000000");
}
}

View file

@ -2220,6 +2220,8 @@ pub struct TurnContextNetworkItem {
pub struct TurnContextItem {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub turn_id: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub trace_id: Option<String>,
pub cwd: PathBuf,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub current_date: Option<String>,
@ -3509,6 +3511,7 @@ mod tests {
"summary": "auto",
}))?;
assert_eq!(item.trace_id, None);
assert_eq!(item.network, None);
Ok(())
}
@ -3517,6 +3520,7 @@ mod tests {
fn turn_context_item_serializes_network_when_present() -> Result<()> {
let item = TurnContextItem {
turn_id: None,
trace_id: None,
cwd: PathBuf::from("/tmp"),
current_date: None,
timezone: None,

View file

@ -252,6 +252,7 @@ mod tests {
&mut metadata,
&RolloutItem::TurnContext(TurnContextItem {
turn_id: Some("turn-1".to_string()),
trace_id: None,
cwd: PathBuf::from("/parent/workspace"),
current_date: None,
timezone: None,
@ -289,6 +290,7 @@ mod tests {
&mut metadata,
&RolloutItem::TurnContext(TurnContextItem {
turn_id: Some("turn-1".to_string()),
trace_id: None,
cwd: PathBuf::from("/fallback/workspace"),
current_date: None,
timezone: None,

View file

@ -1271,6 +1271,7 @@ mod tests {
.unwrap_or_else(|| "gpt-5.1".to_string());
TurnContextItem {
turn_id: None,
trace_id: None,
cwd,
current_date: None,
timezone: None,