feat(core): persist trace_id for turns in RolloutItem::TurnContext (#13602)
This PR adds a durable trace linkage for each turn by storing the active trace ID on the `TurnContext` record persisted in session rollout files.

Before this change, we propagated trace context at runtime but didn’t persist a stable per-turn trace key in rollout history. That made after-the-fact debugging harder (for example, mapping a historical turn to the corresponding trace in Datadog). This sets us up for much easier debugging in the future.

### What changed

- Added an optional `trace_id` to `TurnContextItem` (rollout schema).
- Added a small OTEL helper to read the current span trace ID.
- Captured `trace_id` when creating `TurnContext` and included it in `to_turn_context_item()`.
- Updated tests and fixtures that construct `TurnContextItem` so older/no-trace cases still work.

### Why this approach

`TurnContext` is already the canonical durable per-turn metadata in rollout. This keeps ownership clean: trace linkage lives with other persisted turn metadata.
This commit is contained in:
parent
cfbbbb1dda
commit
aa3fe8abf8
9 changed files with 97 additions and 0 deletions
|
|
@ -64,6 +64,7 @@ use codex_hooks::HooksConfig;
|
|||
use codex_network_proxy::NetworkProxy;
|
||||
use codex_network_proxy::NetworkProxyAuditMetadata;
|
||||
use codex_network_proxy::normalize_host;
|
||||
use codex_otel::current_span_trace_id;
|
||||
use codex_otel::current_span_w3c_trace_context;
|
||||
use codex_otel::set_parent_from_w3c_trace_context;
|
||||
use codex_protocol::ThreadId;
|
||||
|
|
@ -652,6 +653,7 @@ impl TurnSkillsContext {
|
|||
#[derive(Debug)]
|
||||
pub(crate) struct TurnContext {
|
||||
pub(crate) sub_id: String,
|
||||
pub(crate) trace_id: Option<String>,
|
||||
pub(crate) realtime_active: bool,
|
||||
pub(crate) config: Arc<Config>,
|
||||
pub(crate) auth_manager: Option<Arc<AuthManager>>,
|
||||
|
|
@ -740,6 +742,7 @@ impl TurnContext {
|
|||
|
||||
Self {
|
||||
sub_id: self.sub_id.clone(),
|
||||
trace_id: self.trace_id.clone(),
|
||||
realtime_active: self.realtime_active,
|
||||
config: Arc::new(config),
|
||||
auth_manager: self.auth_manager.clone(),
|
||||
|
|
@ -795,6 +798,7 @@ impl TurnContext {
|
|||
pub(crate) fn to_turn_context_item(&self) -> TurnContextItem {
|
||||
TurnContextItem {
|
||||
turn_id: Some(self.sub_id.clone()),
|
||||
trace_id: self.trace_id.clone(),
|
||||
cwd: self.cwd.clone(),
|
||||
current_date: self.current_date.clone(),
|
||||
timezone: self.timezone.clone(),
|
||||
|
|
@ -1124,6 +1128,7 @@ impl Session {
|
|||
let (current_date, timezone) = local_time_context();
|
||||
TurnContext {
|
||||
sub_id,
|
||||
trace_id: current_span_trace_id(),
|
||||
realtime_active: false,
|
||||
config: per_turn_config.clone(),
|
||||
auth_manager: auth_manager_for_context,
|
||||
|
|
@ -4773,6 +4778,7 @@ async fn spawn_review_thread(
|
|||
|
||||
let review_turn_context = TurnContext {
|
||||
sub_id: review_turn_id,
|
||||
trace_id: current_span_trace_id(),
|
||||
realtime_active: parent_turn_context.realtime_active,
|
||||
config: per_turn_config,
|
||||
auth_manager: auth_manager_for_context,
|
||||
|
|
@ -6652,6 +6658,7 @@ mod tests {
|
|||
use codex_protocol::ThreadId;
|
||||
use codex_protocol::models::FunctionCallOutputBody;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use tracing::Span;
|
||||
|
||||
use crate::protocol::CompactedItem;
|
||||
use crate::protocol::CreditsSnapshot;
|
||||
|
|
@ -7589,6 +7596,7 @@ mod tests {
|
|||
let previous_model = "forked-rollout-model";
|
||||
let previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -8563,6 +8571,43 @@ mod tests {
|
|||
assert_eq!(submitted.trace, Some(expected_trace));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn new_default_turn_captures_current_span_trace_id() {
|
||||
let (session, _turn_context) = make_session_and_context().await;
|
||||
|
||||
init_test_tracing();
|
||||
|
||||
let request_parent = W3cTraceContext {
|
||||
traceparent: Some("00-00000000000000000000000000000011-0000000000000022-01".into()),
|
||||
tracestate: Some("vendor=value".into()),
|
||||
};
|
||||
let request_span = info_span!("app_server.request");
|
||||
assert!(set_parent_from_w3c_trace_context(
|
||||
&request_span,
|
||||
&request_parent
|
||||
));
|
||||
|
||||
let turn_context_item = async {
|
||||
let expected_trace_id = Span::current()
|
||||
.context()
|
||||
.span()
|
||||
.span_context()
|
||||
.trace_id()
|
||||
.to_string();
|
||||
let turn_context = session.new_default_turn().await;
|
||||
let turn_context_item = turn_context.to_turn_context_item();
|
||||
assert_eq!(turn_context_item.trace_id, Some(expected_trace_id));
|
||||
turn_context_item
|
||||
}
|
||||
.instrument(request_span)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
turn_context_item.trace_id.as_deref(),
|
||||
Some("00000000000000000000000000000011")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn submission_dispatch_span_prefers_submission_trace_context() {
|
||||
init_test_tracing();
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ async fn record_initial_history_resumed_bare_turn_context_does_not_hydrate_previ
|
|||
let previous_model = "previous-rollout-model";
|
||||
let previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -78,6 +79,7 @@ async fn record_initial_history_resumed_hydrates_previous_turn_settings_from_lif
|
|||
let previous_model = "previous-rollout-model";
|
||||
let mut previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -739,6 +741,7 @@ async fn record_initial_history_resumed_turn_context_after_compaction_reestablis
|
|||
let previous_model = "previous-rollout-model";
|
||||
let previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -810,6 +813,7 @@ async fn record_initial_history_resumed_turn_context_after_compaction_reestablis
|
|||
.expect("serialize seeded reference context item"),
|
||||
serde_json::to_value(Some(TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -838,6 +842,7 @@ async fn record_initial_history_resumed_aborted_turn_without_id_clears_active_tu
|
|||
let previous_model = "previous-rollout-model";
|
||||
let previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -943,6 +948,7 @@ async fn record_initial_history_resumed_unmatched_abort_preserves_active_turn_fo
|
|||
let unmatched_abort_turn_id = "other-turn".to_string();
|
||||
let current_context_item = TurnContextItem {
|
||||
turn_id: Some(current_turn_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -1044,6 +1050,7 @@ async fn record_initial_history_resumed_trailing_incomplete_turn_compaction_clea
|
|||
let previous_model = "previous-rollout-model";
|
||||
let previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
@ -1187,6 +1194,7 @@ async fn record_initial_history_resumed_replaced_incomplete_compacted_turn_clear
|
|||
let previous_model = "previous-rollout-model";
|
||||
let previous_context_item = TurnContextItem {
|
||||
turn_id: Some(turn_context.sub_id.clone()),
|
||||
trace_id: turn_context.trace_id.clone(),
|
||||
cwd: turn_context.cwd.clone(),
|
||||
current_date: turn_context.current_date.clone(),
|
||||
timezone: turn_context.timezone.clone(),
|
||||
|
|
|
|||
|
|
@ -1404,6 +1404,7 @@ mod tests {
|
|||
timestamp: "2025-01-03T13:00:01Z".to_string(),
|
||||
item: RolloutItem::TurnContext(TurnContextItem {
|
||||
turn_id: Some("turn-1".to_string()),
|
||||
trace_id: None,
|
||||
cwd: latest_cwd.clone(),
|
||||
current_date: None,
|
||||
timezone: None,
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ fn resume_history(
|
|||
let turn_id = "resume-warning-seed-turn".to_string();
|
||||
let turn_ctx = TurnContextItem {
|
||||
turn_id: Some(turn_id.clone()),
|
||||
trace_id: None,
|
||||
cwd: config.cwd.clone(),
|
||||
current_date: None,
|
||||
timezone: None,
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ pub use crate::metrics::runtime_metrics::RuntimeMetricTotals;
|
|||
pub use crate::metrics::runtime_metrics::RuntimeMetricsSummary;
|
||||
pub use crate::otel_provider::traceparent_context_from_env;
|
||||
pub use crate::trace_context::context_from_w3c_trace_context;
|
||||
pub use crate::trace_context::current_span_trace_id;
|
||||
pub use crate::trace_context::current_span_w3c_trace_context;
|
||||
pub use crate::trace_context::set_parent_from_context;
|
||||
pub use crate::trace_context::set_parent_from_w3c_trace_context;
|
||||
|
|
|
|||
|
|
@ -23,6 +23,17 @@ pub fn current_span_w3c_trace_context() -> Option<W3cTraceContext> {
|
|||
})
|
||||
}
|
||||
|
||||
pub fn current_span_trace_id() -> Option<String> {
|
||||
let context = Span::current().context();
|
||||
let span = context.span();
|
||||
let span_context = span.span_context();
|
||||
if !span_context.is_valid() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(span_context.trace_id().to_string())
|
||||
}
|
||||
|
||||
pub fn context_from_w3c_trace_context(trace: &W3cTraceContext) -> Option<Context> {
|
||||
context_from_trace_headers(trace.traceparent.as_deref(), trace.tracestate.as_deref())
|
||||
}
|
||||
|
|
@ -62,11 +73,17 @@ pub(crate) fn context_from_trace_headers(
|
|||
mod tests {
|
||||
use super::context_from_trace_headers;
|
||||
use super::context_from_w3c_trace_context;
|
||||
use super::current_span_trace_id;
|
||||
use codex_protocol::protocol::W3cTraceContext;
|
||||
use opentelemetry::trace::SpanId;
|
||||
use opentelemetry::trace::TraceContextExt;
|
||||
use opentelemetry::trace::TraceId;
|
||||
use opentelemetry::trace::TracerProvider as _;
|
||||
use opentelemetry_sdk::trace::SdkTracerProvider;
|
||||
use pretty_assertions::assert_eq;
|
||||
use tracing::trace_span;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::util::SubscriberInitExt;
|
||||
|
||||
#[test]
|
||||
fn parses_valid_w3c_trace_context() {
|
||||
|
|
@ -103,4 +120,21 @@ mod tests {
|
|||
.is_none()
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that `current_span_trace_id` returns a non-zero, 32-character
/// hexadecimal string while a span is entered under an OpenTelemetry-backed
/// subscriber.
#[test]
fn current_span_trace_id_returns_hex_trace_id() {
    // Install a thread-local subscriber with an OpenTelemetry layer; the guard
    // keeps it in place for the rest of the test.
    let tracer_provider = SdkTracerProvider::builder().build();
    let otel_layer =
        tracing_opentelemetry::layer().with_tracer(tracer_provider.tracer("codex-otel-tests"));
    let _guard = tracing_subscriber::registry().with(otel_layer).set_default();

    let test_span = trace_span!("test_span");
    let _entered = test_span.enter();
    let observed = current_span_trace_id().expect("trace id");

    assert_eq!(observed.len(), 32);
    assert!(observed.chars().all(|digit| digit.is_ascii_hexdigit()));
    assert_ne!(observed, "00000000000000000000000000000000");
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2220,6 +2220,8 @@ pub struct TurnContextNetworkItem {
|
|||
pub struct TurnContextItem {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub turn_id: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub trace_id: Option<String>,
|
||||
pub cwd: PathBuf,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub current_date: Option<String>,
|
||||
|
|
@ -3509,6 +3511,7 @@ mod tests {
|
|||
"summary": "auto",
|
||||
}))?;
|
||||
|
||||
assert_eq!(item.trace_id, None);
|
||||
assert_eq!(item.network, None);
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -3517,6 +3520,7 @@ mod tests {
|
|||
fn turn_context_item_serializes_network_when_present() -> Result<()> {
|
||||
let item = TurnContextItem {
|
||||
turn_id: None,
|
||||
trace_id: None,
|
||||
cwd: PathBuf::from("/tmp"),
|
||||
current_date: None,
|
||||
timezone: None,
|
||||
|
|
|
|||
|
|
@ -252,6 +252,7 @@ mod tests {
|
|||
&mut metadata,
|
||||
&RolloutItem::TurnContext(TurnContextItem {
|
||||
turn_id: Some("turn-1".to_string()),
|
||||
trace_id: None,
|
||||
cwd: PathBuf::from("/parent/workspace"),
|
||||
current_date: None,
|
||||
timezone: None,
|
||||
|
|
@ -289,6 +290,7 @@ mod tests {
|
|||
&mut metadata,
|
||||
&RolloutItem::TurnContext(TurnContextItem {
|
||||
turn_id: Some("turn-1".to_string()),
|
||||
trace_id: None,
|
||||
cwd: PathBuf::from("/fallback/workspace"),
|
||||
current_date: None,
|
||||
timezone: None,
|
||||
|
|
|
|||
|
|
@ -1271,6 +1271,7 @@ mod tests {
|
|||
.unwrap_or_else(|| "gpt-5.1".to_string());
|
||||
TurnContextItem {
|
||||
turn_id: None,
|
||||
trace_id: None,
|
||||
cwd,
|
||||
current_date: None,
|
||||
timezone: None,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue