feat(otel): Centralize OTEL metric names and shared tag builders (#14117)

This cleans up a bunch of metric plumbing that had started to drift.

The main change is making `codex-otel` the canonical home for shared
metric definitions and metric tag helpers. I moved the `turn/thread`
metric names that were still duplicated into the OTEL metric registry,
added a shared `metrics::tags` module for common tag keys and session
tag construction, and updated `SessionTelemetry` to build its metadata
tags through that shared path.

On the `codex-core` side, TTFT/TTFM now use the shared metric-name
constants instead of local string definitions. I also switched the
obvious remaining turn/thread metric call sites over to the shared
constants, and added a small helper so TTFT/TTFM can attach an optional
sanitized `client.name` tag from `TurnContext`.

This should make follow-on telemetry work less ad hoc:
- one canonical place for metric names
- one canonical place for common metric tag keys/builders
- less duplication between `codex-core` and `codex-otel`
This commit is contained in:
Owen Lin 2026-03-09 12:46:42 -07:00 committed by GitHub
parent 6ad448b658
commit da991bdf3a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 156 additions and 67 deletions

View file

@ -305,6 +305,7 @@ use crate::windows_sandbox::WindowsSandboxLevelExt;
use codex_async_utils::OrCancelExt;
use codex_otel::SessionTelemetry;
use codex_otel::TelemetryAuthMode;
use codex_otel::metrics::names::THREAD_STARTED_METRIC;
use codex_protocol::config_types::CollaborationMode;
use codex_protocol::config_types::Personality;
use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig;
@ -1412,7 +1413,7 @@ impl Session {
};
config.features.emit_metrics(&session_telemetry);
session_telemetry.counter(
"codex.thread.started",
THREAD_STARTED_METRIC,
1,
&[(
"is_git",

View file

@ -33,6 +33,9 @@ use crate::protocol::TurnCompleteEvent;
use crate::state::ActiveTurn;
use crate::state::RunningTask;
use crate::state::TaskKind;
use codex_otel::metrics::names::TURN_E2E_DURATION_METRIC;
use codex_otel::metrics::names::TURN_TOKEN_USAGE_METRIC;
use codex_otel::metrics::names::TURN_TOOL_CALL_METRIC;
use codex_protocol::items::TurnItem;
use codex_protocol::models::ContentItem;
use codex_protocol::models::ResponseInputItem;
@ -145,7 +148,7 @@ impl Session {
let timer = turn_context
.session_telemetry
.start_timer("codex.turn.e2e_duration_ms", &[])
.start_timer(TURN_E2E_DURATION_METRIC, &[])
.ok();
let done_clone = Arc::clone(&done);
@ -278,7 +281,7 @@ impl Session {
},
);
self.services.session_telemetry.histogram(
"codex.turn.tool.call",
TURN_TOOL_CALL_METRIC,
i64::try_from(turn_tool_calls).unwrap_or(i64::MAX),
&[tmp_mem],
);
@ -301,27 +304,27 @@ impl Session {
.max(0),
};
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.total_tokens,
&[("token_type", "total"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.input_tokens,
&[("token_type", "input"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.cached_input(),
&[("token_type", "cached_input"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.output_tokens,
&[("token_type", "output"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.reasoning_output_tokens,
&[("token_type", "reasoning_output"), tmp_mem],
);

View file

@ -1,6 +1,8 @@
use std::time::Duration;
use std::time::Instant;
use codex_otel::metrics::names::TURN_TTFM_DURATION_METRIC;
use codex_otel::metrics::names::TURN_TTFT_DURATION_METRIC;
use codex_protocol::items::TurnItem;
use codex_protocol::models::ResponseItem;
use tokio::sync::Mutex;
@ -9,9 +11,6 @@ use crate::ResponseEvent;
use crate::codex::TurnContext;
use crate::stream_events_utils::raw_assistant_output_text_from_item;
const TURN_TTFT_DURATION_METRIC: &str = "codex.turn.ttft.duration_ms";
const TURN_TTFM_DURATION_METRIC: &str = "codex.turn.ttfm.duration_ms";
pub(crate) async fn record_turn_ttft_metric(turn_context: &TurnContext, event: &ResponseEvent) {
let Some(duration) = turn_context
.turn_timing_state

View file

@ -24,9 +24,8 @@ use crate::metrics::names::WEBSOCKET_EVENT_DURATION_METRIC;
use crate::metrics::names::WEBSOCKET_REQUEST_COUNT_METRIC;
use crate::metrics::names::WEBSOCKET_REQUEST_DURATION_METRIC;
use crate::metrics::runtime_metrics::RuntimeMetricsSummary;
use crate::metrics::tags::SessionMetricTagValues;
use crate::metrics::timer::Timer;
use crate::metrics::validation::validate_tag_key;
use crate::metrics::validation::validate_tag_value;
use crate::provider::OtelProvider;
use crate::sanitize_metric_tag_value;
use codex_api::ApiError;
@ -228,40 +227,15 @@ impl SessionTelemetry {
if !self.metrics_use_metadata_tags {
return Ok(Vec::new());
}
let mut tags = Vec::with_capacity(7);
Self::push_metadata_tag(&mut tags, "auth_mode", self.metadata.auth_mode.as_deref())?;
Self::push_metadata_tag(
&mut tags,
"session_source",
Some(self.metadata.session_source.as_str()),
)?;
Self::push_metadata_tag(
&mut tags,
"originator",
Some(self.metadata.originator.as_str()),
)?;
Self::push_metadata_tag(
&mut tags,
"service_name",
self.metadata.service_name.as_deref(),
)?;
Self::push_metadata_tag(&mut tags, "model", Some(self.metadata.model.as_str()))?;
Self::push_metadata_tag(&mut tags, "app.version", Some(self.metadata.app_version))?;
Ok(tags)
}
fn push_metadata_tag<'a>(
tags: &mut Vec<(&'a str, &'a str)>,
key: &'static str,
value: Option<&'a str>,
) -> MetricsResult<()> {
let Some(value) = value else {
return Ok(());
};
validate_tag_key(key)?;
validate_tag_value(value)?;
tags.push((key, value));
Ok(())
SessionMetricTagValues {
auth_mode: self.metadata.auth_mode.as_deref(),
session_source: self.metadata.session_source.as_str(),
originator: self.metadata.originator.as_str(),
service_name: self.metadata.service_name.as_deref(),
model: self.metadata.model.as_str(),
app_version: self.metadata.app_version,
}
.into_tags()
}
#[allow(clippy::too_many_arguments)]

View file

@ -1,8 +1,9 @@
mod client;
mod config;
mod error;
pub(crate) mod names;
pub mod names;
pub(crate) mod runtime_metrics;
pub mod tags;
pub(crate) mod timer;
pub(crate) mod validation;

View file

@ -1,24 +1,27 @@
pub(crate) const TOOL_CALL_COUNT_METRIC: &str = "codex.tool.call";
pub(crate) const TOOL_CALL_DURATION_METRIC: &str = "codex.tool.call.duration_ms";
pub(crate) const API_CALL_COUNT_METRIC: &str = "codex.api_request";
pub(crate) const API_CALL_DURATION_METRIC: &str = "codex.api_request.duration_ms";
pub(crate) const SSE_EVENT_COUNT_METRIC: &str = "codex.sse_event";
pub(crate) const SSE_EVENT_DURATION_METRIC: &str = "codex.sse_event.duration_ms";
pub(crate) const WEBSOCKET_REQUEST_COUNT_METRIC: &str = "codex.websocket.request";
pub(crate) const WEBSOCKET_REQUEST_DURATION_METRIC: &str = "codex.websocket.request.duration_ms";
pub(crate) const WEBSOCKET_EVENT_COUNT_METRIC: &str = "codex.websocket.event";
pub(crate) const WEBSOCKET_EVENT_DURATION_METRIC: &str = "codex.websocket.event.duration_ms";
pub(crate) const RESPONSES_API_OVERHEAD_DURATION_METRIC: &str =
"codex.responses_api_overhead.duration_ms";
pub(crate) const RESPONSES_API_INFERENCE_TIME_DURATION_METRIC: &str =
pub const TOOL_CALL_COUNT_METRIC: &str = "codex.tool.call";
pub const TOOL_CALL_DURATION_METRIC: &str = "codex.tool.call.duration_ms";
pub const API_CALL_COUNT_METRIC: &str = "codex.api_request";
pub const API_CALL_DURATION_METRIC: &str = "codex.api_request.duration_ms";
pub const SSE_EVENT_COUNT_METRIC: &str = "codex.sse_event";
pub const SSE_EVENT_DURATION_METRIC: &str = "codex.sse_event.duration_ms";
pub const WEBSOCKET_REQUEST_COUNT_METRIC: &str = "codex.websocket.request";
pub const WEBSOCKET_REQUEST_DURATION_METRIC: &str = "codex.websocket.request.duration_ms";
pub const WEBSOCKET_EVENT_COUNT_METRIC: &str = "codex.websocket.event";
pub const WEBSOCKET_EVENT_DURATION_METRIC: &str = "codex.websocket.event.duration_ms";
pub const RESPONSES_API_OVERHEAD_DURATION_METRIC: &str = "codex.responses_api_overhead.duration_ms";
pub const RESPONSES_API_INFERENCE_TIME_DURATION_METRIC: &str =
"codex.responses_api_inference_time.duration_ms";
pub(crate) const RESPONSES_API_ENGINE_IAPI_TTFT_DURATION_METRIC: &str =
pub const RESPONSES_API_ENGINE_IAPI_TTFT_DURATION_METRIC: &str =
"codex.responses_api_engine_iapi_ttft.duration_ms";
pub(crate) const RESPONSES_API_ENGINE_SERVICE_TTFT_DURATION_METRIC: &str =
pub const RESPONSES_API_ENGINE_SERVICE_TTFT_DURATION_METRIC: &str =
"codex.responses_api_engine_service_ttft.duration_ms";
pub(crate) const RESPONSES_API_ENGINE_IAPI_TBT_DURATION_METRIC: &str =
pub const RESPONSES_API_ENGINE_IAPI_TBT_DURATION_METRIC: &str =
"codex.responses_api_engine_iapi_tbt.duration_ms";
pub(crate) const RESPONSES_API_ENGINE_SERVICE_TBT_DURATION_METRIC: &str =
pub const RESPONSES_API_ENGINE_SERVICE_TBT_DURATION_METRIC: &str =
"codex.responses_api_engine_service_tbt.duration_ms";
pub(crate) const TURN_TTFT_DURATION_METRIC: &str = "codex.turn.ttft.duration_ms";
pub(crate) const TURN_TTFM_DURATION_METRIC: &str = "codex.turn.ttfm.duration_ms";
pub const TURN_E2E_DURATION_METRIC: &str = "codex.turn.e2e_duration_ms";
pub const TURN_TTFT_DURATION_METRIC: &str = "codex.turn.ttft.duration_ms";
pub const TURN_TTFM_DURATION_METRIC: &str = "codex.turn.ttfm.duration_ms";
pub const TURN_TOOL_CALL_METRIC: &str = "codex.turn.tool.call";
pub const TURN_TOKEN_USAGE_METRIC: &str = "codex.turn.token_usage";
pub const THREAD_STARTED_METRIC: &str = "codex.thread.started";

View file

@ -0,0 +1,108 @@
use crate::metrics::Result;
use crate::metrics::validation::validate_tag_key;
use crate::metrics::validation::validate_tag_value;
/// Metric tag key paired with the session's application version.
pub const APP_VERSION_TAG: &str = "app.version";
/// Metric tag key paired with the session's auth mode (optional value).
pub const AUTH_MODE_TAG: &str = "auth_mode";
/// Metric tag key paired with the session's model.
pub const MODEL_TAG: &str = "model";
/// Metric tag key paired with the session's originator.
pub const ORIGINATOR_TAG: &str = "originator";
/// Metric tag key paired with the session's service name (optional value).
pub const SERVICE_NAME_TAG: &str = "service_name";
/// Metric tag key paired with the session's source.
pub const SESSION_SOURCE_TAG: &str = "session_source";
/// Borrowed session values that are converted into metric tags.
///
/// Every field corresponds to one tag key constant in this module. The two
/// `Option` fields produce no tag when they are `None`.
///
/// The type only holds string slices, so it is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SessionMetricTagValues<'a> {
    /// Value for the `auth_mode` tag; no tag is produced when `None`.
    pub auth_mode: Option<&'a str>,
    /// Value for the `session_source` tag.
    pub session_source: &'a str,
    /// Value for the `originator` tag.
    pub originator: &'a str,
    /// Value for the `service_name` tag; no tag is produced when `None`.
    pub service_name: Option<&'a str>,
    /// Value for the `model` tag.
    pub model: &'a str,
    /// Value for the `app.version` tag.
    pub app_version: &'a str,
}
impl<'a> SessionMetricTagValues<'a> {
    /// Converts the session values into a list of `(key, value)` metric tags.
    ///
    /// Tags are emitted in a fixed order: auth mode, session source,
    /// originator, service name, model, app version. Optional values that are
    /// `None` are left out of the result.
    ///
    /// # Errors
    ///
    /// Returns the first error produced by `validate_tag_key` or
    /// `validate_tag_value` for a tag whose value is present.
    pub fn into_tags(self) -> Result<Vec<(&'static str, &'a str)>> {
        // Table of candidate tags in emission order; `None` values are skipped.
        let candidates: [(&'static str, Option<&'a str>); 6] = [
            (AUTH_MODE_TAG, self.auth_mode),
            (SESSION_SOURCE_TAG, Some(self.session_source)),
            (ORIGINATOR_TAG, Some(self.originator)),
            (SERVICE_NAME_TAG, self.service_name),
            (MODEL_TAG, Some(self.model)),
            (APP_VERSION_TAG, Some(self.app_version)),
        ];

        let mut tags = Vec::with_capacity(candidates.len());
        for (key, value) in candidates {
            Self::push_optional_tag(&mut tags, key, value)?;
        }
        Ok(tags)
    }

    /// Validates `key` and `value` and appends them to `tags`.
    ///
    /// Does nothing (and validates nothing) when `value` is `None`.
    fn push_optional_tag(
        tags: &mut Vec<(&'static str, &'a str)>,
        key: &'static str,
        value: Option<&'a str>,
    ) -> Result<()> {
        if let Some(present) = value {
            // Key is checked before the value, so a bad key is reported first.
            validate_tag_key(key)?;
            validate_tag_value(present)?;
            tags.push((key, present));
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::APP_VERSION_TAG;
    use super::AUTH_MODE_TAG;
    use super::MODEL_TAG;
    use super::ORIGINATOR_TAG;
    use super::SERVICE_NAME_TAG;
    use super::SESSION_SOURCE_TAG;
    use super::SessionMetricTagValues;
    use pretty_assertions::assert_eq;

    /// With every field populated, all six tags come back in the fixed order.
    #[test]
    fn session_metric_tags_include_expected_tags_in_order() {
        let values = SessionMetricTagValues {
            auth_mode: Some("api_key"),
            session_source: "cli",
            originator: "codex_cli",
            service_name: Some("desktop_app"),
            model: "gpt-5.1",
            app_version: "1.2.3",
        };
        let expected = vec![
            (AUTH_MODE_TAG, "api_key"),
            (SESSION_SOURCE_TAG, "cli"),
            (ORIGINATOR_TAG, "codex_cli"),
            (SERVICE_NAME_TAG, "desktop_app"),
            (MODEL_TAG, "gpt-5.1"),
            (APP_VERSION_TAG, "1.2.3"),
        ];

        let actual = values.into_tags().expect("tags");

        assert_eq!(actual, expected);
    }

    /// `None` for the optional fields drops those tags and keeps the rest in order.
    #[test]
    fn session_metric_tags_skip_missing_optional_tags() {
        let values = SessionMetricTagValues {
            auth_mode: None,
            session_source: "exec",
            originator: "codex_exec",
            service_name: None,
            model: "gpt-5.1",
            app_version: "1.2.3",
        };
        let expected = vec![
            (SESSION_SOURCE_TAG, "exec"),
            (ORIGINATOR_TAG, "codex_exec"),
            (MODEL_TAG, "gpt-5.1"),
            (APP_VERSION_TAG, "1.2.3"),
        ];

        let actual = values.into_tags().expect("tags");

        assert_eq!(actual, expected);
    }
}