feat(otel): Centralize OTEL metric names and shared tag builders (#14117)
This cleans up a bunch of metric plumbing that had started to drift.

The main change is making `codex-otel` the canonical home for shared metric definitions and metric tag helpers. I moved the `turn/thread` metric names that were still duplicated into the OTEL metric registry, added a shared `metrics::tags` module for common tag keys and session tag construction, and updated `SessionTelemetry` to build its metadata tags through that shared path.

On the `codex-core` side, TTFT/TTFM now use the shared metric-name constants instead of local string definitions. I also switched the obvious remaining turn/thread metric call sites over to the shared constants, and added a small helper so TTFT/TTFM can attach an optional sanitized `client.name` tag from `TurnContext`.

This should make follow-on telemetry work less ad hoc:

- one canonical place for metric names
- one canonical place for common metric tag keys/builders
- less duplication between `codex-core` and `codex-otel`
This commit is contained in:
parent
6ad448b658
commit
da991bdf3a
7 changed files with 156 additions and 67 deletions
|
|
@ -305,6 +305,7 @@ use crate::windows_sandbox::WindowsSandboxLevelExt;
|
|||
use codex_async_utils::OrCancelExt;
|
||||
use codex_otel::SessionTelemetry;
|
||||
use codex_otel::TelemetryAuthMode;
|
||||
use codex_otel::metrics::names::THREAD_STARTED_METRIC;
|
||||
use codex_protocol::config_types::CollaborationMode;
|
||||
use codex_protocol::config_types::Personality;
|
||||
use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig;
|
||||
|
|
@ -1412,7 +1413,7 @@ impl Session {
|
|||
};
|
||||
config.features.emit_metrics(&session_telemetry);
|
||||
session_telemetry.counter(
|
||||
"codex.thread.started",
|
||||
THREAD_STARTED_METRIC,
|
||||
1,
|
||||
&[(
|
||||
"is_git",
|
||||
|
|
|
|||
|
|
@ -33,6 +33,9 @@ use crate::protocol::TurnCompleteEvent;
|
|||
use crate::state::ActiveTurn;
|
||||
use crate::state::RunningTask;
|
||||
use crate::state::TaskKind;
|
||||
use codex_otel::metrics::names::TURN_E2E_DURATION_METRIC;
|
||||
use codex_otel::metrics::names::TURN_TOKEN_USAGE_METRIC;
|
||||
use codex_otel::metrics::names::TURN_TOOL_CALL_METRIC;
|
||||
use codex_protocol::items::TurnItem;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::ResponseInputItem;
|
||||
|
|
@ -145,7 +148,7 @@ impl Session {
|
|||
|
||||
let timer = turn_context
|
||||
.session_telemetry
|
||||
.start_timer("codex.turn.e2e_duration_ms", &[])
|
||||
.start_timer(TURN_E2E_DURATION_METRIC, &[])
|
||||
.ok();
|
||||
|
||||
let done_clone = Arc::clone(&done);
|
||||
|
|
@ -278,7 +281,7 @@ impl Session {
|
|||
},
|
||||
);
|
||||
self.services.session_telemetry.histogram(
|
||||
"codex.turn.tool.call",
|
||||
TURN_TOOL_CALL_METRIC,
|
||||
i64::try_from(turn_tool_calls).unwrap_or(i64::MAX),
|
||||
&[tmp_mem],
|
||||
);
|
||||
|
|
@ -301,27 +304,27 @@ impl Session {
|
|||
.max(0),
|
||||
};
|
||||
self.services.session_telemetry.histogram(
|
||||
"codex.turn.token_usage",
|
||||
TURN_TOKEN_USAGE_METRIC,
|
||||
turn_token_usage.total_tokens,
|
||||
&[("token_type", "total"), tmp_mem],
|
||||
);
|
||||
self.services.session_telemetry.histogram(
|
||||
"codex.turn.token_usage",
|
||||
TURN_TOKEN_USAGE_METRIC,
|
||||
turn_token_usage.input_tokens,
|
||||
&[("token_type", "input"), tmp_mem],
|
||||
);
|
||||
self.services.session_telemetry.histogram(
|
||||
"codex.turn.token_usage",
|
||||
TURN_TOKEN_USAGE_METRIC,
|
||||
turn_token_usage.cached_input(),
|
||||
&[("token_type", "cached_input"), tmp_mem],
|
||||
);
|
||||
self.services.session_telemetry.histogram(
|
||||
"codex.turn.token_usage",
|
||||
TURN_TOKEN_USAGE_METRIC,
|
||||
turn_token_usage.output_tokens,
|
||||
&[("token_type", "output"), tmp_mem],
|
||||
);
|
||||
self.services.session_telemetry.histogram(
|
||||
"codex.turn.token_usage",
|
||||
TURN_TOKEN_USAGE_METRIC,
|
||||
turn_token_usage.reasoning_output_tokens,
|
||||
&[("token_type", "reasoning_output"), tmp_mem],
|
||||
);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use codex_otel::metrics::names::TURN_TTFM_DURATION_METRIC;
|
||||
use codex_otel::metrics::names::TURN_TTFT_DURATION_METRIC;
|
||||
use codex_protocol::items::TurnItem;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use tokio::sync::Mutex;
|
||||
|
|
@ -9,9 +11,6 @@ use crate::ResponseEvent;
|
|||
use crate::codex::TurnContext;
|
||||
use crate::stream_events_utils::raw_assistant_output_text_from_item;
|
||||
|
||||
const TURN_TTFT_DURATION_METRIC: &str = "codex.turn.ttft.duration_ms";
|
||||
const TURN_TTFM_DURATION_METRIC: &str = "codex.turn.ttfm.duration_ms";
|
||||
|
||||
pub(crate) async fn record_turn_ttft_metric(turn_context: &TurnContext, event: &ResponseEvent) {
|
||||
let Some(duration) = turn_context
|
||||
.turn_timing_state
|
||||
|
|
|
|||
|
|
@ -24,9 +24,8 @@ use crate::metrics::names::WEBSOCKET_EVENT_DURATION_METRIC;
|
|||
use crate::metrics::names::WEBSOCKET_REQUEST_COUNT_METRIC;
|
||||
use crate::metrics::names::WEBSOCKET_REQUEST_DURATION_METRIC;
|
||||
use crate::metrics::runtime_metrics::RuntimeMetricsSummary;
|
||||
use crate::metrics::tags::SessionMetricTagValues;
|
||||
use crate::metrics::timer::Timer;
|
||||
use crate::metrics::validation::validate_tag_key;
|
||||
use crate::metrics::validation::validate_tag_value;
|
||||
use crate::provider::OtelProvider;
|
||||
use crate::sanitize_metric_tag_value;
|
||||
use codex_api::ApiError;
|
||||
|
|
@ -228,40 +227,15 @@ impl SessionTelemetry {
|
|||
if !self.metrics_use_metadata_tags {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let mut tags = Vec::with_capacity(7);
|
||||
Self::push_metadata_tag(&mut tags, "auth_mode", self.metadata.auth_mode.as_deref())?;
|
||||
Self::push_metadata_tag(
|
||||
&mut tags,
|
||||
"session_source",
|
||||
Some(self.metadata.session_source.as_str()),
|
||||
)?;
|
||||
Self::push_metadata_tag(
|
||||
&mut tags,
|
||||
"originator",
|
||||
Some(self.metadata.originator.as_str()),
|
||||
)?;
|
||||
Self::push_metadata_tag(
|
||||
&mut tags,
|
||||
"service_name",
|
||||
self.metadata.service_name.as_deref(),
|
||||
)?;
|
||||
Self::push_metadata_tag(&mut tags, "model", Some(self.metadata.model.as_str()))?;
|
||||
Self::push_metadata_tag(&mut tags, "app.version", Some(self.metadata.app_version))?;
|
||||
Ok(tags)
|
||||
}
|
||||
|
||||
fn push_metadata_tag<'a>(
|
||||
tags: &mut Vec<(&'a str, &'a str)>,
|
||||
key: &'static str,
|
||||
value: Option<&'a str>,
|
||||
) -> MetricsResult<()> {
|
||||
let Some(value) = value else {
|
||||
return Ok(());
|
||||
};
|
||||
validate_tag_key(key)?;
|
||||
validate_tag_value(value)?;
|
||||
tags.push((key, value));
|
||||
Ok(())
|
||||
SessionMetricTagValues {
|
||||
auth_mode: self.metadata.auth_mode.as_deref(),
|
||||
session_source: self.metadata.session_source.as_str(),
|
||||
originator: self.metadata.originator.as_str(),
|
||||
service_name: self.metadata.service_name.as_deref(),
|
||||
model: self.metadata.model.as_str(),
|
||||
app_version: self.metadata.app_version,
|
||||
}
|
||||
.into_tags()
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
mod client;
|
||||
mod config;
|
||||
mod error;
|
||||
pub(crate) mod names;
|
||||
pub mod names;
|
||||
pub(crate) mod runtime_metrics;
|
||||
pub mod tags;
|
||||
pub(crate) mod timer;
|
||||
pub(crate) mod validation;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,24 +1,27 @@
|
|||
pub(crate) const TOOL_CALL_COUNT_METRIC: &str = "codex.tool.call";
|
||||
pub(crate) const TOOL_CALL_DURATION_METRIC: &str = "codex.tool.call.duration_ms";
|
||||
pub(crate) const API_CALL_COUNT_METRIC: &str = "codex.api_request";
|
||||
pub(crate) const API_CALL_DURATION_METRIC: &str = "codex.api_request.duration_ms";
|
||||
pub(crate) const SSE_EVENT_COUNT_METRIC: &str = "codex.sse_event";
|
||||
pub(crate) const SSE_EVENT_DURATION_METRIC: &str = "codex.sse_event.duration_ms";
|
||||
pub(crate) const WEBSOCKET_REQUEST_COUNT_METRIC: &str = "codex.websocket.request";
|
||||
pub(crate) const WEBSOCKET_REQUEST_DURATION_METRIC: &str = "codex.websocket.request.duration_ms";
|
||||
pub(crate) const WEBSOCKET_EVENT_COUNT_METRIC: &str = "codex.websocket.event";
|
||||
pub(crate) const WEBSOCKET_EVENT_DURATION_METRIC: &str = "codex.websocket.event.duration_ms";
|
||||
pub(crate) const RESPONSES_API_OVERHEAD_DURATION_METRIC: &str =
|
||||
"codex.responses_api_overhead.duration_ms";
|
||||
pub(crate) const RESPONSES_API_INFERENCE_TIME_DURATION_METRIC: &str =
|
||||
pub const TOOL_CALL_COUNT_METRIC: &str = "codex.tool.call";
|
||||
pub const TOOL_CALL_DURATION_METRIC: &str = "codex.tool.call.duration_ms";
|
||||
pub const API_CALL_COUNT_METRIC: &str = "codex.api_request";
|
||||
pub const API_CALL_DURATION_METRIC: &str = "codex.api_request.duration_ms";
|
||||
pub const SSE_EVENT_COUNT_METRIC: &str = "codex.sse_event";
|
||||
pub const SSE_EVENT_DURATION_METRIC: &str = "codex.sse_event.duration_ms";
|
||||
pub const WEBSOCKET_REQUEST_COUNT_METRIC: &str = "codex.websocket.request";
|
||||
pub const WEBSOCKET_REQUEST_DURATION_METRIC: &str = "codex.websocket.request.duration_ms";
|
||||
pub const WEBSOCKET_EVENT_COUNT_METRIC: &str = "codex.websocket.event";
|
||||
pub const WEBSOCKET_EVENT_DURATION_METRIC: &str = "codex.websocket.event.duration_ms";
|
||||
pub const RESPONSES_API_OVERHEAD_DURATION_METRIC: &str = "codex.responses_api_overhead.duration_ms";
|
||||
pub const RESPONSES_API_INFERENCE_TIME_DURATION_METRIC: &str =
|
||||
"codex.responses_api_inference_time.duration_ms";
|
||||
pub(crate) const RESPONSES_API_ENGINE_IAPI_TTFT_DURATION_METRIC: &str =
|
||||
pub const RESPONSES_API_ENGINE_IAPI_TTFT_DURATION_METRIC: &str =
|
||||
"codex.responses_api_engine_iapi_ttft.duration_ms";
|
||||
pub(crate) const RESPONSES_API_ENGINE_SERVICE_TTFT_DURATION_METRIC: &str =
|
||||
pub const RESPONSES_API_ENGINE_SERVICE_TTFT_DURATION_METRIC: &str =
|
||||
"codex.responses_api_engine_service_ttft.duration_ms";
|
||||
pub(crate) const RESPONSES_API_ENGINE_IAPI_TBT_DURATION_METRIC: &str =
|
||||
pub const RESPONSES_API_ENGINE_IAPI_TBT_DURATION_METRIC: &str =
|
||||
"codex.responses_api_engine_iapi_tbt.duration_ms";
|
||||
pub(crate) const RESPONSES_API_ENGINE_SERVICE_TBT_DURATION_METRIC: &str =
|
||||
pub const RESPONSES_API_ENGINE_SERVICE_TBT_DURATION_METRIC: &str =
|
||||
"codex.responses_api_engine_service_tbt.duration_ms";
|
||||
pub(crate) const TURN_TTFT_DURATION_METRIC: &str = "codex.turn.ttft.duration_ms";
|
||||
pub(crate) const TURN_TTFM_DURATION_METRIC: &str = "codex.turn.ttfm.duration_ms";
|
||||
pub const TURN_E2E_DURATION_METRIC: &str = "codex.turn.e2e_duration_ms";
|
||||
pub const TURN_TTFT_DURATION_METRIC: &str = "codex.turn.ttft.duration_ms";
|
||||
pub const TURN_TTFM_DURATION_METRIC: &str = "codex.turn.ttfm.duration_ms";
|
||||
pub const TURN_TOOL_CALL_METRIC: &str = "codex.turn.tool.call";
|
||||
pub const TURN_TOKEN_USAGE_METRIC: &str = "codex.turn.token_usage";
|
||||
pub const THREAD_STARTED_METRIC: &str = "codex.thread.started";
|
||||
|
|
|
|||
108
codex-rs/otel/src/metrics/tags.rs
Normal file
108
codex-rs/otel/src/metrics/tags.rs
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
use crate::metrics::Result;
|
||||
use crate::metrics::validation::validate_tag_key;
|
||||
use crate::metrics::validation::validate_tag_value;
|
||||
|
||||
/// Metric tag key paired with the session's application version.
pub const APP_VERSION_TAG: &str = "app.version";
|
||||
/// Metric tag key paired with the session's auth mode (optional on a session).
pub const AUTH_MODE_TAG: &str = "auth_mode";
|
||||
/// Metric tag key paired with the session's model.
pub const MODEL_TAG: &str = "model";
|
||||
/// Metric tag key paired with the session's originator.
pub const ORIGINATOR_TAG: &str = "originator";
|
||||
/// Metric tag key paired with the session's service name (optional on a session).
pub const SERVICE_NAME_TAG: &str = "service_name";
|
||||
/// Metric tag key paired with the session's source.
pub const SESSION_SOURCE_TAG: &str = "session_source";
|
||||
|
||||
/// Borrowed session metadata from which the shared session metric tags are built.
///
/// Every field is a string slice borrowed for `'a`, so the value is cheap to
/// copy and compare. Use [`SessionMetricTagValues::into_tags`] to turn it into
/// `(key, value)` tag pairs; optional fields left as `None` produce no tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SessionMetricTagValues<'a> {
    /// Value for the `auth_mode` tag; no tag is produced when `None`.
    pub auth_mode: Option<&'a str>,
    /// Value for the `session_source` tag.
    pub session_source: &'a str,
    /// Value for the `originator` tag.
    pub originator: &'a str,
    /// Value for the `service_name` tag; no tag is produced when `None`.
    pub service_name: Option<&'a str>,
    /// Value for the `model` tag.
    pub model: &'a str,
    /// Value for the `app.version` tag.
    pub app_version: &'a str,
}
|
||||
|
||||
impl<'a> SessionMetricTagValues<'a> {
|
||||
pub fn into_tags(self) -> Result<Vec<(&'static str, &'a str)>> {
|
||||
let mut tags = Vec::with_capacity(6);
|
||||
Self::push_optional_tag(&mut tags, AUTH_MODE_TAG, self.auth_mode)?;
|
||||
Self::push_optional_tag(&mut tags, SESSION_SOURCE_TAG, Some(self.session_source))?;
|
||||
Self::push_optional_tag(&mut tags, ORIGINATOR_TAG, Some(self.originator))?;
|
||||
Self::push_optional_tag(&mut tags, SERVICE_NAME_TAG, self.service_name)?;
|
||||
Self::push_optional_tag(&mut tags, MODEL_TAG, Some(self.model))?;
|
||||
Self::push_optional_tag(&mut tags, APP_VERSION_TAG, Some(self.app_version))?;
|
||||
Ok(tags)
|
||||
}
|
||||
|
||||
fn push_optional_tag(
|
||||
tags: &mut Vec<(&'static str, &'a str)>,
|
||||
key: &'static str,
|
||||
value: Option<&'a str>,
|
||||
) -> Result<()> {
|
||||
let Some(value) = value else {
|
||||
return Ok(());
|
||||
};
|
||||
validate_tag_key(key)?;
|
||||
validate_tag_value(value)?;
|
||||
tags.push((key, value));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::APP_VERSION_TAG;
    use super::AUTH_MODE_TAG;
    use super::MODEL_TAG;
    use super::ORIGINATOR_TAG;
    use super::SERVICE_NAME_TAG;
    use super::SESSION_SOURCE_TAG;
    use super::SessionMetricTagValues;
    use pretty_assertions::assert_eq;

    /// With every field populated, all six tags come back in emission order.
    #[test]
    fn session_metric_tags_include_expected_tags_in_order() {
        let values = SessionMetricTagValues {
            auth_mode: Some("api_key"),
            session_source: "cli",
            originator: "codex_cli",
            service_name: Some("desktop_app"),
            model: "gpt-5.1",
            app_version: "1.2.3",
        };
        let expected = vec![
            (AUTH_MODE_TAG, "api_key"),
            (SESSION_SOURCE_TAG, "cli"),
            (ORIGINATOR_TAG, "codex_cli"),
            (SERVICE_NAME_TAG, "desktop_app"),
            (MODEL_TAG, "gpt-5.1"),
            (APP_VERSION_TAG, "1.2.3"),
        ];

        let actual = values.into_tags().expect("tags");

        assert_eq!(actual, expected);
    }

    /// Optional fields set to `None` contribute no tag; the rest keep their order.
    #[test]
    fn session_metric_tags_skip_missing_optional_tags() {
        let values = SessionMetricTagValues {
            auth_mode: None,
            session_source: "exec",
            originator: "codex_exec",
            service_name: None,
            model: "gpt-5.1",
            app_version: "1.2.3",
        };
        let expected = vec![
            (SESSION_SOURCE_TAG, "exec"),
            (ORIGINATOR_TAG, "codex_exec"),
            (MODEL_TAG, "gpt-5.1"),
            (APP_VERSION_TAG, "1.2.3"),
        ];

        let actual = values.into_tags().expect("tags");

        assert_eq!(actual, expected);
    }
}
|
||||
Loading…
Add table
Reference in a new issue