add sandbox policy and sandbox name to codex.tool.call metrics (#10711)
This gives visibility into the success rate of the Windows sandbox implementations relative to other platforms.
This commit is contained in:
parent
4df9f2020b
commit
901d5b8fd6
4 changed files with 75 additions and 17 deletions
|
|
@ -128,6 +128,17 @@ pub enum SandboxType {
|
|||
WindowsRestrictedToken,
|
||||
}
|
||||
|
||||
impl SandboxType {
|
||||
pub(crate) fn as_metric_tag(self) -> &'static str {
|
||||
match self {
|
||||
SandboxType::None => "none",
|
||||
SandboxType::MacosSeatbelt => "seatbelt",
|
||||
SandboxType::LinuxSeccomp => "seccomp",
|
||||
SandboxType::WindowsRestrictedToken => "windows_sandbox",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StdoutStream {
|
||||
pub sub_id: String,
|
||||
|
|
|
|||
|
|
@ -3,11 +3,15 @@ use std::sync::Arc;
|
|||
use std::time::Duration;
|
||||
|
||||
use crate::client_common::tools::ToolSpec;
|
||||
use crate::exec::SandboxType;
|
||||
use crate::function_tool::FunctionCallError;
|
||||
use crate::protocol::SandboxPolicy;
|
||||
use crate::safety::get_platform_sandbox;
|
||||
use crate::tools::context::ToolInvocation;
|
||||
use crate::tools::context::ToolOutput;
|
||||
use crate::tools::context::ToolPayload;
|
||||
use async_trait::async_trait;
|
||||
use codex_protocol::config_types::WindowsSandboxLevel;
|
||||
use codex_protocol::models::ResponseInputItem;
|
||||
use codex_utils_readiness::Readiness;
|
||||
use tracing::warn;
|
||||
|
|
@ -73,19 +77,33 @@ impl ToolRegistry {
|
|||
let otel = invocation.turn.otel_manager.clone();
|
||||
let payload_for_response = invocation.payload.clone();
|
||||
let log_payload = payload_for_response.log_payload();
|
||||
let metric_tags = [
|
||||
(
|
||||
"sandbox",
|
||||
sandbox_tag(
|
||||
&invocation.turn.sandbox_policy,
|
||||
invocation.turn.windows_sandbox_level,
|
||||
),
|
||||
),
|
||||
(
|
||||
"sandbox_policy",
|
||||
sandbox_policy_tag(&invocation.turn.sandbox_policy),
|
||||
),
|
||||
];
|
||||
|
||||
let handler = match self.handler(tool_name.as_ref()) {
|
||||
Some(handler) => handler,
|
||||
None => {
|
||||
let message =
|
||||
unsupported_tool_call_message(&invocation.payload, tool_name.as_ref());
|
||||
otel.tool_result(
|
||||
otel.tool_result_with_tags(
|
||||
tool_name.as_ref(),
|
||||
&call_id_owned,
|
||||
log_payload.as_ref(),
|
||||
Duration::ZERO,
|
||||
false,
|
||||
&message,
|
||||
&metric_tags,
|
||||
);
|
||||
return Err(FunctionCallError::RespondToModel(message));
|
||||
}
|
||||
|
|
@ -93,13 +111,14 @@ impl ToolRegistry {
|
|||
|
||||
if !handler.matches_kind(&invocation.payload) {
|
||||
let message = format!("tool {tool_name} invoked with incompatible payload");
|
||||
otel.tool_result(
|
||||
otel.tool_result_with_tags(
|
||||
tool_name.as_ref(),
|
||||
&call_id_owned,
|
||||
log_payload.as_ref(),
|
||||
Duration::ZERO,
|
||||
false,
|
||||
&message,
|
||||
&metric_tags,
|
||||
);
|
||||
return Err(FunctionCallError::Fatal(message));
|
||||
}
|
||||
|
|
@ -107,10 +126,11 @@ impl ToolRegistry {
|
|||
let output_cell = tokio::sync::Mutex::new(None);
|
||||
|
||||
let result = otel
|
||||
.log_tool_result(
|
||||
.log_tool_result_with_tags(
|
||||
tool_name.as_ref(),
|
||||
&call_id_owned,
|
||||
log_payload.as_ref(),
|
||||
&metric_tags,
|
||||
|| {
|
||||
let handler = handler.clone();
|
||||
let output_cell = &output_cell;
|
||||
|
|
@ -231,3 +251,29 @@ fn unsupported_tool_call_message(payload: &ToolPayload, tool_name: &str) -> Stri
|
|||
_ => format!("unsupported call: {tool_name}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn sandbox_tag(policy: &SandboxPolicy, windows_sandbox_level: WindowsSandboxLevel) -> &'static str {
|
||||
if matches!(policy, SandboxPolicy::DangerFullAccess) {
|
||||
return "none";
|
||||
}
|
||||
if matches!(policy, SandboxPolicy::ExternalSandbox { .. }) {
|
||||
return "external";
|
||||
}
|
||||
if cfg!(target_os = "windows") && matches!(windows_sandbox_level, WindowsSandboxLevel::Elevated)
|
||||
{
|
||||
return "windows_elevated";
|
||||
}
|
||||
|
||||
get_platform_sandbox(windows_sandbox_level != WindowsSandboxLevel::Disabled)
|
||||
.map(SandboxType::as_metric_tag)
|
||||
.unwrap_or("none")
|
||||
}
|
||||
|
||||
fn sandbox_policy_tag(policy: &SandboxPolicy) -> &'static str {
|
||||
match policy {
|
||||
SandboxPolicy::ReadOnly => "read-only",
|
||||
SandboxPolicy::WorkspaceWrite { .. } => "workspace-write",
|
||||
SandboxPolicy::DangerFullAccess => "danger-full-access",
|
||||
SandboxPolicy::ExternalSandbox { .. } => "external-sandbox",
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -566,11 +566,12 @@ impl OtelManager {
|
|||
);
|
||||
}
|
||||
|
||||
pub async fn log_tool_result<F, Fut, E>(
|
||||
pub async fn log_tool_result_with_tags<F, Fut, E>(
|
||||
&self,
|
||||
tool_name: &str,
|
||||
call_id: &str,
|
||||
arguments: &str,
|
||||
extra_tags: &[(&str, &str)],
|
||||
f: F,
|
||||
) -> Result<(String, bool), E>
|
||||
where
|
||||
|
|
@ -587,13 +588,14 @@ impl OtelManager {
|
|||
Err(error) => (Cow::Owned(error.to_string()), false),
|
||||
};
|
||||
|
||||
self.tool_result(
|
||||
self.tool_result_with_tags(
|
||||
tool_name,
|
||||
call_id,
|
||||
arguments,
|
||||
duration,
|
||||
success,
|
||||
output.as_ref(),
|
||||
extra_tags,
|
||||
);
|
||||
|
||||
result
|
||||
|
|
@ -619,7 +621,8 @@ impl OtelManager {
|
|||
);
|
||||
}
|
||||
|
||||
pub fn tool_result(
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn tool_result_with_tags(
|
||||
&self,
|
||||
tool_name: &str,
|
||||
call_id: &str,
|
||||
|
|
@ -627,18 +630,15 @@ impl OtelManager {
|
|||
duration: Duration,
|
||||
success: bool,
|
||||
output: &str,
|
||||
extra_tags: &[(&str, &str)],
|
||||
) {
|
||||
let success_str = if success { "true" } else { "false" };
|
||||
self.counter(
|
||||
TOOL_CALL_COUNT_METRIC,
|
||||
1,
|
||||
&[("tool", tool_name), ("success", success_str)],
|
||||
);
|
||||
self.record_duration(
|
||||
TOOL_CALL_DURATION_METRIC,
|
||||
duration,
|
||||
&[("tool", tool_name), ("success", success_str)],
|
||||
);
|
||||
let mut tags = Vec::with_capacity(2 + extra_tags.len());
|
||||
tags.push(("tool", tool_name));
|
||||
tags.push(("success", success_str));
|
||||
tags.extend_from_slice(extra_tags);
|
||||
self.counter(TOOL_CALL_COUNT_METRIC, 1, &tags);
|
||||
self.record_duration(TOOL_CALL_DURATION_METRIC, duration, &tags);
|
||||
tracing::event!(
|
||||
tracing::Level::INFO,
|
||||
event.name = "codex.tool_result",
|
||||
|
|
|
|||
|
|
@ -35,13 +35,14 @@ fn runtime_metrics_summary_collects_tool_api_and_streaming_metrics() -> Result<(
|
|||
|
||||
manager.reset_runtime_metrics();
|
||||
|
||||
manager.tool_result(
|
||||
manager.tool_result_with_tags(
|
||||
"shell",
|
||||
"call-1",
|
||||
"{\"cmd\":\"echo\"}",
|
||||
Duration::from_millis(250),
|
||||
true,
|
||||
"ok",
|
||||
&[],
|
||||
);
|
||||
manager.record_api_request(1, Some(200), None, Duration::from_millis(300));
|
||||
manager.record_websocket_request(Duration::from_millis(400), None);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue