From 5cada46ddf74701dbaf1a152df0514b918ead70c Mon Sep 17 00:00:00 2001 From: pakrym-oai Date: Wed, 18 Mar 2026 13:58:20 -0700 Subject: [PATCH] Return image URL from view_image tool (#15072) Cleanup image semantics in code mode. `view_image` now returns `{image_url:string, details?: string}` `image()` now allows both string parameter and `{image_url:string, details?: string}` --- codex-rs/Cargo.lock | 2 +- .../core/src/tools/code_mode/description.md | 2 +- codex-rs/core/src/tools/code_mode/runner.cjs | 51 +++++++-- .../core/src/tools/handlers/view_image.rs | 105 ++++++++++++++---- codex-rs/core/src/tools/spec.rs | 16 ++- codex-rs/core/src/tools/spec_tests.rs | 2 +- codex-rs/core/tests/suite/code_mode.rs | 88 ++++++++++++++- codex-rs/core/tests/suite/view_image.rs | 17 ++- codex-rs/protocol/Cargo.toml | 1 - codex-rs/protocol/src/models.rs | 40 +++---- codex-rs/utils/image/Cargo.toml | 1 + codex-rs/utils/image/src/error.rs | 17 +++ codex-rs/utils/image/src/lib.rs | 17 ++- 13 files changed, 279 insertions(+), 80 deletions(-) diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index d6a13a3d4..771b714e0 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -2327,7 +2327,6 @@ dependencies = [ "icu_decimal", "icu_locale_core", "icu_provider", - "mime_guess", "pretty_assertions", "schemars 0.8.22", "serde", @@ -2758,6 +2757,7 @@ dependencies = [ "base64 0.22.1", "codex-utils-cache", "image", + "mime_guess", "thiserror 2.0.18", "tokio", ] diff --git a/codex-rs/core/src/tools/code_mode/description.md b/codex-rs/core/src/tools/code_mode/description.md index 6bf33a184..e0a124c65 100644 --- a/codex-rs/core/src/tools/code_mode/description.md +++ b/codex-rs/core/src/tools/code_mode/description.md @@ -11,7 +11,7 @@ - Global helpers: - `exit()`: Immediately ends the current script successfully (like an early return from the top level). - `text(value: string | number | boolean | undefined | null)`: Appends a text item and returns it. Non-string values are stringified with `JSON.stringify(...)` when possible. -- `image(imageUrl: string)`: Appends an image item and returns it. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. +- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null })`: Appends an image item and returns it. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. - `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session. - `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing. - `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`. diff --git a/codex-rs/core/src/tools/code_mode/runner.cjs b/codex-rs/core/src/tools/code_mode/runner.cjs index 408725555..8b4b322eb 100644 --- a/codex-rs/core/src/tools/code_mode/runner.cjs +++ b/codex-rs/core/src/tools/code_mode/runner.cjs @@ -223,14 +223,48 @@ function codeModeWorkerMain() { return String(value); } - function normalizeOutputImageUrl(value) { - if (typeof value !== 'string' || !value) { - throw new TypeError('image expects a non-empty image URL string'); + function normalizeOutputImage(value) { + let imageUrl; + let detail; + if (typeof value === 'string') { + imageUrl = value; + } else if ( + value && + typeof value === 'object' && + !Array.isArray(value) + ) { + if (typeof value.image_url === 'string') { + imageUrl = value.image_url; + } + if (typeof value.detail === 'string') { + detail = value.detail; + } else if ( + Object.prototype.hasOwnProperty.call(value, 'detail') && + value.detail !== null && + typeof value.detail !== 'undefined' + ) { + throw new TypeError('image detail must be a string when provided'); + } } - if (/^(?:https?:\/\/|data:)/i.test(value)) { - return value; + + if (typeof imageUrl !== 'string' || !imageUrl) { + throw new TypeError( + 'image expects a non-empty image URL string or an object with image_url and optional detail' + ); } - throw new TypeError('image expects an http(s) or data URL'); + if (!/^(?:https?:\/\/|data:)/i.test(imageUrl)) { + throw new TypeError('image expects an http(s) or data URL'); + } + + if (typeof detail !== 'undefined' && !/^(?:auto|low|high|original)$/i.test(detail)) { + throw new TypeError('image detail must be one of: auto, low, high, original'); + } + + const normalized = { image_url: imageUrl }; + if (typeof detail === 'string') { + normalized.detail = detail.toLowerCase(); + } + return normalized; } function createCodeModeHelpers(context, state, toolCallId) { @@ -258,10 +292,7 @@ function codeModeWorkerMain() { return item; }; const image = (value) => { - const item = { - type: 'input_image', - image_url: normalizeOutputImageUrl(value), - }; + const item = Object.assign({ type: 'input_image' }, normalizeOutputImage(value)); ensureContentItems(context).push(item); return item; }; diff --git a/codex-rs/core/src/tools/handlers/view_image.rs b/codex-rs/core/src/tools/handlers/view_image.rs index 3957549d2..5069b1a4b 100644 --- a/codex-rs/core/src/tools/handlers/view_image.rs +++ b/codex-rs/core/src/tools/handlers/view_image.rs @@ -1,20 +1,22 @@ use async_trait::async_trait; use codex_environment::ExecutorFileSystem; -use codex_protocol::models::ContentItem; +use codex_protocol::models::FunctionCallOutputBody; use codex_protocol::models::FunctionCallOutputContentItem; +use codex_protocol::models::FunctionCallOutputPayload; use codex_protocol::models::ImageDetail; -use codex_protocol::models::local_image_content_items_with_label_number; +use codex_protocol::models::ResponseInputItem; use codex_protocol::openai_models::InputModality; use codex_utils_absolute_path::AbsolutePathBuf; use codex_utils_image::PromptImageMode; +use codex_utils_image::load_for_prompt_bytes; use serde::Deserialize; use crate::function_tool::FunctionCallError; use crate::original_image_detail::can_request_original_image_detail; use crate::protocol::EventMsg; use crate::protocol::ViewImageToolCallEvent; -use crate::tools::context::FunctionToolOutput; use crate::tools::context::ToolInvocation; +use crate::tools::context::ToolOutput; use crate::tools::context::ToolPayload; use crate::tools::handlers::parse_arguments; use crate::tools::registry::ToolHandler; @@ -38,7 +40,7 @@ enum ViewImageDetail { #[async_trait] impl ToolHandler for ViewImageHandler { - type Output = FunctionToolOutput; + type Output = ViewImageOutput; fn kind(&self) -> ToolKind { ToolKind::Function @@ -135,22 +137,14 @@ impl ToolHandler for ViewImageHandler { }; let image_detail = use_original_detail.then_some(ImageDetail::Original); - let content = local_image_content_items_with_label_number( - abs_path.as_path(), - file_bytes, - /*label_number*/ None, - image_mode, - ) - .into_iter() - .map(|item| match item { - ContentItem::InputText { text } => FunctionCallOutputContentItem::InputText { text }, - ContentItem::InputImage { image_url } => FunctionCallOutputContentItem::InputImage { - image_url, - detail: image_detail, - }, - ContentItem::OutputText { text } => FunctionCallOutputContentItem::InputText { text }, - }) - .collect(); + let image = + load_for_prompt_bytes(abs_path.as_path(), file_bytes, image_mode).map_err(|error| { + FunctionCallError::RespondToModel(format!( + "unable to process image at `{}`: {error}", + abs_path.display() + )) + })?; + let image_url = image.into_data_url(); session .send_event( @@ -162,6 +156,75 @@ impl ToolHandler for ViewImageHandler { ) .await; - Ok(FunctionToolOutput::from_content(content, Some(true))) + Ok(ViewImageOutput { + image_url, + image_detail, + }) + } +} + +pub struct ViewImageOutput { + image_url: String, + image_detail: Option, +} + +impl ToolOutput for ViewImageOutput { + fn log_preview(&self) -> String { + self.image_url.clone() + } + + fn success_for_logging(&self) -> bool { + true + } + + fn to_response_item(&self, call_id: &str, _payload: &ToolPayload) -> ResponseInputItem { + let body = + FunctionCallOutputBody::ContentItems(vec![FunctionCallOutputContentItem::InputImage { + image_url: self.image_url.clone(), + detail: self.image_detail, + }]); + let output = FunctionCallOutputPayload { + body, + success: Some(true), + }; + + ResponseInputItem::FunctionCallOutput { + call_id: call_id.to_string(), + output, + } + } + + fn code_mode_result(&self, _payload: &ToolPayload) -> serde_json::Value { + serde_json::json!({ + "image_url": self.image_url, + "detail": self.image_detail + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + use serde_json::json; + + #[test] + fn code_mode_result_returns_image_url_object() { + let output = ViewImageOutput { + image_url: "data:image/png;base64,AAA".to_string(), + image_detail: None, + }; + + let result = output.code_mode_result(&ToolPayload::Function { + arguments: "{}".to_string(), + }); + + assert_eq!( + result, + json!({ + "image_url": "data:image/png;base64,AAA", + "detail": null, + }) + ); } } diff --git a/codex-rs/core/src/tools/spec.rs b/codex-rs/core/src/tools/spec.rs index 45fadae7a..032ec608f 100644 --- a/codex-rs/core/src/tools/spec.rs +++ b/codex-rs/core/src/tools/spec.rs @@ -980,7 +980,21 @@ fn create_view_image_tool(can_request_original_image_detail: bool) -> ToolSpec { required: Some(vec!["path".to_string()]), additional_properties: Some(false.into()), }, - output_schema: None, + output_schema: Some(serde_json::json!({ + "type": "object", + "properties": { + "image_url": { + "type": "string", + "description": "Data URL for the loaded image." + }, + "detail": { + "type": ["string", "null"], + "description": "Image detail hint returned by view_image. Returns `original` when original resolution is preserved, otherwise `null`." + } + }, + "required": ["image_url", "detail"], + "additionalProperties": false + })), }) } diff --git a/codex-rs/core/src/tools/spec_tests.rs b/codex-rs/core/src/tools/spec_tests.rs index 37c85e30b..2d0a23431 100644 --- a/codex-rs/core/src/tools/spec_tests.rs +++ b/codex-rs/core/src/tools/spec_tests.rs @@ -2627,7 +2627,7 @@ fn code_mode_augments_builtin_tool_descriptions_with_typed_sample() { assert_eq!( description, - "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise; };\n```" + "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise<{ detail: string | null; image_url: string; }>; };\n```" ); } diff --git a/codex-rs/core/tests/suite/code_mode.rs b/codex-rs/core/tests/suite/code_mode.rs index 2a7652691..62d13e649 100644 --- a/codex-rs/core/tests/suite/code_mode.rs +++ b/codex-rs/core/tests/suite/code_mode.rs @@ -1,6 +1,8 @@ #![allow(clippy::expect_used, clippy::unwrap_used)] use anyhow::Result; +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; use codex_core::config::types::McpServerConfig; use codex_core::config::types::McpServerTransportConfig; use codex_core::features::Feature; @@ -1761,6 +1763,90 @@ image("data:image/png;base64,AAA"); Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + let mut builder = test_codex() + .with_model("gpt-5.3-codex") + .with_config(move |config| { + let _ = config.features.enable(Feature::CodeMode); + let _ = config.features.enable(Feature::ImageDetailOriginal); + }); + let test = builder.build(&server).await?; + + let image_bytes = BASE64_STANDARD.decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==", + )?; + let image_path = test.cwd_path().join("code_mode_view_image.png"); + fs::write(&image_path, image_bytes)?; + + let image_path_json = serde_json::to_string(&image_path.to_string_lossy().to_string())?; + let code = format!( + r#" +const out = await tools.view_image({{ path: {image_path_json}, detail: "original" }}); +image(out); +"# + ); + + responses::mount_sse_once( + &server, + sse(vec![ + ev_response_created("resp-1"), + ev_custom_tool_call("call-1", "exec", &code), + ev_completed("resp-1"), + ]), + ) + .await; + + let second_mock = responses::mount_sse_once( + &server, + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ) + .await; + + test.submit_turn("use exec to call view_image and emit its image output") + .await?; + + let req = second_mock.single_request(); + let items = custom_tool_output_items(&req, "call-1"); + let (_, success) = custom_tool_output_body_and_success(&req, "call-1"); + assert_ne!( + success, + Some(false), + "code_mode view_image call failed unexpectedly" + ); + assert_eq!(items.len(), 2); + assert_regex_match( + concat!( + r"(?s)\A", + r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z" + ), + text_item(&items, 0), + ); + + assert_eq!( + items[1].get("type").and_then(Value::as_str), + Some("input_image") + ); + + let emitted_image_url = items[1] + .get("image_url") + .and_then(Value::as_str) + .expect("image helper should emit an input_image item with image_url"); + assert!(emitted_image_url.starts_with("data:image/png;base64,")); + assert_eq!( + items[1].get("detail").and_then(Value::as_str), + Some("original") + ); + + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn code_mode_can_apply_patch_via_nested_tool() -> Result<()> { skip_if_no_network!(Ok(())); @@ -2084,7 +2170,7 @@ text(JSON.stringify(tool)); parsed, serde_json::json!({ "name": "view_image", - "description": "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise; };\n```", + "description": "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise<{ detail: string | null; image_url: string; }>; };\n```", }) ); diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index 240620a27..8f1d0a5fe 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -1087,7 +1087,7 @@ async fn view_image_tool_errors_when_path_is_directory() -> anyhow::Result<()> { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn view_image_tool_placeholder_for_non_image_files() -> anyhow::Result<()> { +async fn view_image_tool_errors_for_non_image_files() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); let server = start_mock_server().await; @@ -1150,20 +1150,19 @@ async fn view_image_tool_placeholder_for_non_image_files() -> anyhow::Result<()> request.inputs_of_type("input_image").is_empty(), "non-image file should not produce an input_image message" ); - let (placeholder, success) = request + let (error_text, success) = request .function_call_output_content_and_success(call_id) .expect("function_call_output should be present"); assert_eq!(success, None); - let placeholder = placeholder.expect("placeholder text present"); + let error_text = error_text.expect("error text present"); - assert!( - placeholder.contains("Codex could not read the local image at") - && placeholder.contains("unsupported MIME type `application/json`"), - "placeholder should describe the unsupported file type: {placeholder}" + let expected_error = format!( + "unable to process image at `{}`: unsupported image `application/json`", + abs_path.display() ); assert!( - placeholder.contains(&abs_path.display().to_string()), - "placeholder should mention path: {placeholder}" + error_text.contains(&expected_error), + "error should describe unsupported file type: {error_text}" ); Ok(()) diff --git a/codex-rs/protocol/Cargo.toml b/codex-rs/protocol/Cargo.toml index 02db92d09..b0f199466 100644 --- a/codex-rs/protocol/Cargo.toml +++ b/codex-rs/protocol/Cargo.toml @@ -19,7 +19,6 @@ codex-utils-image = { workspace = true } icu_decimal = { workspace = true } icu_locale_core = { workspace = true } icu_provider = { workspace = true, features = ["sync"] } -mime_guess = { workspace = true } schemars = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs index 79e3991ea..1368a93f6 100644 --- a/codex-rs/protocol/src/models.rs +++ b/codex-rs/protocol/src/models.rs @@ -941,7 +941,7 @@ fn invalid_image_error_placeholder( fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> ContentItem { ContentItem::InputText { text: format!( - "Codex cannot attach image at `{}`: unsupported image format `{}`.", + "Codex cannot attach image at `{}`: unsupported image `{}`.", path.display(), mime ), @@ -972,28 +972,20 @@ pub fn local_image_content_items_with_label_number( } items } - Err(err) => { - if matches!(&err, ImageProcessingError::Read { .. }) { + Err(err) => match &err { + ImageProcessingError::Read { .. } | ImageProcessingError::Encode { .. } => { vec![local_image_error_placeholder(path, &err)] - } else if err.is_invalid_image() { - vec![invalid_image_error_placeholder(path, &err)] - } else { - let Some(mime_guess) = mime_guess::from_path(path).first() else { - return vec![local_image_error_placeholder( - path, - "unsupported MIME type (unknown)", - )]; - }; - let mime = mime_guess.essence_str().to_owned(); - if !mime.starts_with("image/") { - return vec![local_image_error_placeholder( - path, - format!("unsupported MIME type `{mime}`"), - )]; - } - vec![unsupported_image_error_placeholder(path, &mime)] } - } + ImageProcessingError::Decode { .. } if err.is_invalid_image() => { + vec![invalid_image_error_placeholder(path, &err)] + } + ImageProcessingError::Decode { .. } => { + vec![local_image_error_placeholder(path, &err)] + } + ImageProcessingError::UnsupportedImageFormat { mime } => { + vec![unsupported_image_error_placeholder(path, mime)] + } + }, } } @@ -2908,8 +2900,8 @@ mod tests { match &content[0] { ContentItem::InputText { text } => { assert!( - text.contains("unsupported MIME type `application/json`"), - "placeholder should mention unsupported MIME: {text}" + text.contains("unsupported image `application/json`"), + "placeholder should mention unsupported image MIME: {text}" ); assert!( text.contains(&json_path.display().to_string()), @@ -2943,7 +2935,7 @@ mod tests { ResponseInputItem::Message { content, .. } => { assert_eq!(content.len(), 1); let expected = format!( - "Codex cannot attach image at `{}`: unsupported image format `image/svg+xml`.", + "Codex cannot attach image at `{}`: unsupported image `image/svg+xml`.", svg_path.display() ); match &content[0] { diff --git a/codex-rs/utils/image/Cargo.toml b/codex-rs/utils/image/Cargo.toml index e835e49e7..9fcd3166b 100644 --- a/codex-rs/utils/image/Cargo.toml +++ b/codex-rs/utils/image/Cargo.toml @@ -11,6 +11,7 @@ workspace = true base64 = { workspace = true } image = { workspace = true, features = ["jpeg", "png", "gif", "webp"] } codex-utils-cache = { workspace = true } +mime_guess = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true, features = ["fs", "rt", "rt-multi-thread", "macros"] } diff --git a/codex-rs/utils/image/src/error.rs b/codex-rs/utils/image/src/error.rs index 6bd055115..28b73f4a7 100644 --- a/codex-rs/utils/image/src/error.rs +++ b/codex-rs/utils/image/src/error.rs @@ -23,9 +23,26 @@ pub enum ImageProcessingError { #[source] source: image::ImageError, }, + #[error("unsupported image `{mime}`")] + UnsupportedImageFormat { mime: String }, } impl ImageProcessingError { + pub fn decode_error(path: &std::path::Path, source: image::ImageError) -> Self { + if matches!(source, ImageError::Decoding(_)) { + return ImageProcessingError::Decode { + path: path.to_path_buf(), + source, + }; + } + + let mime = mime_guess::from_path(path) + .first() + .map(|mime_guess| mime_guess.essence_str().to_owned()) + .unwrap_or_else(|| "unknown".to_string()); + ImageProcessingError::UnsupportedImageFormat { mime } + } + pub fn is_invalid_image(&self) -> bool { matches!( self, diff --git a/codex-rs/utils/image/src/lib.rs b/codex-rs/utils/image/src/lib.rs index 8fd072426..b150f76a1 100644 --- a/codex-rs/utils/image/src/lib.rs +++ b/codex-rs/utils/image/src/lib.rs @@ -74,12 +74,8 @@ pub fn load_for_prompt_bytes( _ => None, }; - let dynamic = image::load_from_memory(&file_bytes).map_err(|source| { - ImageProcessingError::Decode { - path: path_buf.clone(), - source, - } - })?; + let dynamic = image::load_from_memory(&file_bytes) + .map_err(|source| ImageProcessingError::decode_error(&path_buf, source))?; let (width, height) = dynamic.dimensions(); @@ -294,10 +290,11 @@ mod tests { PromptImageMode::ResizeToFit, ) .expect_err("invalid image should fail"); - match err { - ImageProcessingError::Decode { .. } => {} - _ => panic!("unexpected error variant"), - } + assert!(matches!( + err, + ImageProcessingError::Decode { .. } + | ImageProcessingError::UnsupportedImageFormat { .. } + )); } #[tokio::test(flavor = "multi_thread")]