Return image URL from view_image tool (#15072)

Cleanup image semantics in code mode.

`view_image` now returns `{image_url:string, details?: string}` 

`image()` now allows both string parameter and `{image_url:string,
details?: string}`
This commit is contained in:
pakrym-oai 2026-03-18 13:58:20 -07:00 committed by GitHub
parent 88e5382fc4
commit 5cada46ddf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 279 additions and 80 deletions

2
codex-rs/Cargo.lock generated
View file

@ -2327,7 +2327,6 @@ dependencies = [
"icu_decimal",
"icu_locale_core",
"icu_provider",
"mime_guess",
"pretty_assertions",
"schemars 0.8.22",
"serde",
@ -2758,6 +2757,7 @@ dependencies = [
"base64 0.22.1",
"codex-utils-cache",
"image",
"mime_guess",
"thiserror 2.0.18",
"tokio",
]

View file

@ -11,7 +11,7 @@
- Global helpers:
- `exit()`: Immediately ends the current script successfully (like an early return from the top level).
- `text(value: string | number | boolean | undefined | null)`: Appends a text item and returns it. Non-string values are stringified with `JSON.stringify(...)` when possible.
- `image(imageUrl: string)`: Appends an image item and returns it. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL.
- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null })`: Appends an image item and returns it. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL.
- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
- `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing.
- `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`.

View file

@ -223,14 +223,48 @@ function codeModeWorkerMain() {
return String(value);
}
function normalizeOutputImageUrl(value) {
if (typeof value !== 'string' || !value) {
throw new TypeError('image expects a non-empty image URL string');
function normalizeOutputImage(value) {
let imageUrl;
let detail;
if (typeof value === 'string') {
imageUrl = value;
} else if (
value &&
typeof value === 'object' &&
!Array.isArray(value)
) {
if (typeof value.image_url === 'string') {
imageUrl = value.image_url;
}
if (typeof value.detail === 'string') {
detail = value.detail;
} else if (
Object.prototype.hasOwnProperty.call(value, 'detail') &&
value.detail !== null &&
typeof value.detail !== 'undefined'
) {
throw new TypeError('image detail must be a string when provided');
}
}
if (/^(?:https?:\/\/|data:)/i.test(value)) {
return value;
if (typeof imageUrl !== 'string' || !imageUrl) {
throw new TypeError(
'image expects a non-empty image URL string or an object with image_url and optional detail'
);
}
throw new TypeError('image expects an http(s) or data URL');
if (!/^(?:https?:\/\/|data:)/i.test(imageUrl)) {
throw new TypeError('image expects an http(s) or data URL');
}
if (typeof detail !== 'undefined' && !/^(?:auto|low|high|original)$/i.test(detail)) {
throw new TypeError('image detail must be one of: auto, low, high, original');
}
const normalized = { image_url: imageUrl };
if (typeof detail === 'string') {
normalized.detail = detail.toLowerCase();
}
return normalized;
}
function createCodeModeHelpers(context, state, toolCallId) {
@ -258,10 +292,7 @@ function codeModeWorkerMain() {
return item;
};
const image = (value) => {
const item = {
type: 'input_image',
image_url: normalizeOutputImageUrl(value),
};
const item = Object.assign({ type: 'input_image' }, normalizeOutputImage(value));
ensureContentItems(context).push(item);
return item;
};

View file

@ -1,20 +1,22 @@
use async_trait::async_trait;
use codex_environment::ExecutorFileSystem;
use codex_protocol::models::ContentItem;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ImageDetail;
use codex_protocol::models::local_image_content_items_with_label_number;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::openai_models::InputModality;
use codex_utils_absolute_path::AbsolutePathBuf;
use codex_utils_image::PromptImageMode;
use codex_utils_image::load_for_prompt_bytes;
use serde::Deserialize;
use crate::function_tool::FunctionCallError;
use crate::original_image_detail::can_request_original_image_detail;
use crate::protocol::EventMsg;
use crate::protocol::ViewImageToolCallEvent;
use crate::tools::context::FunctionToolOutput;
use crate::tools::context::ToolInvocation;
use crate::tools::context::ToolOutput;
use crate::tools::context::ToolPayload;
use crate::tools::handlers::parse_arguments;
use crate::tools::registry::ToolHandler;
@ -38,7 +40,7 @@ enum ViewImageDetail {
#[async_trait]
impl ToolHandler for ViewImageHandler {
type Output = FunctionToolOutput;
type Output = ViewImageOutput;
fn kind(&self) -> ToolKind {
ToolKind::Function
@ -135,22 +137,14 @@ impl ToolHandler for ViewImageHandler {
};
let image_detail = use_original_detail.then_some(ImageDetail::Original);
let content = local_image_content_items_with_label_number(
abs_path.as_path(),
file_bytes,
/*label_number*/ None,
image_mode,
)
.into_iter()
.map(|item| match item {
ContentItem::InputText { text } => FunctionCallOutputContentItem::InputText { text },
ContentItem::InputImage { image_url } => FunctionCallOutputContentItem::InputImage {
image_url,
detail: image_detail,
},
ContentItem::OutputText { text } => FunctionCallOutputContentItem::InputText { text },
})
.collect();
let image =
load_for_prompt_bytes(abs_path.as_path(), file_bytes, image_mode).map_err(|error| {
FunctionCallError::RespondToModel(format!(
"unable to process image at `{}`: {error}",
abs_path.display()
))
})?;
let image_url = image.into_data_url();
session
.send_event(
@ -162,6 +156,75 @@ impl ToolHandler for ViewImageHandler {
)
.await;
Ok(FunctionToolOutput::from_content(content, Some(true)))
Ok(ViewImageOutput {
image_url,
image_detail,
})
}
}
pub struct ViewImageOutput {
image_url: String,
image_detail: Option<ImageDetail>,
}
impl ToolOutput for ViewImageOutput {
fn log_preview(&self) -> String {
self.image_url.clone()
}
fn success_for_logging(&self) -> bool {
true
}
fn to_response_item(&self, call_id: &str, _payload: &ToolPayload) -> ResponseInputItem {
let body =
FunctionCallOutputBody::ContentItems(vec![FunctionCallOutputContentItem::InputImage {
image_url: self.image_url.clone(),
detail: self.image_detail,
}]);
let output = FunctionCallOutputPayload {
body,
success: Some(true),
};
ResponseInputItem::FunctionCallOutput {
call_id: call_id.to_string(),
output,
}
}
fn code_mode_result(&self, _payload: &ToolPayload) -> serde_json::Value {
serde_json::json!({
"image_url": self.image_url,
"detail": self.image_detail
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
use serde_json::json;
#[test]
fn code_mode_result_returns_image_url_object() {
let output = ViewImageOutput {
image_url: "data:image/png;base64,AAA".to_string(),
image_detail: None,
};
let result = output.code_mode_result(&ToolPayload::Function {
arguments: "{}".to_string(),
});
assert_eq!(
result,
json!({
"image_url": "data:image/png;base64,AAA",
"detail": null,
})
);
}
}

View file

@ -980,7 +980,21 @@ fn create_view_image_tool(can_request_original_image_detail: bool) -> ToolSpec {
required: Some(vec!["path".to_string()]),
additional_properties: Some(false.into()),
},
output_schema: None,
output_schema: Some(serde_json::json!({
"type": "object",
"properties": {
"image_url": {
"type": "string",
"description": "Data URL for the loaded image."
},
"detail": {
"type": ["string", "null"],
"description": "Image detail hint returned by view_image. Returns `original` when original resolution is preserved, otherwise `null`."
}
},
"required": ["image_url", "detail"],
"additionalProperties": false
})),
})
}

View file

@ -2627,7 +2627,7 @@ fn code_mode_augments_builtin_tool_descriptions_with_typed_sample() {
assert_eq!(
description,
"View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within <image ...> tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise<unknown>; };\n```"
"View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within <image ...> tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise<{ detail: string | null; image_url: string; }>; };\n```"
);
}

View file

@ -1,6 +1,8 @@
#![allow(clippy::expect_used, clippy::unwrap_used)]
use anyhow::Result;
use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_core::config::types::McpServerConfig;
use codex_core::config::types::McpServerTransportConfig;
use codex_core::features::Feature;
@ -1761,6 +1763,90 @@ image("data:image/png;base64,AAA");
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex()
.with_model("gpt-5.3-codex")
.with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
let _ = config.features.enable(Feature::ImageDetailOriginal);
});
let test = builder.build(&server).await?;
let image_bytes = BASE64_STANDARD.decode(
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==",
)?;
let image_path = test.cwd_path().join("code_mode_view_image.png");
fs::write(&image_path, image_bytes)?;
let image_path_json = serde_json::to_string(&image_path.to_string_lossy().to_string())?;
let code = format!(
r#"
const out = await tools.view_image({{ path: {image_path_json}, detail: "original" }});
image(out);
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &code),
ev_completed("resp-1"),
]),
)
.await;
let second_mock = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "done"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("use exec to call view_image and emit its image output")
.await?;
let req = second_mock.single_request();
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"code_mode view_image call failed unexpectedly"
);
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert_eq!(
items[1].get("type").and_then(Value::as_str),
Some("input_image")
);
let emitted_image_url = items[1]
.get("image_url")
.and_then(Value::as_str)
.expect("image helper should emit an input_image item with image_url");
assert!(emitted_image_url.starts_with("data:image/png;base64,"));
assert_eq!(
items[1].get("detail").and_then(Value::as_str),
Some("original")
);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_apply_patch_via_nested_tool() -> Result<()> {
skip_if_no_network!(Ok(()));
@ -2084,7 +2170,7 @@ text(JSON.stringify(tool));
parsed,
serde_json::json!({
"name": "view_image",
"description": "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within <image ...> tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise<unknown>; };\n```",
"description": "View a local image from the filesystem (only use if given a full filepath by the user, and the image isn't already attached to the thread context within <image ...> tags).\n\nexec tool declaration:\n```ts\ndeclare const tools: { view_image(args: { path: string; }): Promise<{ detail: string | null; image_url: string; }>; };\n```",
})
);

View file

@ -1087,7 +1087,7 @@ async fn view_image_tool_errors_when_path_is_directory() -> anyhow::Result<()> {
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_placeholder_for_non_image_files() -> anyhow::Result<()> {
async fn view_image_tool_errors_for_non_image_files() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
@ -1150,20 +1150,19 @@ async fn view_image_tool_placeholder_for_non_image_files() -> anyhow::Result<()>
request.inputs_of_type("input_image").is_empty(),
"non-image file should not produce an input_image message"
);
let (placeholder, success) = request
let (error_text, success) = request
.function_call_output_content_and_success(call_id)
.expect("function_call_output should be present");
assert_eq!(success, None);
let placeholder = placeholder.expect("placeholder text present");
let error_text = error_text.expect("error text present");
assert!(
placeholder.contains("Codex could not read the local image at")
&& placeholder.contains("unsupported MIME type `application/json`"),
"placeholder should describe the unsupported file type: {placeholder}"
let expected_error = format!(
"unable to process image at `{}`: unsupported image `application/json`",
abs_path.display()
);
assert!(
placeholder.contains(&abs_path.display().to_string()),
"placeholder should mention path: {placeholder}"
error_text.contains(&expected_error),
"error should describe unsupported file type: {error_text}"
);
Ok(())

View file

@ -19,7 +19,6 @@ codex-utils-image = { workspace = true }
icu_decimal = { workspace = true }
icu_locale_core = { workspace = true }
icu_provider = { workspace = true, features = ["sync"] }
mime_guess = { workspace = true }
schemars = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }

View file

@ -941,7 +941,7 @@ fn invalid_image_error_placeholder(
fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> ContentItem {
ContentItem::InputText {
text: format!(
"Codex cannot attach image at `{}`: unsupported image format `{}`.",
"Codex cannot attach image at `{}`: unsupported image `{}`.",
path.display(),
mime
),
@ -972,28 +972,20 @@ pub fn local_image_content_items_with_label_number(
}
items
}
Err(err) => {
if matches!(&err, ImageProcessingError::Read { .. }) {
Err(err) => match &err {
ImageProcessingError::Read { .. } | ImageProcessingError::Encode { .. } => {
vec![local_image_error_placeholder(path, &err)]
} else if err.is_invalid_image() {
vec![invalid_image_error_placeholder(path, &err)]
} else {
let Some(mime_guess) = mime_guess::from_path(path).first() else {
return vec![local_image_error_placeholder(
path,
"unsupported MIME type (unknown)",
)];
};
let mime = mime_guess.essence_str().to_owned();
if !mime.starts_with("image/") {
return vec![local_image_error_placeholder(
path,
format!("unsupported MIME type `{mime}`"),
)];
}
vec![unsupported_image_error_placeholder(path, &mime)]
}
}
ImageProcessingError::Decode { .. } if err.is_invalid_image() => {
vec![invalid_image_error_placeholder(path, &err)]
}
ImageProcessingError::Decode { .. } => {
vec![local_image_error_placeholder(path, &err)]
}
ImageProcessingError::UnsupportedImageFormat { mime } => {
vec![unsupported_image_error_placeholder(path, mime)]
}
},
}
}
@ -2908,8 +2900,8 @@ mod tests {
match &content[0] {
ContentItem::InputText { text } => {
assert!(
text.contains("unsupported MIME type `application/json`"),
"placeholder should mention unsupported MIME: {text}"
text.contains("unsupported image `application/json`"),
"placeholder should mention unsupported image MIME: {text}"
);
assert!(
text.contains(&json_path.display().to_string()),
@ -2943,7 +2935,7 @@ mod tests {
ResponseInputItem::Message { content, .. } => {
assert_eq!(content.len(), 1);
let expected = format!(
"Codex cannot attach image at `{}`: unsupported image format `image/svg+xml`.",
"Codex cannot attach image at `{}`: unsupported image `image/svg+xml`.",
svg_path.display()
);
match &content[0] {

View file

@ -11,6 +11,7 @@ workspace = true
base64 = { workspace = true }
image = { workspace = true, features = ["jpeg", "png", "gif", "webp"] }
codex-utils-cache = { workspace = true }
mime_guess = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true, features = ["fs", "rt", "rt-multi-thread", "macros"] }

View file

@ -23,9 +23,26 @@ pub enum ImageProcessingError {
#[source]
source: image::ImageError,
},
#[error("unsupported image `{mime}`")]
UnsupportedImageFormat { mime: String },
}
impl ImageProcessingError {
pub fn decode_error(path: &std::path::Path, source: image::ImageError) -> Self {
if matches!(source, ImageError::Decoding(_)) {
return ImageProcessingError::Decode {
path: path.to_path_buf(),
source,
};
}
let mime = mime_guess::from_path(path)
.first()
.map(|mime_guess| mime_guess.essence_str().to_owned())
.unwrap_or_else(|| "unknown".to_string());
ImageProcessingError::UnsupportedImageFormat { mime }
}
pub fn is_invalid_image(&self) -> bool {
matches!(
self,

View file

@ -74,12 +74,8 @@ pub fn load_for_prompt_bytes(
_ => None,
};
let dynamic = image::load_from_memory(&file_bytes).map_err(|source| {
ImageProcessingError::Decode {
path: path_buf.clone(),
source,
}
})?;
let dynamic = image::load_from_memory(&file_bytes)
.map_err(|source| ImageProcessingError::decode_error(&path_buf, source))?;
let (width, height) = dynamic.dimensions();
@ -294,10 +290,11 @@ mod tests {
PromptImageMode::ResizeToFit,
)
.expect_err("invalid image should fail");
match err {
ImageProcessingError::Decode { .. } => {}
_ => panic!("unexpected error variant"),
}
assert!(matches!(
err,
ImageProcessingError::Decode { .. }
| ImageProcessingError::UnsupportedImageFormat { .. }
));
}
#[tokio::test(flavor = "multi_thread")]