From c4cb594e73fda4b9247d606fb00eb2c58da26647 Mon Sep 17 00:00:00 2001
From: Curtis 'Fjord' Hawthorne
Date: Tue, 3 Mar 2026 16:25:59 -0800
Subject: [PATCH] Make js_repl image output controllable (#13331)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Instead of always adding inner function call outputs to the model context, let JS code decide which ones to return.

- Stop auto-hoisting nested tool outputs from `codex.tool(...)` into the outer `js_repl` function output.
- Keep `codex.tool(...)` return values unchanged as structured JS objects.
- Add `codex.emitImage(...)` as the explicit path for attaching an image to the outer `js_repl` function output.
- Support emitting from a direct image URL, a single `input_image` item, an explicit `{ bytes, mimeType }` object, or a raw tool response object containing exactly one image.
- Preserve existing `view_image` original-resolution behavior when JS emits the raw `view_image` tool result.
- Suppress the special `ViewImageToolCall` event for `js_repl`-sourced `view_image` calls so nested inspection stays side-effect free until JS explicitly emits.
- Update the `js_repl` docs and generated project instructions with both recommended patterns: - `await codex.emitImage(codex.tool("view_image", { path }))` - `await codex.emitImage({ bytes: await page.screenshot({ type: "jpeg", quality: 85 }), mimeType: "image/jpeg" })` #### [git stack](https://github.com/magus/git-stack-cli) - ✅ `1` https://github.com/openai/codex/pull/13050 - 👉 `2` https://github.com/openai/codex/pull/13331 - ⏳ `3` https://github.com/openai/codex/pull/13049 --- codex-rs/core/src/project_doc.rs | 18 +- codex-rs/core/src/tools/js_repl/kernel.js | 317 ++++++++++- codex-rs/core/src/tools/js_repl/mod.rs | 614 +++++++++++++++++++--- codex-rs/core/tests/suite/client.rs | 123 +++++ codex-rs/core/tests/suite/view_image.rs | 122 ++++- docs/js_repl.md | 7 +- 6 files changed, 1124 insertions(+), 77 deletions(-) diff --git a/codex-rs/core/src/project_doc.rs b/codex-rs/core/src/project_doc.rs index 6370b1544..debe39c2a 100644 --- a/codex-rs/core/src/project_doc.rs +++ b/codex-rs/core/src/project_doc.rs @@ -52,9 +52,13 @@ fn render_js_repl_instructions(config: &Config) -> Option { "- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n", ); section.push_str("- `js_repl` is a freeform/custom tool. Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n"); - section.push_str("- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n"); - section.push_str("- `codex.tool` executes a normal tool call and resolves to the raw tool output object. 
Use it for shell and non-shell tools alike.\n"); - section.push_str("- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n"); + section.push_str( + "- Helpers: `codex.tmpDir`, `codex.tool(name, args?)`, and `codex.emitImage(imageLike)`.\n", + ); + section.push_str("- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike. Nested tool outputs stay inside JavaScript unless you emit them explicitly.\n"); + section.push_str("- `codex.emitImage(...)` adds exactly one image to the outer `js_repl` function output. It accepts a direct image URL, a single `input_image` item, an object like `{ bytes, mimeType }`, or a raw tool response object with exactly one image and no text. It rejects mixed text-and-image content.\n"); + section.push_str("- Example of sharing an in-memory Playwright screenshot: `await codex.emitImage({ bytes: await page.screenshot({ type: \"jpeg\", quality: 85 }), mimeType: \"image/jpeg\" })`.\n"); + section.push_str("- Example of sharing a local image tool result: `await codex.emitImage(codex.tool(\"view_image\", { path: \"/absolute/path\" }))`.\n"); if config.features.enabled(Feature::ImageDetailOriginal) { section.push_str("- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n"); } @@ -67,7 +71,7 @@ fn render_js_repl_instructions(config: &Config) -> Option { .push_str("- MCP tools (if any) can also be called by name via `codex.tool(...)`.\n"); } - section.push_str("- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. 
Use `console.log` and `codex.tool(...)`."); + section.push_str("- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. Use `console.log`, `codex.tool(...)`, and `codex.emitImage(...)`."); Some(section) } @@ -476,7 +480,7 @@ mod tests { let res = get_user_instructions(&cfg, None) .await .expect("js_repl instructions expected"); - let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. Use `console.log` and `codex.tool(...)`."; + let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. 
Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir`, `codex.tool(name, args?)`, and `codex.emitImage(imageLike)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike. Nested tool outputs stay inside JavaScript unless you emit them explicitly.\n- `codex.emitImage(...)` adds exactly one image to the outer `js_repl` function output. It accepts a direct image URL, a single `input_image` item, an object like `{ bytes, mimeType }`, or a raw tool response object with exactly one image and no text. It rejects mixed text-and-image content.\n- Example of sharing an in-memory Playwright screenshot: `await codex.emitImage({ bytes: await page.screenshot({ type: \"jpeg\", quality: 85 }), mimeType: \"image/jpeg\" })`.\n- Example of sharing a local image tool result: `await codex.emitImage(codex.tool(\"view_image\", { path: \"/absolute/path\" }))`.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. 
Use `console.log`, `codex.tool(...)`, and `codex.emitImage(...)`."; assert_eq!(res, expected); } @@ -491,7 +495,7 @@ mod tests { let res = get_user_instructions(&cfg, None) .await .expect("js_repl instructions expected"); - let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Do not call tools directly; use `js_repl` + `codex.tool(...)` for all tool calls, including shell commands.\n- MCP tools (if any) can also be called by name via `codex.tool(...)`.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. Use `console.log` and `codex.tool(...)`."; + let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. 
Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir`, `codex.tool(name, args?)`, and `codex.emitImage(imageLike)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike. Nested tool outputs stay inside JavaScript unless you emit them explicitly.\n- `codex.emitImage(...)` adds exactly one image to the outer `js_repl` function output. It accepts a direct image URL, a single `input_image` item, an object like `{ bytes, mimeType }`, or a raw tool response object with exactly one image and no text. It rejects mixed text-and-image content.\n- Example of sharing an in-memory Playwright screenshot: `await codex.emitImage({ bytes: await page.screenshot({ type: \"jpeg\", quality: 85 }), mimeType: \"image/jpeg\" })`.\n- Example of sharing a local image tool result: `await codex.emitImage(codex.tool(\"view_image\", { path: \"/absolute/path\" }))`.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Do not call tools directly; use `js_repl` + `codex.tool(...)` for all tool calls, including shell commands.\n- MCP tools (if any) can also be called by name via `codex.tool(...)`.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. 
Use `console.log`, `codex.tool(...)`, and `codex.emitImage(...)`."; assert_eq!(res, expected); } @@ -506,7 +510,7 @@ mod tests { let res = get_user_instructions(&cfg, None) .await .expect("js_repl instructions expected"); - let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir` and `codex.tool(name, args?)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike.\n- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool(\"view_image\", { path: \"/absolute/path\" })`, then delete the file.\n- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. Use `console.log` and `codex.tool(...)`."; + let expected = "## JavaScript REPL (Node)\n- Use `js_repl` for Node-backed JavaScript with top-level await in a persistent kernel.\n- `js_repl` is a freeform/custom tool. 
Direct `js_repl` calls must send raw JavaScript tool input (optionally with first-line `// codex-js-repl: timeout_ms=15000`). Do not wrap code in JSON (for example `{\"code\":\"...\"}`), quotes, or markdown code fences.\n- Helpers: `codex.tmpDir`, `codex.tool(name, args?)`, and `codex.emitImage(imageLike)`.\n- `codex.tool` executes a normal tool call and resolves to the raw tool output object. Use it for shell and non-shell tools alike. Nested tool outputs stay inside JavaScript unless you emit them explicitly.\n- `codex.emitImage(...)` adds exactly one image to the outer `js_repl` function output. It accepts a direct image URL, a single `input_image` item, an object like `{ bytes, mimeType }`, or a raw tool response object with exactly one image and no text. It rejects mixed text-and-image content.\n- Example of sharing an in-memory Playwright screenshot: `await codex.emitImage({ bytes: await page.screenshot({ type: \"jpeg\", quality: 85 }), mimeType: \"image/jpeg\" })`.\n- Example of sharing a local image tool result: `await codex.emitImage(codex.tool(\"view_image\", { path: \"/absolute/path\" }))`.\n- When generating or converting images for `view_image` in `js_repl`, prefer JPEG at 85% quality unless lossless quality is strictly required; other formats can be used if the user requests them. This keeps uploads smaller and reduces the chance of hitting image size caps.\n- Top-level bindings persist across cells. If you hit `SyntaxError: Identifier 'x' has already been declared`, reuse the binding, pick a new name, wrap in `{ ... }` for block scope, or reset the kernel with `js_repl_reset`.\n- Top-level static import declarations (for example `import x from \"pkg\"`) are currently unsupported in `js_repl`; use dynamic imports with `await import(\"pkg\")` instead.\n- Avoid direct access to `process.stdout` / `process.stderr` / `process.stdin`; it can corrupt the JSON line protocol. 
Use `console.log`, `codex.tool(...)`, and `codex.emitImage(...)`."; assert_eq!(res, expected); } diff --git a/codex-rs/core/src/tools/js_repl/kernel.js b/codex-rs/core/src/tools/js_repl/kernel.js index f5459fde7..038783fb2 100644 --- a/codex-rs/core/src/tools/js_repl/kernel.js +++ b/codex-rs/core/src/tools/js_repl/kernel.js @@ -115,7 +115,10 @@ function isDeniedBuiltin(specifier) { /** @type {Map void>} */ const pendingTool = new Map(); +/** @type {Map void>} */ +const pendingEmitImage = new Map(); let toolCounter = 0; +let emitImageCounter = 0; const tmpDir = process.env.CODEX_JS_TMP_DIR || process.cwd(); const nodeModuleDirEnv = process.env.CODEX_JS_REPL_NODE_MODULE_DIRS ?? ""; const moduleSearchBases = (() => { @@ -480,8 +483,255 @@ function withCapturedConsole(ctx, fn) { }); } +function isPlainObject(value) { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function toByteArray(value) { + if (value instanceof Uint8Array) { + return value; + } + if (value instanceof ArrayBuffer) { + return new Uint8Array(value); + } + if (ArrayBuffer.isView(value)) { + return new Uint8Array(value.buffer, value.byteOffset, value.byteLength); + } + return null; +} + +function encodeByteImage(bytes, mimeType, detail) { + if (bytes.byteLength === 0) { + throw new Error("codex.emitImage expected non-empty bytes"); + } + if (typeof mimeType !== "string" || !mimeType) { + throw new Error("codex.emitImage expected a non-empty mimeType"); + } + const image_url = `data:${mimeType};base64,${Buffer.from(bytes).toString("base64")}`; + return { image_url, detail }; +} + +function parseImageDetail(detail) { + if (typeof detail === "undefined") { + return undefined; + } + if (typeof detail !== "string" || !detail) { + throw new Error("codex.emitImage expected detail to be a non-empty string"); + } + if ( + detail !== "auto" && + detail !== "low" && + detail !== "high" && + detail !== "original" + ) { + throw new Error( + 'codex.emitImage expected detail to be 
one of "auto", "low", "high", or "original"', + ); + } + return detail; +} + +function parseInputImageItem(value) { + if (!isPlainObject(value) || value.type !== "input_image") { + return null; + } + if (typeof value.image_url !== "string" || !value.image_url) { + throw new Error("codex.emitImage expected a non-empty image_url"); + } + return { + images: [{ image_url: value.image_url, detail: parseImageDetail(value.detail) }], + textCount: 0, + }; +} + +function parseContentItems(items) { + if (!Array.isArray(items)) { + return null; + } + + const images = []; + let textCount = 0; + for (const item of items) { + if (!isPlainObject(item) || typeof item.type !== "string") { + throw new Error("codex.emitImage received malformed content items"); + } + if (item.type === "input_image") { + if (typeof item.image_url !== "string" || !item.image_url) { + throw new Error("codex.emitImage expected a non-empty image_url"); + } + images.push({ + image_url: item.image_url, + detail: parseImageDetail(item.detail), + }); + continue; + } + if (item.type === "input_text" || item.type === "output_text") { + textCount += 1; + continue; + } + throw new Error( + `codex.emitImage does not support content item type "${item.type}"`, + ); + } + + return { images, textCount }; +} + +function parseByteImageValue(value) { + if (!isPlainObject(value) || !("bytes" in value)) { + return null; + } + const bytes = toByteArray(value.bytes); + if (!bytes) { + throw new Error( + "codex.emitImage expected bytes to be Buffer, Uint8Array, ArrayBuffer, or ArrayBufferView", + ); + } + const detail = parseImageDetail(value.detail); + return encodeByteImage(bytes, value.mimeType, detail); +} + +function parseToolOutput(output) { + if (typeof output === "string") { + return { + images: [], + textCount: output.length > 0 ? 
1 : 0, + }; + } + + const parsedItems = parseContentItems(output); + if (parsedItems) { + return parsedItems; + } + + throw new Error("codex.emitImage received an unsupported tool output shape"); +} + +function normalizeMcpImageData(data, mimeType) { + if (typeof data !== "string" || !data) { + throw new Error("codex.emitImage expected MCP image data"); + } + if (data.startsWith("data:")) { + return data; + } + const normalizedMimeType = + typeof mimeType === "string" && mimeType ? mimeType : "application/octet-stream"; + return `data:${normalizedMimeType};base64,${data}`; +} + +function parseMcpToolResult(result) { + if (typeof result === "string") { + return { images: [], textCount: result.length > 0 ? 1 : 0 }; + } + + if (!isPlainObject(result)) { + throw new Error("codex.emitImage received an unsupported MCP result"); + } + + if ("Err" in result) { + const error = result.Err; + return { images: [], textCount: typeof error === "string" && error ? 1 : 0 }; + } + + if (!("Ok" in result)) { + throw new Error("codex.emitImage received an unsupported MCP result"); + } + + const ok = result.Ok; + if (!isPlainObject(ok) || !Array.isArray(ok.content)) { + throw new Error("codex.emitImage received malformed MCP content"); + } + + const images = []; + let textCount = 0; + for (const item of ok.content) { + if (!isPlainObject(item) || typeof item.type !== "string") { + throw new Error("codex.emitImage received malformed MCP content"); + } + if (item.type === "image") { + images.push({ + image_url: normalizeMcpImageData(item.data, item.mimeType ?? 
item.mime_type), + }); + continue; + } + if (item.type === "text") { + textCount += 1; + continue; + } + throw new Error( + `codex.emitImage does not support MCP content type "${item.type}"`, + ); + } + + return { images, textCount }; +} + +function requireSingleImage(parsed) { + if (parsed.textCount > 0) { + throw new Error("codex.emitImage does not accept mixed text and image content"); + } + if (parsed.images.length !== 1) { + throw new Error("codex.emitImage expected exactly one image"); + } + return parsed.images[0]; +} + +function normalizeEmitImageValue(value) { + if (typeof value === "string") { + if (!value) { + throw new Error("codex.emitImage expected a non-empty image URL"); + } + return { image_url: value }; + } + + const directItem = parseInputImageItem(value); + if (directItem) { + return requireSingleImage(directItem); + } + + const byteImage = parseByteImageValue(value); + if (byteImage) { + return byteImage; + } + + const directItems = parseContentItems(value); + if (directItems) { + return requireSingleImage(directItems); + } + + if (!isPlainObject(value)) { + throw new Error("codex.emitImage received an unsupported value"); + } + + if (value.type === "message") { + return requireSingleImage(parseContentItems(value.content)); + } + + if ( + value.type === "function_call_output" || + value.type === "custom_tool_call_output" + ) { + return requireSingleImage(parseToolOutput(value.output)); + } + + if (value.type === "mcp_tool_call_output") { + return requireSingleImage(parseMcpToolResult(value.result)); + } + + if ("output" in value) { + return requireSingleImage(parseToolOutput(value.output)); + } + + if ("content" in value) { + return requireSingleImage(parseContentItems(value.content)); + } + + throw new Error("codex.emitImage received an unsupported value"); +} + async function handleExec(message) { activeExecId = message.id; + const pendingBackgroundTasks = new Set(); const tool = (toolName, args) => { if (typeof toolName !== "string" || 
!toolName) { return Promise.reject(new Error("codex.tool expects a tool name string")); @@ -512,13 +762,57 @@ async function handleExec(message) { }); }); }; + const emitImage = (imageLike) => { + const operation = (async () => { + const normalized = normalizeEmitImageValue(await imageLike); + const id = `${message.id}-emit-image-${emitImageCounter++}`; + const payload = { + type: "emit_image", + id, + exec_id: message.id, + image_url: normalized.image_url, + detail: normalized.detail ?? null, + }; + send(payload); + return new Promise((resolve, reject) => { + pendingEmitImage.set(id, (res) => { + if (!res.ok) { + reject(new Error(res.error || "emitImage failed")); + return; + } + resolve(); + }); + }); + })(); + + const observation = { observed: false }; + const trackedOperation = operation.then( + () => ({ ok: true, error: null, observation }), + (error) => ({ ok: false, error, observation }), + ); + pendingBackgroundTasks.add(trackedOperation); + return { + then(onFulfilled, onRejected) { + observation.observed = true; + return operation.then(onFulfilled, onRejected); + }, + catch(onRejected) { + observation.observed = true; + return operation.catch(onRejected); + }, + finally(onFinally) { + observation.observed = true; + return operation.finally(onFinally); + }, + }; + }; try { const code = typeof message.code === "string" ? 
message.code : ""; const { source, nextBindings } = await buildModuleSource(code); let output = ""; - context.codex = { tmpDir, tool }; + context.codex = { tmpDir, tool, emitImage }; context.tmpDir = tmpDir; await withCapturedConsole(context, async (logs) => { @@ -558,6 +852,15 @@ async function handleExec(message) { }); await module.evaluate(); + if (pendingBackgroundTasks.size > 0) { + const backgroundResults = await Promise.all([...pendingBackgroundTasks]); + const firstUnhandledBackgroundError = backgroundResults.find( + (result) => !result.ok && !result.observation.observed, + ); + if (firstUnhandledBackgroundError) { + throw firstUnhandledBackgroundError.error; + } + } previousModule = module; previousBindings = nextBindings; output = logs.join("\n"); @@ -593,6 +896,14 @@ function handleToolResult(message) { } } +function handleEmitImageResult(message) { + const resolver = pendingEmitImage.get(message.id); + if (resolver) { + pendingEmitImage.delete(message.id); + resolver(message); + } +} + let queue = Promise.resolve(); process.on("uncaughtException", (error) => { @@ -622,5 +933,9 @@ input.on("line", (line) => { } if (message.type === "run_tool_result") { handleToolResult(message); + return; + } + if (message.type === "emit_image_result") { + handleEmitImageResult(message); } }); diff --git a/codex-rs/core/src/tools/js_repl/mod.rs b/codex-rs/core/src/tools/js_repl/mod.rs index ea9f42ce6..a8a912307 100644 --- a/codex-rs/core/src/tools/js_repl/mod.rs +++ b/codex-rs/core/src/tools/js_repl/mod.rs @@ -12,6 +12,7 @@ use codex_protocol::ThreadId; use codex_protocol::models::ContentItem; use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ImageDetail; use codex_protocol::models::ResponseInputItem; use serde::Deserialize; use serde::Serialize; @@ -35,6 +36,7 @@ use crate::codex::Session; use crate::codex::TurnContext; use crate::exec::ExecExpiration; use 
crate::exec_env::create_env; +use crate::features::Feature; use crate::function_tool::FunctionCallError; use crate::sandboxing::CommandSpec; use crate::sandboxing::SandboxManager; @@ -372,18 +374,14 @@ impl JsReplManager { Some(state.cancel.clone()) } - async fn record_exec_tool_call_content_items( + async fn record_exec_content_item( exec_tool_calls: &Arc>>, exec_id: &str, - content_items: Vec, + content_item: FunctionCallOutputContentItem, ) { - if content_items.is_empty() { - return; - } - let mut calls = exec_tool_calls.lock().await; if let Some(state) = calls.get_mut(exec_id) { - state.content_items.extend(content_items); + state.content_items.push(content_item); } } @@ -1131,6 +1129,49 @@ impl JsReplManager { exec_contexts.lock().await.remove(&id); JsReplManager::clear_exec_tool_calls_map(&exec_tool_calls, &id).await; } + KernelToHost::EmitImage(req) => { + let exec_id = req.exec_id.clone(); + let emit_id = req.id.clone(); + let response = + if let Some(ctx) = exec_contexts.lock().await.get(&exec_id).cloned() { + let content_item = emitted_image_content_item( + ctx.turn.as_ref(), + req.image_url, + req.detail, + ); + JsReplManager::record_exec_content_item( + &exec_tool_calls, + &exec_id, + content_item, + ) + .await; + HostToKernel::EmitImageResult(EmitImageResult { + id: emit_id, + ok: true, + error: None, + }) + } else { + HostToKernel::EmitImageResult(EmitImageResult { + id: emit_id, + ok: false, + error: Some("js_repl exec context not found".to_string()), + }) + }; + + if let Err(err) = JsReplManager::write_message(&stdin, &response).await { + let snapshot = + JsReplManager::kernel_debug_snapshot(&child, &recent_stderr).await; + warn!( + exec_id = %exec_id, + emit_id = %req.id, + error = %err, + kernel_pid = ?snapshot.pid, + kernel_status = %snapshot.status, + kernel_stderr_tail = %snapshot.stderr_tail, + "failed to reply to kernel emit_image request" + ); + } + } KernelToHost::RunTool(req) => { let Some(reset_cancel) = 
JsReplManager::begin_exec_tool_call(&exec_tool_calls, &req.exec_id).await @@ -1176,11 +1217,7 @@ impl JsReplManager { response: None, error: Some("js_repl execution reset".to_string()), }, - result = JsReplManager::run_tool_request( - ctx, - req, - Arc::clone(&exec_tool_calls_for_task), - ) => result, + result = JsReplManager::run_tool_request(ctx, req) => result, } } None => RunToolResult { @@ -1274,11 +1311,7 @@ impl JsReplManager { } } - async fn run_tool_request( - exec: ExecContext, - req: RunToolRequest, - exec_tool_calls: Arc>>, - ) -> RunToolResult { + async fn run_tool_request(exec: ExecContext, req: RunToolRequest) -> RunToolResult { if is_js_repl_internal_tool(&req.tool_name) { let error = "js_repl cannot invoke itself".to_string(); let summary = Self::summarize_tool_call_error(&error); @@ -1351,15 +1384,6 @@ impl JsReplManager { .await { Ok(response) => { - if let Some(items) = response_content_items(&response) { - Self::record_exec_tool_call_content_items( - &exec_tool_calls, - &req.exec_id, - items, - ) - .await; - } - let summary = Self::summarize_tool_call_response(&response); match serde_json::to_value(response) { Ok(value) => { @@ -1432,24 +1456,23 @@ impl JsReplManager { } } -fn response_content_items( - response: &ResponseInputItem, -) -> Option> { - match response { - ResponseInputItem::FunctionCallOutput { output, .. } - | ResponseInputItem::CustomToolCallOutput { output, .. } => output - .content_items() - .map(<[FunctionCallOutputContentItem]>::to_vec), - ResponseInputItem::McpToolCallOutput { result, .. } => match result { - Ok(result) => FunctionCallOutputPayload::from(result) - .content_items() - .map(<[FunctionCallOutputContentItem]>::to_vec), - Err(_) => None, - }, - ResponseInputItem::Message { .. 
} => None, +fn emitted_image_content_item( + turn: &TurnContext, + image_url: String, + detail: Option, +) -> FunctionCallOutputContentItem { + FunctionCallOutputContentItem::InputImage { + image_url, + detail: detail.or_else(|| default_output_image_detail_for_turn(turn)), } } +fn default_output_image_detail_for_turn(turn: &TurnContext) -> Option { + (turn.config.features.enabled(Feature::ImageDetailOriginal) + && turn.model_info.supports_image_detail_original) + .then_some(ImageDetail::Original) +} + fn build_exec_result_content_items( output: String, content_items: Vec, @@ -1497,6 +1520,7 @@ enum KernelToHost { error: Option, }, RunTool(RunToolRequest), + EmitImage(EmitImageRequest), } #[derive(Clone, Debug, Serialize)] @@ -1509,6 +1533,7 @@ enum HostToKernel { timeout_ms: Option, }, RunToolResult(RunToolResult), + EmitImageResult(EmitImageResult), } #[derive(Clone, Debug, Deserialize)] @@ -1529,6 +1554,23 @@ struct RunToolResult { error: Option, } +#[derive(Clone, Debug, Deserialize)] +struct EmitImageRequest { + id: String, + exec_id: String, + image_url: String, + #[serde(default)] + detail: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct EmitImageResult { + id: String, + ok: bool, + #[serde(default)] + error: Option, +} + #[derive(Debug)] enum ExecResultMessage { Ok { @@ -1667,6 +1709,7 @@ mod tests { use super::*; use crate::codex::make_session_and_context; use crate::codex::make_session_and_context_with_dynamic_tools_and_rx; + use crate::features::Feature; use crate::protocol::AskForApproval; use crate::protocol::EventMsg; use crate::protocol::SandboxPolicy; @@ -1676,6 +1719,7 @@ mod tests { use codex_protocol::dynamic_tools::DynamicToolSpec; use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; + use codex_protocol::models::ImageDetail; use codex_protocol::models::ResponseInputItem; use codex_protocol::openai_models::InputModality; use pretty_assertions::assert_eq; @@ -1923,6 +1967,43 @@ 
mod tests { ); } + #[tokio::test] + async fn emitted_image_content_item_preserves_explicit_detail() { + let (_session, turn) = make_session_and_context().await; + let content_item = emitted_image_content_item( + &turn, + "data:image/png;base64,AAA".to_string(), + Some(ImageDetail::Low), + ); + assert_eq!( + content_item, + FunctionCallOutputContentItem::InputImage { + image_url: "data:image/png;base64,AAA".to_string(), + detail: Some(ImageDetail::Low), + } + ); + } + + #[tokio::test] + async fn emitted_image_content_item_uses_turn_original_detail_when_enabled() { + let (_session, mut turn) = make_session_and_context().await; + Arc::make_mut(&mut turn.config) + .features + .enable(Feature::ImageDetailOriginal); + turn.model_info.supports_image_detail_original = true; + + let content_item = + emitted_image_content_item(&turn, "data:image/png;base64,AAA".to_string(), None); + + assert_eq!( + content_item, + FunctionCallOutputContentItem::InputImage { + image_url: "data:image/png;base64,AAA".to_string(), + detail: Some(ImageDetail::Original), + } + ); + } + #[test] fn summarize_tool_call_response_for_multimodal_custom_output() { let response = ResponseInputItem::CustomToolCallOutput { @@ -2361,7 +2442,7 @@ console.log("cell-complete"); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn js_repl_can_attach_image_via_view_image_tool() -> anyhow::Result<()> { + async fn js_repl_does_not_auto_attach_image_via_view_image_tool() -> anyhow::Result<()> { if !can_run_js_repl_runtime_tests().await { return Ok(()); } @@ -2398,7 +2479,65 @@ const png = Buffer.from( await fs.writeFile(imagePath, png); const out = await codex.tool("view_image", { path: imagePath }); console.log(out.type); -console.log(out.output?.body?.text ?? 
""); +"#; + + let result = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await?; + assert!(result.output.contains("function_call_output")); + assert!(result.content_items.is_empty()); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn js_repl_can_emit_image_via_view_image_tool() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, mut turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + turn.approval_policy + .set(AskForApproval::Never) + .expect("test setup should allow updating approval policy"); + turn.sandbox_policy + .set(SandboxPolicy::DangerFullAccess) + .expect("test setup should allow updating sandbox policy"); + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +const fs = await import("node:fs/promises"); +const path = await import("node:path"); +const imagePath = path.join(codex.tmpDir, "js-repl-view-image-explicit.png"); +const png = Buffer.from( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==", + "base64" +); +await fs.writeFile(imagePath, png); +const out = await codex.tool("view_image", { path: imagePath }); +await codex.emitImage(out); +console.log(out.type); "#; let result = manager @@ -2423,20 +2562,374 @@ console.log(out.output?.body?.text ?? 
""); }] .as_slice() ); - let [FunctionCallOutputContentItem::InputImage { image_url, detail }] = - result.content_items.as_slice() - else { - panic!("view_image should return exactly one input_image content item"); - }; - assert!(image_url.starts_with("data:image/png;base64,")); - assert_eq!(*detail, None); assert!(session.get_pending_input().await.is_empty()); Ok(()) } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn js_repl_can_attach_image_via_dynamic_tool_with_mixed_content() -> anyhow::Result<()> { + async fn js_repl_can_emit_image_from_bytes_and_mime_type() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +const png = Buffer.from( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==", + "base64" +); +await codex.emitImage({ bytes: png, mimeType: "image/png" }); +"#; + + let result = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await?; + assert_eq!( + result.content_items.as_slice(), + [FunctionCallOutputContentItem::InputImage { + image_url: + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==" + .to_string(), + detail: None, + }] + .as_slice() + ); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn 
js_repl_can_emit_multiple_images_in_one_cell() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +await codex.emitImage( + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==" +); +await codex.emitImage( + "data:image/gif;base64,R0lGODdhAQABAIAAAP///////ywAAAAAAQABAAACAkQBADs=" +); +"#; + + let result = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await?; + assert_eq!( + result.content_items.as_slice(), + [ + FunctionCallOutputContentItem::InputImage { + image_url: + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==" + .to_string(), + detail: None, + }, + FunctionCallOutputContentItem::InputImage { + image_url: + "data:image/gif;base64,R0lGODdhAQABAIAAAP///////ywAAAAAAQABAAACAkQBADs=" + .to_string(), + detail: None, + }, + ] + .as_slice() + ); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn js_repl_waits_for_unawaited_emit_image_before_completion() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn 
= Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +void codex.emitImage( + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==" +); +console.log("cell-complete"); +"#; + + let result = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await?; + assert!(result.output.contains("cell-complete")); + assert_eq!( + result.content_items.as_slice(), + [FunctionCallOutputContentItem::InputImage { + image_url: + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==" + .to_string(), + detail: None, + }] + .as_slice() + ); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn js_repl_unawaited_emit_image_errors_fail_cell() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +void codex.emitImage({ bytes: new Uint8Array(), mimeType: "image/png" }); +console.log("cell-complete"); +"#; + + let err = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await + .expect_err("unawaited invalid emitImage 
should fail"); + assert!(err.to_string().contains("expected non-empty bytes")); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn js_repl_caught_emit_image_error_does_not_fail_cell() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +try { + await codex.emitImage({ bytes: new Uint8Array(), mimeType: "image/png" }); +} catch (error) { + console.log(error.message); +} +console.log("cell-complete"); +"#; + + let result = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await?; + assert!(result.output.contains("expected non-empty bytes")); + assert!(result.output.contains("cell-complete")); + assert!(result.content_items.is_empty()); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn js_repl_emit_image_requires_explicit_mime_type_for_bytes() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = 
Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +const png = Buffer.from( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==", + "base64" +); +await codex.emitImage({ bytes: png }); +"#; + + let err = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await + .expect_err("missing mimeType should fail"); + assert!(err.to_string().contains("expected a non-empty mimeType")); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn js_repl_emit_image_rejects_invalid_detail() -> anyhow::Result<()> { + if !can_run_js_repl_runtime_tests().await { + return Ok(()); + } + + let (session, turn) = make_session_and_context().await; + if !turn + .model_info + .input_modalities + .contains(&InputModality::Image) + { + return Ok(()); + } + + let session = Arc::new(session); + let turn = Arc::new(turn); + *session.active_turn.lock().await = Some(crate::state::ActiveTurn::default()); + + let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::default())); + let manager = turn.js_repl.manager().await?; + let code = r#" +const png = Buffer.from( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==", + "base64" +); +await codex.emitImage({ bytes: png, mimeType: "image/png", detail: "ultra" }); +"#; + + let err = manager + .execute( + Arc::clone(&session), + turn, + tracker, + JsReplArgs { + code: code.to_string(), + timeout_ms: Some(15_000), + }, + ) + .await + .expect_err("invalid detail should fail"); + assert!(err.to_string().contains("expected detail to be one of")); + assert!(session.get_pending_input().await.is_empty()); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn 
js_repl_emit_image_rejects_mixed_content() -> anyhow::Result<()> { if !can_run_js_repl_runtime_tests().await { return Ok(()); } @@ -2466,7 +2959,7 @@ console.log(out.output?.body?.text ?? ""); let manager = turn.js_repl.manager().await?; let code = r#" const out = await codex.tool("inline_image", {}); -console.log(out.type); +await codex.emitImage(out); "#; let image_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg=="; @@ -2509,19 +3002,10 @@ console.log(out.type); response_watcher, ); response_watcher_result?; - let result = result?; - assert!(result.output.contains("function_call_output")); - assert_eq!( - result.content_items, - vec![ - FunctionCallOutputContentItem::InputText { - text: "inline image note".to_string(), - }, - FunctionCallOutputContentItem::InputImage { - image_url: image_url.to_string(), - detail: None, - }, - ] + let err = result.expect_err("mixed content should fail"); + assert!( + err.to_string() + .contains("does not accept mixed text and image content") ); assert!(session.get_pending_input().await.is_empty()); diff --git a/codex-rs/core/tests/suite/client.rs b/codex-rs/core/tests/suite/client.rs index d4a8ce9f3..d6fde7e69 100644 --- a/codex-rs/core/tests/suite/client.rs +++ b/codex-rs/core/tests/suite/client.rs @@ -21,7 +21,9 @@ use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::Settings; use codex_protocol::config_types::Verbosity; use codex_protocol::models::ContentItem; +use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ImageDetail; use codex_protocol::models::LocalShellAction; use codex_protocol::models::LocalShellExecAction; use codex_protocol::models::LocalShellStatus; @@ -485,6 +487,127 @@ async fn resume_replays_legacy_js_repl_image_rollout_shapes() { assert!(legacy_image_index < new_user_index); } +#[tokio::test(flavor = 
"multi_thread", worker_threads = 2)] +async fn resume_replays_image_tool_outputs_with_detail() { + skip_if_no_network!(); + + let image_url = "data:image/webp;base64,UklGRiIAAABXRUJQVlA4IBYAAAAwAQCdASoBAAEAAUAmJaACdLoB+AADsAD+8ut//NgVzXPv9//S4P0uD9Lg/9KQAAA="; + let function_call_id = "view-image-call"; + let custom_call_id = "js-repl-call"; + let rollout = vec![ + RolloutLine { + timestamp: "2024-01-01T00:00:00.000Z".to_string(), + item: RolloutItem::SessionMeta(SessionMetaLine { + meta: SessionMeta { + id: ThreadId::default(), + timestamp: "2024-01-01T00:00:00Z".to_string(), + cwd: ".".into(), + originator: "test_originator".to_string(), + cli_version: "test_version".to_string(), + model_provider: Some("test-provider".to_string()), + ..Default::default() + }, + git: None, + }), + }, + RolloutLine { + timestamp: "2024-01-01T00:00:01.000Z".to_string(), + item: RolloutItem::ResponseItem(ResponseItem::FunctionCall { + id: None, + name: "view_image".to_string(), + arguments: "{\"path\":\"/tmp/example.webp\"}".to_string(), + call_id: function_call_id.to_string(), + }), + }, + RolloutLine { + timestamp: "2024-01-01T00:00:01.500Z".to_string(), + item: RolloutItem::ResponseItem(ResponseItem::FunctionCallOutput { + call_id: function_call_id.to_string(), + output: FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputImage { + image_url: image_url.to_string(), + detail: Some(ImageDetail::Original), + }, + ]), + }), + }, + RolloutLine { + timestamp: "2024-01-01T00:00:02.000Z".to_string(), + item: RolloutItem::ResponseItem(ResponseItem::CustomToolCall { + id: None, + status: Some("completed".to_string()), + call_id: custom_call_id.to_string(), + name: "js_repl".to_string(), + input: "console.log('image flow')".to_string(), + }), + }, + RolloutLine { + timestamp: "2024-01-01T00:00:02.500Z".to_string(), + item: RolloutItem::ResponseItem(ResponseItem::CustomToolCallOutput { + call_id: custom_call_id.to_string(), + output: 
FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputImage { + image_url: image_url.to_string(), + detail: Some(ImageDetail::Original), + }, + ]), + }), + }, + ]; + + let tmpdir = TempDir::new().unwrap(); + let session_path = tmpdir + .path() + .join("resume-image-tool-outputs-with-detail.jsonl"); + let mut file = std::fs::File::create(&session_path).unwrap(); + for line in rollout { + writeln!(file, "{}", serde_json::to_string(&line).unwrap()).unwrap(); + } + + let server = MockServer::start().await; + let resp_mock = mount_sse_once( + &server, + sse(vec![ev_response_created("resp1"), ev_completed("resp1")]), + ) + .await; + + let codex_home = Arc::new(TempDir::new().unwrap()); + let mut builder = test_codex().with_model("gpt-5.1"); + let test = builder + .resume(&server, codex_home, session_path.clone()) + .await + .expect("resume conversation"); + test.submit_turn("after resume").await.unwrap(); + + let function_output = resp_mock + .single_request() + .function_call_output(function_call_id); + assert_eq!( + function_output.get("output"), + Some(&serde_json::json!([ + { + "type": "input_image", + "image_url": image_url, + "detail": "original" + } + ])) + ); + + let custom_output = resp_mock + .single_request() + .custom_tool_call_output(custom_call_id); + assert_eq!( + custom_output.get("output"), + Some(&serde_json::json!([ + { + "type": "input_image", + "image_url": image_url, + "detail": "original" + } + ])) + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn includes_conversation_id_and_model_headers_in_request() { skip_if_no_network!(); diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index 9b73afbc1..b56e28d02 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -494,7 +494,7 @@ async fn view_image_tool_keeps_legacy_behavior_below_gpt5_3_codex() -> anyhow::R } #[tokio::test(flavor = "multi_thread", 
worker_threads = 2)] -async fn js_repl_view_image_tool_attaches_local_image() -> anyhow::Result<()> { +async fn js_repl_emit_image_attaches_local_image() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); let server = start_mock_server().await; @@ -519,7 +519,7 @@ const png = Buffer.from( ); await fs.writeFile(imagePath, png); const out = await codex.tool("view_image", { path: imagePath }); -console.log(out.output?.body?.text ?? ""); +await codex.emitImage(out); "#; let first_response = sse(vec![ @@ -555,12 +555,29 @@ console.log(out.output?.body?.text ?? ""); }) .await?; + let mut tool_event = None; wait_for_event_with_timeout( &codex, - |event| matches!(event, EventMsg::TurnComplete(_)), + |event| match event { + EventMsg::ViewImageToolCall(_) => { + tool_event = Some(event.clone()); + false + } + EventMsg::TurnComplete(_) => true, + _ => false, + }, Duration::from_secs(10), ) .await; + let tool_event = match tool_event { + Some(EventMsg::ViewImageToolCall(event)) => event, + other => panic!("expected ViewImageToolCall event, got {other:?}"), + }; + assert!( + tool_event.path.ends_with("js-repl-view-image.png"), + "unexpected image path: {}", + tool_event.path.display() + ); let req = mock.single_request(); let body = req.body_json(); @@ -591,6 +608,105 @@ console.log(out.output?.body?.text ?? ""); Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn js_repl_view_image_requires_explicit_emit() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_mock_server().await; + let mut builder = test_codex().with_config(|config| { + config.features.enable(Feature::JsRepl); + }); + let TestCodex { + codex, + cwd, + session_configured, + .. 
+ } = builder.build(&server).await?; + + let call_id = "js-repl-view-image-no-emit"; + let js_input = r#" +const fs = await import("node:fs/promises"); +const path = await import("node:path"); +const imagePath = path.join(codex.tmpDir, "js-repl-view-image-no-emit.png"); +const png = Buffer.from( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==", + "base64" +); +await fs.writeFile(imagePath, png); +const out = await codex.tool("view_image", { path: imagePath }); +console.log(out.type); +"#; + + let first_response = sse(vec![ + ev_response_created("resp-1"), + ev_custom_tool_call(call_id, "js_repl", js_input), + ev_completed("resp-1"), + ]); + responses::mount_sse_once(&server, first_response).await; + + let second_response = sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]); + let mock = responses::mount_sse_once(&server, second_response).await; + + let session_model = session_configured.model.clone(); + codex + .submit(Op::UserTurn { + items: vec![UserInput::Text { + text: "use js_repl to write an image but do not emit it".into(), + text_elements: Vec::new(), + }], + final_output_json_schema: None, + cwd: cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy: SandboxPolicy::DangerFullAccess, + model: session_model, + effort: None, + service_tier: None, + summary: None, + collaboration_mode: None, + personality: None, + }) + .await?; + + let mut tool_event = None; + wait_for_event_with_timeout( + &codex, + |event| match event { + EventMsg::ViewImageToolCall(_) => { + tool_event = Some(event.clone()); + false + } + EventMsg::TurnComplete(_) => true, + _ => false, + }, + Duration::from_secs(10), + ) + .await; + let tool_event = match tool_event { + Some(EventMsg::ViewImageToolCall(event)) => event, + other => panic!("expected ViewImageToolCall event, got {other:?}"), + }; + assert!( + tool_event.path.ends_with("js-repl-view-image-no-emit.png"), + 
"unexpected image path: {}", + tool_event.path.display() + ); + + let req = mock.single_request(); + let custom_output = req.custom_tool_call_output(call_id); + let output_items = custom_output.get("output").and_then(Value::as_array); + assert!( + output_items.is_none_or(|items| items + .iter() + .all(|item| item.get("type").and_then(Value::as_str) != Some("input_image"))), + "nested view_image should not auto-populate js_repl output" + ); + + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn view_image_tool_errors_when_path_is_directory() -> anyhow::Result<()> { skip_if_no_network!(Ok(())); diff --git a/docs/js_repl.md b/docs/js_repl.md index b36da5dc0..dd3390290 100644 --- a/docs/js_repl.md +++ b/docs/js_repl.md @@ -65,8 +65,13 @@ For `CODEX_JS_REPL_NODE_MODULE_DIRS` and `js_repl_node_module_dirs`, module reso - `codex.tmpDir`: per-session scratch directory path. - `codex.tool(name, args?)`: executes a normal Codex tool call from inside `js_repl` (including shell tools like `shell` / `shell_command` when available). +- `codex.emitImage(imageLike)`: explicitly adds exactly one image to the outer `js_repl` function output. - Each `codex.tool(...)` call emits a bounded summary at `info` level from the `codex_core::tools::js_repl` logger. At `trace` level, the same path also logs the exact raw response object or error string seen by JavaScript. -- To share generated images with the model, write a file under `codex.tmpDir`, call `await codex.tool("view_image", { path: "/absolute/path" })`, then delete the file. +- Nested `codex.tool(...)` outputs stay inside JavaScript unless you emit them explicitly. +- `codex.emitImage(...)` accepts a direct image URL, a single `input_image` item, an object like `{ bytes, mimeType }`, or a raw tool response object that contains exactly one image and no text. +- `codex.emitImage(...)` rejects mixed text-and-image content. 
+- Example of sharing an in-memory Playwright screenshot: `await codex.emitImage({ bytes: await page.screenshot({ type: "jpeg", quality: 85 }), mimeType: "image/jpeg" })`. +- Example of sharing a local image tool result: `await codex.emitImage(codex.tool("view_image", { path: "/absolute/path" }))`. Avoid writing directly to `process.stdout` / `process.stderr` / `process.stdin`; the kernel uses a JSON-line transport over stdio.