From 07c22d20f614838dbec1bc8066ec0a23f5e90f2a Mon Sep 17 00:00:00 2001
From: pakrym-oai <pakrym@openai.com>
Date: Tue, 10 Mar 2026 16:25:27 -0700
Subject: [PATCH] Add code_mode output helpers for text and images (#14244)

Summary
- document how code-mode can import `output_text`/`output_image` and
ensure `add_content` stays compatible
- add a synthetic `@openai/code_mode` module that appends content items
and validates inputs
- cover the new behavior with integration tests for structured text and
image outputs

Testing
- Not run (not requested)
---
 codex-rs/core/src/tools/code_mode.rs         |   6 +-
 codex-rs/core/src/tools/code_mode_bridge.js  |   4 +-
 codex-rs/core/src/tools/code_mode_runner.cjs |  87 ++++++++++++---
 codex-rs/core/src/tools/spec.rs              |   2 +-
 codex-rs/core/tests/suite/code_mode.rs       | 107 +++++++++++++++++++
 5 files changed, 187 insertions(+), 19 deletions(-)
diff --git a/codex-rs/core/src/tools/code_mode.rs b/codex-rs/core/src/tools/code_mode.rs
index cc6c0af07..abe11b248 100644
--- a/codex-rs/core/src/tools/code_mode.rs
+++ b/codex-rs/core/src/tools/code_mode.rs
@@ -94,13 +94,13 @@ pub(crate) fn instructions(config: &Config) -> Option<String> {
     section.push_str("- Direct tool calls remain available while `code_mode` is enabled.\n");
     section.push_str("- `code_mode` uses the same Node runtime resolution as `js_repl`. If needed, point `js_repl_node_path` at the Node binary you want Codex to use.\n");
     section.push_str("- Import nested tools from `tools.js`, for example `import { exec_command } from \"tools.js\"` or `import { tools } from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import { append_notebook_logs_chart } from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await exec_command(args)` remain available for compatibility. Nested tool calls resolve to their code-mode result values.\n");
-    section.push_str("- Import `set_max_output_tokens_per_exec_call` from `@openai/code_mode` to set the token budget used to truncate the final Rust-side result of the current `code_mode` execution. The default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker.\n");
+    section.push_str("- Import `{ output_text, output_image, set_max_output_tokens_per_exec_call }` from `@openai/code_mode`. `output_text(value)` surfaces text back to the model and stringifies non-string objects with `JSON.stringify(...)` when possible. `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs. `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `code_mode` execution; the default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker.\n");
     section.push_str(
         "- Function tools require JSON object arguments. Freeform tools require raw strings.\n",
     );
-    section.push_str("- `add_content(value)` is synchronous. It accepts a content item, an array of content items, or a string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`.\n");
+    section.push_str("- `add_content(value)` remains available for compatibility. It is synchronous and accepts a content item, an array of content items, or a string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`.\n");
     section
-        .push_str("- Only content passed to `add_content(value)` is surfaced back to the model.");
+        .push_str("- Only content passed to `output_text(...)`, `output_image(...)`, or `add_content(value)` is surfaced back to the model.");
     Some(section)
 }
 
diff --git a/codex-rs/core/src/tools/code_mode_bridge.js b/codex-rs/core/src/tools/code_mode_bridge.js
index dcc9bc5bc..362fc985b 100644
--- a/codex-rs/core/src/tools/code_mode_bridge.js
+++ b/codex-rs/core/src/tools/code_mode_bridge.js
@@ -1,6 +1,8 @@
 const __codexEnabledTools = __CODE_MODE_ENABLED_TOOLS_PLACEHOLDER__;
 const __codexEnabledToolNames = __codexEnabledTools.map((tool) => tool.tool_name);
-const __codexContentItems = [];
+const __codexContentItems = Array.isArray(globalThis.__codexContentItems)
+  ? globalThis.__codexContentItems // reuse the array already present on the global object
+  : []; // otherwise start with an empty list
 
 function __codexCloneContentItem(item) {
   if (!item || typeof item !== 'object') {
diff --git a/codex-rs/core/src/tools/code_mode_runner.cjs b/codex-rs/core/src/tools/code_mode_runner.cjs
index e66f1dffd..e66f9bdb7 100644
--- a/codex-rs/core/src/tools/code_mode_runner.cjs
+++ b/codex-rs/core/src/tools/code_mode_runner.cjs
@@ -157,6 +157,78 @@ function createToolsModule(context, callTool, enabledTools) {
   );
 }
 
+function ensureContentItems(context) { // returns the content-item array stored on context
+  if (!Array.isArray(context.__codexContentItems)) {
+    context.__codexContentItems = []; // a missing or non-array value is replaced by a fresh array
+  }
+  return context.__codexContentItems;
+}
+
+function serializeOutputText(value) { // converts any value to the text of an output_text item
+  if (typeof value === 'string') {
+    return value; // strings pass through unchanged, without JSON quoting
+  }
+  if (
+    typeof value === 'undefined' ||
+    value === null ||
+    typeof value === 'boolean' ||
+    typeof value === 'number' ||
+    typeof value === 'bigint'
+  ) {
+    return String(value); // primitives go through String(); JSON.stringify is never tried on them
+  }
+
+  const serialized = JSON.stringify(value); // may throw (e.g. circular structure); not caught here
+  if (typeof serialized === 'string') {
+    return serialized;
+  }
+
+  return String(value); // JSON.stringify yielded no string (e.g. a function or symbol input)
+}
+
+function normalizeOutputImageUrl(value) { // validates an output_image argument; returns it unchanged
+  if (typeof value !== 'string' || !value) { // rejects non-strings and the empty string
+    throw new TypeError('output_image expects a non-empty image URL string');
+  }
+  if (/^(?:https?:\/\/|data:)/i.test(value)) { // case-insensitive prefix check only, no URL parsing
+    return value;
+  }
+  throw new TypeError('output_image expects an http(s) or data URL');
+}
+
+function createCodeModeModule(context, state) { // returns a SyntheticModule with three exports
+  const outputText = (value) => {
+    const item = {
+      type: 'input_text',
+      text: serializeOutputText(value), // may throw; in that case nothing is appended
+    };
+    ensureContentItems(context).push(item); // appended to the array stored on context
+    return item; // the appended item is also returned to the caller
+  };
+  const outputImage = (value) => {
+    const item = {
+      type: 'input_image',
+      image_url: normalizeOutputImageUrl(value), // throws TypeError before anything is appended
+    };
+    ensureContentItems(context).push(item);
+    return item;
+  };
+
+  return new SyntheticModule(
+    ['output_text', 'output_image', 'set_max_output_tokens_per_exec_call'], // declared export names
+    function initCodeModeModule() {
+      this.setExport('output_text', outputText);
+      this.setExport('output_image', outputImage);
+      this.setExport('set_max_output_tokens_per_exec_call', (value) => {
+        const normalized = normalizeMaxOutputTokensPerExecCall(value); // helper defined outside this hunk
+        state.maxOutputTokensPerExecCall = normalized; // stored on the state object passed in
+        return normalized;
+      });
+    },
+    { context }
+  );
+}
+
 function namespacesMatch(left, right) {
   if (left.length !== right.length) {
     return false;
@@ -210,20 +282,6 @@ function createNamespacedToolsModule(context, callTool, enabledTools, namespace)
   );
 }
 
-function createCodeModeModule(context, state) {
-  return new SyntheticModule(
-    ['set_max_output_tokens_per_exec_call'],
-    function initCodeModeModule() {
-      this.setExport('set_max_output_tokens_per_exec_call', (value) => {
-        const normalized = normalizeMaxOutputTokensPerExecCall(value);
-        state.maxOutputTokensPerExecCall = normalized;
-        return normalized;
-      });
-    },
-    { context }
-  );
-}
-
 function createModuleResolver(context, callTool, enabledTools, state) {
   const toolsModule = createToolsModule(context, callTool, enabledTools);
   const codeModeModule = createCodeModeModule(context, state);
@@ -285,6 +343,7 @@ async function main() {
   };
   const callTool = createToolCaller(protocol);
   const context = vm.createContext({
+    __codexContentItems: [],
     __codex_tool_call: callTool,
   });
 
diff --git a/codex-rs/core/src/tools/spec.rs b/codex-rs/core/src/tools/spec.rs
index 3a9c9eb58..e303a22df 100644
--- a/codex-rs/core/src/tools/spec.rs
+++ b/codex-rs/core/src/tools/spec.rs
@@ -1620,7 +1620,7 @@ source: /[\s\S]+/
         enabled_tool_names.join(", ")
     };
     let description = format!(
-        "Runs JavaScript in a Node-backed `node:vm` context. This is a freeform tool: send raw JavaScript source text (no JSON/quotes/markdown fences). Direct tool calls remain available while `code_mode` is enabled. Inside JavaScript, import nested tools from `tools.js`, for example `import {{ exec_command }} from \"tools.js\"` or `import {{ tools }} from \"tools.js\"`. `tools[name]` and identifier wrappers like `await shell(args)` remain available for compatibility when the tool name is a valid JS identifier. Nested tool calls resolve to their code-mode result values. Import `set_max_output_tokens_per_exec_call` from `@openai/code_mode` to set the token budget used to truncate the final Rust-side result of the current `code_mode` execution; the default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker. Function tools require JSON object arguments. Freeform tools require raw strings. Use synchronous `add_content(value)` with a content item, content-item array, or string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`. Only content passed to `add_content(value)` is surfaced back to the model. Enabled nested tools: {enabled_list}."
+        "Runs JavaScript in a Node-backed `node:vm` context. This is a freeform tool: send raw JavaScript source text (no JSON/quotes/markdown fences). Direct tool calls remain available while `code_mode` is enabled. Inside JavaScript, import nested tools from `tools.js`, for example `import {{ exec_command }} from \"tools.js\"` or `import {{ tools }} from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import {{ append_notebook_logs_chart }} from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await shell(args)` remain available for compatibility when the tool name is a valid JS identifier. Nested tool calls resolve to their code-mode result values. Import `{{ output_text, output_image, set_max_output_tokens_per_exec_call }}` from `\"@openai/code_mode\"`; `output_text(value)` surfaces text back to the model and stringifies non-string objects when possible, `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs, and `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `code_mode` execution. The default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker. Function tools require JSON object arguments. Freeform tools require raw strings. `add_content(value)` remains available for compatibility with a content item, content-item array, or string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`. Only content passed to `output_text(...)`, `output_image(...)`, or `add_content(value)` is surfaced back to the model. Enabled nested tools: {enabled_list}."
     );
 
     ToolSpec::Freeform(FreeformTool {
diff --git a/codex-rs/core/tests/suite/code_mode.rs b/codex-rs/core/tests/suite/code_mode.rs
index 389c81bf4..4aca988ed 100644
--- a/codex-rs/core/tests/suite/code_mode.rs
+++ b/codex-rs/core/tests/suite/code_mode.rs
@@ -221,6 +221,113 @@ Total\ output\ lines:\ 1\n
     Ok(())
 }
 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_can_output_serialized_text_via_openai_code_mode_module() -> Result<()> {
+    skip_if_no_network!(Ok(())); // presumably returns Ok(()) early without network; TODO confirm
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use code_mode to return structured text",
+        r#"
+import { output_text } from "@openai/code_mode";
+
+output_text({ json: true });
+"#,
+        false,
+    )
+    .await?;
+
+    let req = second_mock.single_request(); // the call-1 tool output is read from this request
+    let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
+    assert_ne!(
+        success,
+        Some(false), // only an explicit failure flag fails here; None is accepted
+        "code_mode call failed unexpectedly: {output}"
+    );
+    assert_eq!(output, r#"{"json":true}"#); // the object argument comes back as compact JSON text
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_surfaces_output_text_stringify_errors() -> Result<()> {
+    skip_if_no_network!(Ok(())); // presumably returns Ok(()) early without network; TODO confirm
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use code_mode to return circular text",
+        r#"
+import { output_text } from "@openai/code_mode";
+
+const circular = {};
+circular.self = circular;
+output_text(circular);
+"#,
+        false,
+    )
+    .await?;
+
+    let req = second_mock.single_request(); // the call-1 tool output is read from this request
+    let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
+    assert_ne!(
+        success,
+        Some(true), // an explicit success flag fails here; None is accepted
+        "circular stringify unexpectedly succeeded"
+    );
+    assert!(output.contains("code_mode execution failed")); // failure wrapper text is expected
+    assert!(output.contains("Converting circular structure to JSON")); // JSON.stringify error text
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_can_output_images_via_openai_code_mode_module() -> Result<()> {
+    skip_if_no_network!(Ok(())); // presumably returns Ok(()) early without network; TODO confirm
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use code_mode to return images",
+        r#"
+import { output_image } from "@openai/code_mode";
+
+output_image("https://example.com/image.jpg");
+output_image("data:image/png;base64,AAA");
+"#,
+        false,
+    )
+    .await?;
+
+    let req = second_mock.single_request(); // the call-1 tool output is read from this request
+    let (_, success) = custom_tool_output_text_and_success(&req, "call-1"); // text part is ignored
+    assert_ne!(
+        success,
+        Some(false), // only an explicit failure flag fails here; None is accepted
+        "code_mode image output failed unexpectedly"
+    );
+    assert_eq!(
+        req.custom_tool_call_output("call-1"),
+        serde_json::json!({
+            "type": "custom_tool_call_output",
+            "call_id": "call-1",
+            "output": [ // one input_image item per output_image call, in call order
+                {
+                    "type": "input_image",
+                    "image_url": "https://example.com/image.jpg"
+                },
+                {
+                    "type": "input_image",
+                    "image_url": "data:image/png;base64,AAA"
+                }
+            ]
+        })
+    );
+
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn code_mode_can_apply_patch_via_nested_tool() -> Result<()> {
     skip_if_no_network!(Ok(()));