Add code_mode output helpers for text and images (#14244)

Summary
- document how code-mode can import `output_text`/`output_image` and
ensure `add_content` stays compatible
- add a synthetic `@openai/code_mode` module that appends content items
and validates inputs
- cover the new behavior with integration tests for structured text and
image outputs

Testing
- Not run (not requested)
This commit is contained in:
pakrym-oai 2026-03-10 16:25:27 -07:00 committed by Michael Bolin
parent ce1d9abf11
commit 07c22d20f6
5 changed files with 187 additions and 19 deletions

View file

@ -94,13 +94,13 @@ pub(crate) fn instructions(config: &Config) -> Option<String> {
section.push_str("- Direct tool calls remain available while `code_mode` is enabled.\n");
section.push_str("- `code_mode` uses the same Node runtime resolution as `js_repl`. If needed, point `js_repl_node_path` at the Node binary you want Codex to use.\n");
section.push_str("- Import nested tools from `tools.js`, for example `import { exec_command } from \"tools.js\"` or `import { tools } from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import { append_notebook_logs_chart } from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await exec_command(args)` remain available for compatibility. Nested tool calls resolve to their code-mode result values.\n");
section.push_str("- Import `set_max_output_tokens_per_exec_call` from `@openai/code_mode` to set the token budget used to truncate the final Rust-side result of the current `code_mode` execution. The default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker.\n");
section.push_str("- Import `{ output_text, output_image, set_max_output_tokens_per_exec_call }` from `@openai/code_mode`. `output_text(value)` surfaces text back to the model and stringifies non-string objects with `JSON.stringify(...)` when possible. `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs. `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `code_mode` execution; the default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker.\n");
section.push_str(
"- Function tools require JSON object arguments. Freeform tools require raw strings.\n",
);
section.push_str("- `add_content(value)` is synchronous. It accepts a content item, an array of content items, or a string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`.\n");
section.push_str("- `add_content(value)` remains available for compatibility. It is synchronous and accepts a content item, an array of content items, or a string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`.\n");
section
.push_str("- Only content passed to `add_content(value)` is surfaced back to the model.");
.push_str("- Only content passed to `output_text(...)`, `output_image(...)`, or `add_content(value)` is surfaced back to the model.");
Some(section)
}

View file

@ -1,6 +1,8 @@
const __codexEnabledTools = __CODE_MODE_ENABLED_TOOLS_PLACEHOLDER__;
const __codexEnabledToolNames = __codexEnabledTools.map((tool) => tool.tool_name);
const __codexContentItems = [];
const __codexContentItems = Array.isArray(globalThis.__codexContentItems)
? globalThis.__codexContentItems
: [];
function __codexCloneContentItem(item) {
if (!item || typeof item !== 'object') {

View file

@ -157,6 +157,78 @@ function createToolsModule(context, callTool, enabledTools) {
);
}
function ensureContentItems(context) {
if (!Array.isArray(context.__codexContentItems)) {
context.__codexContentItems = [];
}
return context.__codexContentItems;
}
function serializeOutputText(value) {
if (typeof value === 'string') {
return value;
}
if (
typeof value === 'undefined' ||
value === null ||
typeof value === 'boolean' ||
typeof value === 'number' ||
typeof value === 'bigint'
) {
return String(value);
}
const serialized = JSON.stringify(value);
if (typeof serialized === 'string') {
return serialized;
}
return String(value);
}
function normalizeOutputImageUrl(value) {
if (typeof value !== 'string' || !value) {
throw new TypeError('output_image expects a non-empty image URL string');
}
if (/^(?:https?:\/\/|data:)/i.test(value)) {
return value;
}
throw new TypeError('output_image expects an http(s) or data URL');
}
function createCodeModeModule(context, state) {
const outputText = (value) => {
const item = {
type: 'input_text',
text: serializeOutputText(value),
};
ensureContentItems(context).push(item);
return item;
};
const outputImage = (value) => {
const item = {
type: 'input_image',
image_url: normalizeOutputImageUrl(value),
};
ensureContentItems(context).push(item);
return item;
};
return new SyntheticModule(
['output_text', 'output_image', 'set_max_output_tokens_per_exec_call'],
function initCodeModeModule() {
this.setExport('output_text', outputText);
this.setExport('output_image', outputImage);
this.setExport('set_max_output_tokens_per_exec_call', (value) => {
const normalized = normalizeMaxOutputTokensPerExecCall(value);
state.maxOutputTokensPerExecCall = normalized;
return normalized;
});
},
{ context }
);
}
function namespacesMatch(left, right) {
if (left.length !== right.length) {
return false;
@ -210,20 +282,6 @@ function createNamespacedToolsModule(context, callTool, enabledTools, namespace)
);
}
function createCodeModeModule(context, state) {
return new SyntheticModule(
['set_max_output_tokens_per_exec_call'],
function initCodeModeModule() {
this.setExport('set_max_output_tokens_per_exec_call', (value) => {
const normalized = normalizeMaxOutputTokensPerExecCall(value);
state.maxOutputTokensPerExecCall = normalized;
return normalized;
});
},
{ context }
);
}
function createModuleResolver(context, callTool, enabledTools, state) {
const toolsModule = createToolsModule(context, callTool, enabledTools);
const codeModeModule = createCodeModeModule(context, state);
@ -285,6 +343,7 @@ async function main() {
};
const callTool = createToolCaller(protocol);
const context = vm.createContext({
__codexContentItems: [],
__codex_tool_call: callTool,
});

View file

@ -1620,7 +1620,7 @@ source: /[\s\S]+/
enabled_tool_names.join(", ")
};
let description = format!(
"Runs JavaScript in a Node-backed `node:vm` context. This is a freeform tool: send raw JavaScript source text (no JSON/quotes/markdown fences). Direct tool calls remain available while `code_mode` is enabled. Inside JavaScript, import nested tools from `tools.js`, for example `import {{ exec_command }} from \"tools.js\"` or `import {{ tools }} from \"tools.js\"`. `tools[name]` and identifier wrappers like `await shell(args)` remain available for compatibility when the tool name is a valid JS identifier. Nested tool calls resolve to their code-mode result values. Import `set_max_output_tokens_per_exec_call` from `@openai/code_mode` to set the token budget used to truncate the final Rust-side result of the current `code_mode` execution; the default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker. Function tools require JSON object arguments. Freeform tools require raw strings. Use synchronous `add_content(value)` with a content item, content-item array, or string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`. Only content passed to `add_content(value)` is surfaced back to the model. Enabled nested tools: {enabled_list}."
"Runs JavaScript in a Node-backed `node:vm` context. This is a freeform tool: send raw JavaScript source text (no JSON/quotes/markdown fences). Direct tool calls remain available while `code_mode` is enabled. Inside JavaScript, import nested tools from `tools.js`, for example `import {{ exec_command }} from \"tools.js\"` or `import {{ tools }} from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import {{ append_notebook_logs_chart }} from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await shell(args)` remain available for compatibility when the tool name is a valid JS identifier. Nested tool calls resolve to their code-mode result values. Import `{{ output_text, output_image, set_max_output_tokens_per_exec_call }}` from `\"@openai/code_mode\"`; `output_text(value)` surfaces text back to the model and stringifies non-string objects when possible, `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs, and `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `code_mode` execution. The default is `10000`. This guards the overall `code_mode` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker. Function tools require JSON object arguments. Freeform tools require raw strings. `add_content(value)` remains available for compatibility with a content item, content-item array, or string. Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`. Only content passed to `output_text(...)`, `output_image(...)`, or `add_content(value)` is surfaced back to the model. Enabled nested tools: {enabled_list}."
);
ToolSpec::Freeform(FreeformTool {

View file

@ -221,6 +221,113 @@ Total\ output\ lines:\ 1\n
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_output_serialized_text_via_openai_code_mode_module() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
&server,
"use code_mode to return structured text",
r#"
import { output_text } from "@openai/code_mode";
output_text({ json: true });
"#,
false,
)
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"code_mode call failed unexpectedly: {output}"
);
assert_eq!(output, r#"{"json":true}"#);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_surfaces_output_text_stringify_errors() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
&server,
"use code_mode to return circular text",
r#"
import { output_text } from "@openai/code_mode";
const circular = {};
circular.self = circular;
output_text(circular);
"#,
false,
)
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(true),
"circular stringify unexpectedly succeeded"
);
assert!(output.contains("code_mode execution failed"));
assert!(output.contains("Converting circular structure to JSON"));
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_output_images_via_openai_code_mode_module() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
&server,
"use code_mode to return images",
r#"
import { output_image } from "@openai/code_mode";
output_image("https://example.com/image.jpg");
output_image("data:image/png;base64,AAA");
"#,
false,
)
.await?;
let req = second_mock.single_request();
let (_, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"code_mode image output failed unexpectedly"
);
assert_eq!(
req.custom_tool_call_output("call-1"),
serde_json::json!({
"type": "custom_tool_call_output",
"call_id": "call-1",
"output": [
{
"type": "input_image",
"image_url": "https://example.com/image.jpg"
},
{
"type": "input_image",
"image_url": "data:image/png;base64,AAA"
}
]
})
);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_apply_patch_via_nested_tool() -> Result<()> {
skip_if_no_network!(Ok(()));