Support multimodal custom tool outputs (#12948)

## Summary

This changes `custom_tool_call_output` to use the same output payload
shape as `function_call_output`, so freeform tools can return either
plain text or structured content items.

The main goal is to let `js_repl` return image content from nested
`view_image` calls in its own `custom_tool_call_output`, instead of
relying on a separate injected message.

## What changed

- Changed `custom_tool_call_output.output` from `string` to
`FunctionCallOutputPayload`
- Updated freeform tool plumbing to preserve structured output bodies
- Updated `js_repl` to aggregate nested tool content items and attach
them to the outer `js_repl` result
- Removed the old `js_repl` special case that injected `view_image`
results as a separate pending user image message
- Updated normalization/history/truncation paths to handle multimodal
`custom_tool_call_output`
- Regenerated app-server protocol schema artifacts

## Behavior

Direct `view_image` calls still return a `function_call_output` with
image content.

When `view_image` is called inside `js_repl`, the outer `js_repl`
`custom_tool_call_output` now carries:
- an `input_text` item if the JS produced text output
- one or more `input_image` items from nested tool results

So the nested image result now stays inside the `js_repl` tool output
instead of being injected as a separate message.

## Compatibility

This is intended to be backward-compatible for resumed conversations.

Older histories that stored `custom_tool_call_output.output` as a plain
string still deserialize correctly, and older histories that used the
previous injected-image-message flow also continue to resume.

Added regression coverage for resuming a pre-change rollout containing:
- string-valued `custom_tool_call_output`
- legacy injected image message history


#### [git stack](https://github.com/magus/git-stack-cli)
- 👉 `1` https://github.com/openai/codex/pull/12948
This commit is contained in:
Curtis 'Fjord' Hawthorne 2026-02-26 18:17:46 -08:00 committed by GitHub
parent f90e97e414
commit 7e980d7db6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 688 additions and 177 deletions

View file

@ -1340,7 +1340,7 @@
"type": "string"
},
"output": {
"type": "string"
"$ref": "#/definitions/FunctionCallOutputPayload"
},
"type": {
"enum": [

View file

@ -4822,7 +4822,7 @@
"type": "string"
},
"output": {
"type": "string"
"$ref": "#/definitions/FunctionCallOutputPayload"
},
"type": {
"enum": [

View file

@ -11421,7 +11421,7 @@
"type": "string"
},
"output": {
"type": "string"
"$ref": "#/definitions/v2/FunctionCallOutputPayload"
},
"type": {
"enum": [

View file

@ -565,7 +565,7 @@
"type": "string"
},
"output": {
"type": "string"
"$ref": "#/definitions/FunctionCallOutputPayload"
},
"type": {
"enum": [

View file

@ -615,7 +615,7 @@
"type": "string"
},
"output": {
"type": "string"
"$ref": "#/definitions/FunctionCallOutputPayload"
},
"type": {
"enum": [

View file

@ -15,4 +15,4 @@ export type ResponseItem = { "type": "message", role: string, content: Array<Con
/**
* Set when using the Responses API.
*/
call_id: string | null, status: LocalShellStatus, action: LocalShellAction, } | { "type": "function_call", name: string, arguments: string, call_id: string, } | { "type": "function_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "custom_tool_call", status?: string, call_id: string, name: string, input: string, } | { "type": "custom_tool_call_output", call_id: string, output: string, } | { "type": "web_search_call", status?: string, action?: WebSearchAction, } | { "type": "ghost_snapshot", ghost_commit: GhostCommit, } | { "type": "compaction", encrypted_content: string, } | { "type": "other" };
call_id: string | null, status: LocalShellStatus, action: LocalShellAction, } | { "type": "function_call", name: string, arguments: string, call_id: string, } | { "type": "function_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "custom_tool_call", status?: string, call_id: string, name: string, input: string, } | { "type": "custom_tool_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "web_search_call", status?: string, action?: WebSearchAction, } | { "type": "ghost_snapshot", ghost_commit: GhostCommit, } | { "type": "compaction", encrypted_content: string, } | { "type": "other" };

View file

@ -84,19 +84,13 @@ fn reserialize_shell_outputs(items: &mut [ResponseItem]) {
shell_call_ids.insert(call_id.clone());
}
}
ResponseItem::CustomToolCallOutput { call_id, output } => {
if shell_call_ids.remove(call_id)
&& let Some(structured) = parse_structured_shell_output(output)
{
*output = structured
}
}
ResponseItem::FunctionCall { name, call_id, .. }
if is_shell_tool_name(name) || name == "apply_patch" =>
{
shell_call_ids.insert(call_id.clone());
}
ResponseItem::FunctionCallOutput { call_id, output } => {
ResponseItem::FunctionCallOutput { call_id, output }
| ResponseItem::CustomToolCallOutput { call_id, output } => {
if shell_call_ids.remove(call_id)
&& let Some(structured) = output
.text_content()
@ -240,6 +234,7 @@ mod tests {
use codex_api::common::OpenAiVerbosity;
use codex_api::common::TextControls;
use codex_api::create_text_param_for_request;
use codex_protocol::models::FunctionCallOutputPayload;
use pretty_assertions::assert_eq;
use super::*;
@ -343,4 +338,62 @@ mod tests {
let v = serde_json::to_value(&req).expect("json");
assert!(v.get("text").is_none());
}
// Feeds `reserialize_shell_outputs` a `shell` function call and an `apply_patch` custom tool
// call whose outputs both carry the same JSON-encoded payload, then checks that both outputs
// are rewritten to the same formatted text while the call items themselves are left untouched.
#[test]
fn reserializes_shell_outputs_for_function_and_custom_tool_calls() {
// JSON payload used as the raw output of both calls before reserialization.
let raw_output = r#"{"output":"hello","metadata":{"exit_code":0,"duration_seconds":0.5}}"#;
// Text form that both outputs are expected to hold afterwards.
let expected_output = "Exit code: 0\nWall time: 0.5 seconds\nOutput:\nhello";
let mut items = vec![
ResponseItem::FunctionCall {
id: None,
name: "shell".to_string(),
arguments: "{}".to_string(),
call_id: "call-1".to_string(),
},
ResponseItem::FunctionCallOutput {
call_id: "call-1".to_string(),
output: FunctionCallOutputPayload::from_text(raw_output.to_string()),
},
ResponseItem::CustomToolCall {
id: None,
status: None,
call_id: "call-2".to_string(),
name: "apply_patch".to_string(),
input: "*** Begin Patch".to_string(),
},
ResponseItem::CustomToolCallOutput {
call_id: "call-2".to_string(),
output: FunctionCallOutputPayload::from_text(raw_output.to_string()),
},
];
reserialize_shell_outputs(&mut items);
// Only the two output items should differ from the input: their text is now `expected_output`.
assert_eq!(
items,
vec![
ResponseItem::FunctionCall {
id: None,
name: "shell".to_string(),
arguments: "{}".to_string(),
call_id: "call-1".to_string(),
},
ResponseItem::FunctionCallOutput {
call_id: "call-1".to_string(),
output: FunctionCallOutputPayload::from_text(expected_output.to_string()),
},
ResponseItem::CustomToolCall {
id: None,
status: None,
call_id: "call-2".to_string(),
name: "apply_patch".to_string(),
input: "*** Begin Patch".to_string(),
},
ResponseItem::CustomToolCallOutput {
call_id: "call-2".to_string(),
output: FunctionCallOutputPayload::from_text(expected_output.to_string()),
},
]
);
}
}

View file

@ -344,32 +344,21 @@ impl ContextManager {
let policy_with_serialization_budget = policy * 1.2;
match item {
ResponseItem::FunctionCallOutput { call_id, output } => {
let body = match &output.body {
FunctionCallOutputBody::Text(content) => FunctionCallOutputBody::Text(
truncate_text(content, policy_with_serialization_budget),
),
FunctionCallOutputBody::ContentItems(items) => {
FunctionCallOutputBody::ContentItems(
truncate_function_output_items_with_policy(
items,
policy_with_serialization_budget,
),
)
}
};
ResponseItem::FunctionCallOutput {
call_id: call_id.clone(),
output: FunctionCallOutputPayload {
body,
success: output.success,
},
output: truncate_function_output_payload(
output,
policy_with_serialization_budget,
),
}
}
ResponseItem::CustomToolCallOutput { call_id, output } => {
let truncated = truncate_text(output, policy_with_serialization_budget);
ResponseItem::CustomToolCallOutput {
call_id: call_id.clone(),
output: truncated,
output: truncate_function_output_payload(
output,
policy_with_serialization_budget,
),
}
}
ResponseItem::Message { .. }
@ -385,6 +374,25 @@ impl ContextManager {
}
}
/// Builds a truncated copy of `output` according to `policy`.
///
/// A text body is passed through `truncate_text`; a content-item body is passed through
/// `truncate_function_output_items_with_policy`. The `success` flag is copied over as-is.
fn truncate_function_output_payload(
    output: &FunctionCallOutputPayload,
    policy: TruncationPolicy,
) -> FunctionCallOutputPayload {
    let truncated_body = match &output.body {
        FunctionCallOutputBody::Text(text) => {
            let shortened = truncate_text(text, policy);
            FunctionCallOutputBody::Text(shortened)
        }
        FunctionCallOutputBody::ContentItems(content_items) => {
            let shortened = truncate_function_output_items_with_policy(content_items, policy);
            FunctionCallOutputBody::ContentItems(shortened)
        }
    };

    FunctionCallOutputPayload {
        success: output.success,
        body: truncated_body,
    }
}
/// API messages include every non-system item (user/assistant messages, reasoning,
/// tool calls, tool outputs, shell calls, and web-search calls).
fn is_api_message(message: &ResponseItem) -> bool {
@ -508,7 +516,8 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
}
}
}
ResponseItem::FunctionCallOutput { output, .. } => {
ResponseItem::FunctionCallOutput { output, .. }
| ResponseItem::CustomToolCallOutput { output, .. } => {
if let FunctionCallOutputBody::ContentItems(items) = &output.body {
for content_item in items {
if let FunctionCallOutputContentItem::InputImage { image_url } = content_item {

View file

@ -67,7 +67,7 @@ fn user_input_text_msg(text: &str) -> ResponseItem {
fn custom_tool_call_output(call_id: &str, output: &str) -> ResponseItem {
ResponseItem::CustomToolCallOutput {
call_id: call_id.to_string(),
output: output.to_string(),
output: FunctionCallOutputPayload::from_text(output.to_string()),
}
}
@ -279,6 +279,24 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
},
]),
},
ResponseItem::CustomToolCall {
id: None,
status: None,
call_id: "tool-1".to_string(),
name: "js_repl".to_string(),
input: "view_image".to_string(),
},
ResponseItem::CustomToolCallOutput {
call_id: "tool-1".to_string(),
output: FunctionCallOutputPayload::from_content_items(vec![
FunctionCallOutputContentItem::InputText {
text: "js repl result".to_string(),
},
FunctionCallOutputContentItem::InputImage {
image_url: "https://example.com/js-repl-result.png".to_string(),
},
]),
},
];
let history = create_history_with_items(items);
let text_only_modalities = vec![InputModality::Text];
@ -321,6 +339,25 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
},
]),
},
ResponseItem::CustomToolCall {
id: None,
status: None,
call_id: "tool-1".to_string(),
name: "js_repl".to_string(),
input: "view_image".to_string(),
},
ResponseItem::CustomToolCallOutput {
call_id: "tool-1".to_string(),
output: FunctionCallOutputPayload::from_content_items(vec![
FunctionCallOutputContentItem::InputText {
text: "js repl result".to_string(),
},
FunctionCallOutputContentItem::InputText {
text: "image content omitted because you do not support image input"
.to_string(),
},
]),
},
];
assert_eq!(stripped, expected);
@ -671,7 +708,7 @@ fn remove_first_item_handles_custom_tool_pair() {
},
ResponseItem::CustomToolCallOutput {
call_id: "tool-1".to_string(),
output: "ok".to_string(),
output: FunctionCallOutputPayload::from_text("ok".to_string()),
},
];
let mut h = create_history_with_items(items);
@ -750,7 +787,7 @@ fn record_items_truncates_custom_tool_call_output_content() {
let long_output = line.repeat(2_500);
let item = ResponseItem::CustomToolCallOutput {
call_id: "tool-200".to_string(),
output: long_output.clone(),
output: FunctionCallOutputPayload::from_text(long_output.clone()),
};
history.record_items([&item], policy);
@ -758,7 +795,8 @@ fn record_items_truncates_custom_tool_call_output_content() {
assert_eq!(history.items.len(), 1);
match &history.items[0] {
ResponseItem::CustomToolCallOutput { output, .. } => {
assert_ne!(output, &long_output);
let output = output.text_content().unwrap_or_default();
assert_ne!(output, long_output);
assert!(
output.contains("tokens truncated"),
"expected token-based truncation marker, got {output}"
@ -949,7 +987,7 @@ fn normalize_adds_missing_output_for_custom_tool_call() {
},
ResponseItem::CustomToolCallOutput {
call_id: "tool-x".to_string(),
output: "aborted".to_string(),
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
},
]
);
@ -1016,7 +1054,7 @@ fn normalize_removes_orphan_function_call_output() {
fn normalize_removes_orphan_custom_tool_call_output() {
let items = vec![ResponseItem::CustomToolCallOutput {
call_id: "orphan-2".to_string(),
output: "ok".to_string(),
output: FunctionCallOutputPayload::from_text("ok".to_string()),
}];
let mut h = create_history_with_items(items);
@ -1089,7 +1127,7 @@ fn normalize_mixed_inserts_and_removals() {
},
ResponseItem::CustomToolCallOutput {
call_id: "t1".to_string(),
output: "aborted".to_string(),
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
},
ResponseItem::LocalShellCall {
id: None,
@ -1191,7 +1229,7 @@ fn normalize_removes_orphan_function_call_output_panics_in_debug() {
fn normalize_removes_orphan_custom_tool_call_output_panics_in_debug() {
let items = vec![ResponseItem::CustomToolCallOutput {
call_id: "orphan-2".to_string(),
output: "ok".to_string(),
output: FunctionCallOutputPayload::from_text("ok".to_string()),
}];
let mut h = create_history_with_items(items);
h.normalize_history(&default_input_modalities());
@ -1294,6 +1332,28 @@ fn image_data_url_payload_does_not_dominate_function_call_output_estimate() {
assert!(estimated < raw_len);
}
// Checks the size estimate for a `CustomToolCallOutput` that carries a base64 data-URL image:
// the estimate must equal the serialized length with the base64 payload swapped for
// `IMAGE_BYTES_ESTIMATE`, and must be smaller than the raw serialized length.
#[test]
fn image_data_url_payload_does_not_dominate_custom_tool_call_output_estimate() {
// 50,000 characters of payload embedded in a `data:image/png;base64,` URL.
let payload = "C".repeat(50_000);
let image_url = format!("data:image/png;base64,{payload}");
let item = ResponseItem::CustomToolCallOutput {
call_id: "call-js-repl".to_string(),
output: FunctionCallOutputPayload::from_content_items(vec![
FunctionCallOutputContentItem::InputText {
text: "Screenshot captured".to_string(),
},
FunctionCallOutputContentItem::InputImage { image_url },
]),
};
let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
let estimated = estimate_response_item_model_visible_bytes(&item);
// Expected value: raw JSON length minus the payload bytes plus the fixed per-image estimate.
let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE;
assert_eq!(estimated, expected);
assert!(estimated < raw_len);
}
#[test]
fn non_base64_image_urls_are_unchanged() {
let message_item = ResponseItem::Message {

View file

@ -1,7 +1,6 @@
use std::collections::HashSet;
use codex_protocol::models::ContentItem;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ResponseItem;
@ -35,10 +34,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
idx,
ResponseItem::FunctionCallOutput {
call_id: call_id.clone(),
output: FunctionCallOutputPayload {
body: FunctionCallOutputBody::Text("aborted".to_string()),
..Default::default()
},
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
},
));
}
@ -59,7 +55,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
idx,
ResponseItem::CustomToolCallOutput {
call_id: call_id.clone(),
output: "aborted".to_string(),
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
},
));
}
@ -82,10 +78,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
idx,
ResponseItem::FunctionCallOutput {
call_id: call_id.clone(),
output: FunctionCallOutputPayload {
body: FunctionCallOutputBody::Text("aborted".to_string()),
..Default::default()
},
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
},
));
}
@ -245,7 +238,8 @@ pub(crate) fn strip_images_when_unsupported(
}
*content = normalized_content;
}
ResponseItem::FunctionCallOutput { output, .. } => {
ResponseItem::FunctionCallOutput { output, .. }
| ResponseItem::CustomToolCallOutput { output, .. } => {
if let Some(content_items) = output.content_items_mut() {
let mut normalized_content_items = Vec::with_capacity(content_items.len());
for content_item in content_items.iter() {

View file

@ -95,15 +95,12 @@ impl ToolOutput {
match self {
ToolOutput::Function { body, success } => {
// `custom_tool_call` is the Responses API item type for freeform
// tools (`ToolSpec::Freeform`, e.g. freeform `apply_patch`).
// Those payloads must round-trip as `custom_tool_call_output`
// with plain string output.
// tools (`ToolSpec::Freeform`, e.g. freeform `apply_patch` or
// `js_repl`).
if matches!(payload, ToolPayload::Custom { .. }) {
// Freeform/custom tools (`custom_tool_call`) use the custom
// output wire shape and remain string-only.
return ResponseInputItem::CustomToolCallOutput {
call_id: call_id.to_string(),
output: body.to_text().unwrap_or_default(),
output: FunctionCallOutputPayload { body, success },
};
}
@ -183,7 +180,9 @@ mod tests {
match response {
ResponseInputItem::CustomToolCallOutput { call_id, output } => {
assert_eq!(call_id, "call-42");
assert_eq!(output, "patched");
assert_eq!(output.text_content(), Some("patched"));
assert!(output.content_items().is_none());
assert_eq!(output.success, Some(true));
}
other => panic!("expected CustomToolCallOutput, got {other:?}"),
}
@ -234,8 +233,21 @@ mod tests {
match response {
ResponseInputItem::CustomToolCallOutput { call_id, output } => {
let expected = vec![
FunctionCallOutputContentItem::InputText {
text: "line 1".to_string(),
},
FunctionCallOutputContentItem::InputImage {
image_url: "data:image/png;base64,AAA".to_string(),
},
FunctionCallOutputContentItem::InputText {
text: "line 2".to_string(),
},
];
assert_eq!(call_id, "call-99");
assert_eq!(output, "line 1\nline 2");
assert_eq!(output.content_items(), Some(expected.as_slice()));
assert_eq!(output.body.to_text().as_deref(), Some("line 1\nline 2"));
assert_eq!(output.success, Some(true));
}
other => panic!("expected CustomToolCallOutput, got {other:?}"),
}

View file

@ -155,9 +155,13 @@ impl ToolHandler for JsReplHandler {
};
let content = result.output;
let items = vec![FunctionCallOutputContentItem::InputText {
text: content.clone(),
}];
let mut items = Vec::with_capacity(result.content_items.len() + 1);
if !content.is_empty() {
items.push(FunctionCallOutputContentItem::InputText {
text: content.clone(),
});
}
items.extend(result.content_items);
emit_js_repl_exec_end(
session.as_ref(),
@ -170,7 +174,11 @@ impl ToolHandler for JsReplHandler {
.await;
Ok(ToolOutput::Function {
body: FunctionCallOutputBody::ContentItems(items),
body: if items.is_empty() {
FunctionCallOutputBody::Text(content)
} else {
FunctionCallOutputBody::ContentItems(items)
},
success: Some(true),
})
}

View file

@ -104,6 +104,7 @@ pub struct JsReplArgs {
#[derive(Clone, Debug)]
pub struct JsExecResult {
pub output: String,
pub content_items: Vec<FunctionCallOutputContentItem>,
}
struct KernelState {
@ -125,6 +126,7 @@ struct ExecContext {
#[derive(Default)]
struct ExecToolCalls {
in_flight: usize,
content_items: Vec<FunctionCallOutputContentItem>,
notify: Arc<Notify>,
cancel: CancellationToken,
}
@ -136,6 +138,7 @@ enum JsReplToolCallPayloadKind {
FunctionText,
FunctionContentItems,
CustomText,
CustomContentItems,
McpResult,
McpErrorResult,
Error,
@ -369,6 +372,21 @@ impl JsReplManager {
Some(state.cancel.clone())
}
/// Appends `content_items` to the entry stored under `exec_id` in `exec_tool_calls`.
///
/// An empty `content_items` is a no-op and does not take the lock. If no entry exists for
/// `exec_id`, the items are dropped.
async fn record_exec_tool_call_content_items(
    exec_tool_calls: &Arc<Mutex<HashMap<String, ExecToolCalls>>>,
    exec_id: &str,
    content_items: Vec<FunctionCallOutputContentItem>,
) {
    if !content_items.is_empty() {
        let mut guard = exec_tool_calls.lock().await;
        if let Some(entry) = guard.get_mut(exec_id) {
            entry.content_items.extend(content_items);
        }
    }
}
async fn finish_exec_tool_call(
exec_tool_calls: &Arc<Mutex<HashMap<String, ExecToolCalls>>>,
exec_id: &str,
@ -592,11 +610,18 @@ impl JsReplManager {
output,
)
}
ResponseInputItem::CustomToolCallOutput { output, .. } => Self::summarize_text_payload(
Some("custom_tool_call_output"),
JsReplToolCallPayloadKind::CustomText,
output,
),
ResponseInputItem::CustomToolCallOutput { output, .. } => {
let payload_kind = if output.content_items().is_some() {
JsReplToolCallPayloadKind::CustomContentItems
} else {
JsReplToolCallPayloadKind::CustomText
};
Self::summarize_function_output_payload(
"custom_tool_call_output",
payload_kind,
output,
)
}
ResponseInputItem::McpToolCallOutput { result, .. } => match result {
Ok(result) => {
let output = FunctionCallOutputPayload::from(result);
@ -769,7 +794,13 @@ impl JsReplManager {
};
match response {
ExecResultMessage::Ok { output } => Ok(JsExecResult { output }),
ExecResultMessage::Ok { content_items } => {
let (output, content_items) = split_exec_result_content_items(content_items);
Ok(JsExecResult {
output,
content_items,
})
}
ExecResultMessage::Err { message } => Err(FunctionCallError::RespondToModel(message)),
}
}
@ -1073,10 +1104,22 @@ impl JsReplManager {
error,
} => {
JsReplManager::wait_for_exec_tool_calls_map(&exec_tool_calls, &id).await;
let content_items = {
let calls = exec_tool_calls.lock().await;
calls
.get(&id)
.map(|state| state.content_items.clone())
.unwrap_or_default()
};
let mut pending = pending_execs.lock().await;
if let Some(tx) = pending.remove(&id) {
let payload = if ok {
ExecResultMessage::Ok { output }
ExecResultMessage::Ok {
content_items: build_exec_result_content_items(
output,
content_items,
),
}
} else {
ExecResultMessage::Err {
message: error
@ -1133,7 +1176,11 @@ impl JsReplManager {
response: None,
error: Some("js_repl execution reset".to_string()),
},
result = JsReplManager::run_tool_request(ctx, req) => result,
result = JsReplManager::run_tool_request(
ctx,
req,
Arc::clone(&exec_tool_calls_for_task),
) => result,
}
}
None => RunToolResult {
@ -1227,7 +1274,11 @@ impl JsReplManager {
}
}
async fn run_tool_request(exec: ExecContext, req: RunToolRequest) -> RunToolResult {
async fn run_tool_request(
exec: ExecContext,
req: RunToolRequest,
exec_tool_calls: Arc<Mutex<HashMap<String, ExecToolCalls>>>,
) -> RunToolResult {
if is_js_repl_internal_tool(&req.tool_name) {
let error = "js_repl cannot invoke itself".to_string();
let summary = Self::summarize_tool_call_error(&error);
@ -1300,39 +1351,13 @@ impl JsReplManager {
.await
{
Ok(response) => {
if let ResponseInputItem::FunctionCallOutput { output, .. } = &response
&& let Some(items) = output.content_items()
{
let mut has_image = false;
let mut content = Vec::with_capacity(items.len());
for item in items {
match item {
FunctionCallOutputContentItem::InputText { text } => {
content.push(ContentItem::InputText { text: text.clone() });
}
FunctionCallOutputContentItem::InputImage { image_url } => {
has_image = true;
content.push(ContentItem::InputImage {
image_url: image_url.clone(),
});
}
}
}
if has_image
&& session
.inject_response_items(vec![ResponseInputItem::Message {
role: "user".to_string(),
content,
}])
.await
.is_err()
{
warn!(
tool_name = %tool_name,
"js_repl tool call returned image content but there was no active turn to attach it to"
);
}
if let Some(items) = response_content_items(&response) {
Self::record_exec_tool_call_content_items(
&exec_tool_calls,
&req.exec_id,
items,
)
.await;
}
let summary = Self::summarize_tool_call_response(&response);
@ -1407,6 +1432,50 @@ impl JsReplManager {
}
}
/// Returns an owned copy of the structured content items carried by `response`, if any.
///
/// Function and custom tool outputs are read directly from their payload. A successful MCP
/// result is first converted into a `FunctionCallOutputPayload`. MCP errors and plain messages
/// yield `None`, as does any payload whose `content_items()` is `None`.
fn response_content_items(
    response: &ResponseInputItem,
) -> Option<Vec<FunctionCallOutputContentItem>> {
    match response {
        ResponseInputItem::FunctionCallOutput { output, .. }
        | ResponseInputItem::CustomToolCallOutput { output, .. } => {
            output.content_items().map(|items| items.to_vec())
        }
        ResponseInputItem::McpToolCallOutput {
            result: Ok(result), ..
        } => FunctionCallOutputPayload::from(result)
            .content_items()
            .map(|items| items.to_vec()),
        ResponseInputItem::McpToolCallOutput { result: Err(_), .. }
        | ResponseInputItem::Message { .. } => None,
    }
}
/// Produces the content-item list for a successful exec result: one `InputText` item holding
/// `output` (even when it is empty), followed by `content_items` in their original order.
fn build_exec_result_content_items(
    output: String,
    content_items: Vec<FunctionCallOutputContentItem>,
) -> Vec<FunctionCallOutputContentItem> {
    let leading_text = FunctionCallOutputContentItem::InputText { text: output };
    std::iter::once(leading_text).chain(content_items).collect()
}
/// Splits an exec result's content items into its leading text and the remaining items.
///
/// When the first item is `InputText`, it is removed and its text is returned alongside the
/// rest. Otherwise (first item is an image, or the list is empty) the text is an empty string
/// and the list is returned unchanged.
fn split_exec_result_content_items(
    mut content_items: Vec<FunctionCallOutputContentItem>,
) -> (String, Vec<FunctionCallOutputContentItem>) {
    let starts_with_text = match content_items.first() {
        Some(FunctionCallOutputContentItem::InputText { .. }) => true,
        Some(FunctionCallOutputContentItem::InputImage { .. }) | None => false,
    };
    if !starts_with_text {
        return (String::new(), content_items);
    }

    // The check above guarantees index 0 exists and is an `InputText` item.
    let FunctionCallOutputContentItem::InputText { text } = content_items.remove(0) else {
        unreachable!("first content item should be input_text");
    };
    (text, content_items)
}
fn is_freeform_tool(specs: &[ToolSpec], name: &str) -> bool {
specs
.iter()
@ -1462,8 +1531,12 @@ struct RunToolResult {
#[derive(Debug)]
enum ExecResultMessage {
Ok { output: String },
Err { message: String },
Ok {
content_items: Vec<FunctionCallOutputContentItem>,
},
Err {
message: String,
},
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@ -1601,7 +1674,6 @@ mod tests {
use codex_protocol::dynamic_tools::DynamicToolCallOutputContentItem;
use codex_protocol::dynamic_tools::DynamicToolResponse;
use codex_protocol::dynamic_tools::DynamicToolSpec;
use codex_protocol::models::ContentItem;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ResponseInputItem;
@ -1850,6 +1922,35 @@ mod tests {
);
}
// Summarizes a `custom_tool_call_output` whose payload is a single image content item and
// checks the resulting summary: kind is `CustomContentItems`, item counts are 1 total / 0 text /
// 1 image, and the text preview and length fields are `None`.
#[test]
fn summarize_tool_call_response_for_multimodal_custom_output() {
let response = ResponseInputItem::CustomToolCallOutput {
call_id: "call-1".to_string(),
output: FunctionCallOutputPayload::from_content_items(vec![
FunctionCallOutputContentItem::InputImage {
image_url: "data:image/png;base64,abcd".to_string(),
},
]),
};
let actual = JsReplManager::summarize_tool_call_response(&response);
assert_eq!(
actual,
JsReplToolCallResponseSummary {
response_type: Some("custom_tool_call_output".to_string()),
payload_kind: Some(JsReplToolCallPayloadKind::CustomContentItems),
payload_text_preview: None,
payload_text_length: None,
payload_item_count: Some(1),
text_item_count: Some(0),
image_item_count: Some(1),
structured_content_present: None,
result_is_error: None,
}
);
}
#[test]
fn summarize_tool_call_error_marks_error_payload() {
let actual = JsReplManager::summarize_tool_call_error("tool failed");
@ -2310,20 +2411,22 @@ console.log(out.output?.body?.text ?? "");
)
.await?;
assert!(result.output.contains("function_call_output"));
let pending_input = session.get_pending_input().await;
let [ResponseInputItem::Message { role, content }] = pending_input.as_slice() else {
panic!(
"view_image should inject exactly one pending input message, got {pending_input:?}"
);
};
assert_eq!(role, "user");
let [ContentItem::InputImage { image_url }] = content.as_slice() else {
panic!(
"view_image should inject exactly one input_image content item, got {content:?}"
);
assert_eq!(
result.content_items.as_slice(),
[FunctionCallOutputContentItem::InputImage {
image_url:
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg=="
.to_string(),
}]
.as_slice()
);
let [FunctionCallOutputContentItem::InputImage { image_url }] =
result.content_items.as_slice()
else {
panic!("view_image should return exactly one input_image content item");
};
assert!(image_url.starts_with("data:image/png;base64,"));
assert!(session.get_pending_input().await.is_empty());
Ok(())
}
@ -2404,22 +2507,18 @@ console.log(out.type);
response_watcher_result?;
let result = result?;
assert!(result.output.contains("function_call_output"));
let pending_input = session.get_pending_input().await;
assert_eq!(
pending_input,
vec![ResponseInputItem::Message {
role: "user".to_string(),
content: vec![
ContentItem::InputText {
text: "inline image note".to_string(),
},
ContentItem::InputImage {
image_url: image_url.to_string(),
},
],
}]
result.content_items,
vec![
FunctionCallOutputContentItem::InputText {
text: "inline image note".to_string(),
},
FunctionCallOutputContentItem::InputImage {
image_url: image_url.to_string(),
},
]
);
assert!(session.get_pending_input().await.is_empty());
Ok(())
}

View file

@ -117,7 +117,10 @@ impl ToolCallRuntime {
match &call.payload {
ToolPayload::Custom { .. } => ResponseInputItem::CustomToolCallOutput {
call_id: call.call_id.clone(),
output: Self::abort_message(call, secs),
output: FunctionCallOutputPayload {
body: FunctionCallOutputBody::Text(Self::abort_message(call, secs)),
..Default::default()
},
},
ToolPayload::Mcp { .. } => ResponseInputItem::McpToolCallOutput {
call_id: call.call_id.clone(),

View file

@ -197,7 +197,10 @@ impl ToolRouter {
if payload_outputs_custom {
ResponseInputItem::CustomToolCallOutput {
call_id,
output: message,
output: codex_protocol::models::FunctionCallOutputPayload {
body: FunctionCallOutputBody::Text(message),
success: Some(false),
},
}
} else {
ResponseInputItem::FunctionCallOutput {

View file

@ -264,7 +264,7 @@ impl ResponsesRequest {
.cloned()
.unwrap_or(Value::Null);
match output {
Value::String(text) => Some((Some(text), None)),
Value::String(_) | Value::Array(_) => Some((output_value_to_text(&output), None)),
Value::Object(obj) => Some((
obj.get("content")
.and_then(Value::as_str)
@ -296,6 +296,87 @@ impl ResponsesRequest {
}
}
/// Extracts plain text from a tool-call `output` JSON value.
///
/// Returns the string itself for a JSON string, or the `text` field when the value is an array
/// holding exactly one item whose `type` is `"input_text"`. Every other shape (empty or
/// multi-item arrays, a single non-text item, objects, numbers, booleans, null) yields `None`.
pub(crate) fn output_value_to_text(value: &Value) -> Option<String> {
    match value {
        Value::String(text) => Some(text.clone()),
        Value::Array(items) => {
            let [only_item] = items.as_slice() else {
                return None;
            };
            let item_type = only_item.get("type").and_then(Value::as_str);
            if item_type != Some("input_text") {
                return None;
            }
            only_item
                .get("text")
                .and_then(Value::as_str)
                .map(str::to_string)
        }
        Value::Object(_) | Value::Number(_) | Value::Bool(_) | Value::Null => None,
    }
}
// Unit tests for how `ResponsesRequest` extracts text from tool-call outputs that are encoded
// as content-item arrays.
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
use wiremock::http::HeaderMap;
use wiremock::http::Method;
// Builds a `ResponsesRequest` around a POST request to a localhost URL, with no headers and a
// JSON body of the form `{ "input": <input> }`.
fn request_with_input(input: Value) -> ResponsesRequest {
ResponsesRequest(wiremock::Request {
url: "http://localhost/v1/responses"
.parse()
.expect("valid request url"),
method: Method::POST,
headers: HeaderMap::new(),
body: serde_json::to_vec(&serde_json::json!({ "input": input }))
.expect("serialize request body"),
})
}
// An output array is reported as text only when it holds exactly one `input_text` item;
// arrays that mix text with an image, or hold only an image, report `None` for the content.
#[test]
fn call_output_content_and_success_returns_only_single_text_content_item() {
// Case 1: each output is a one-element array containing a single `input_text` item.
let single_text = request_with_input(serde_json::json!([
{
"type": "function_call_output",
"call_id": "call-1",
"output": [{ "type": "input_text", "text": "hello" }]
},
{
"type": "custom_tool_call_output",
"call_id": "call-2",
"output": [{ "type": "input_text", "text": "world" }]
}
]));
assert_eq!(
single_text.function_call_output_content_and_success("call-1"),
Some((Some("hello".to_string()), None))
);
assert_eq!(
single_text.custom_tool_call_output_content_and_success("call-2"),
Some((Some("world".to_string()), None))
);
// Case 2: outputs that include an image item (text + image, and image only).
let mixed_content = request_with_input(serde_json::json!([
{
"type": "function_call_output",
"call_id": "call-3",
"output": [
{ "type": "input_text", "text": "hello" },
{ "type": "input_image", "image_url": "data:image/png;base64,abc" }
]
},
{
"type": "custom_tool_call_output",
"call_id": "call-4",
"output": [{ "type": "input_image", "image_url": "data:image/png;base64,abc" }]
}
]));
assert_eq!(
mixed_content.function_call_output_content_and_success("call-3"),
Some((None, None))
);
assert_eq!(
mixed_content.custom_tool_call_output_content_and_success("call-4"),
Some((None, None))
);
}
}
#[derive(Debug, Clone)]
pub struct WebSocketRequest {
body: Value,

View file

@ -23,6 +23,7 @@ use wiremock::MockServer;
use crate::load_default_config_for_test;
use crate::responses::WebSocketTestServer;
use crate::responses::output_value_to_text;
use crate::responses::start_mock_server;
use crate::streaming_sse::StreamingSseServer;
use crate::wait_for_event;
@ -394,11 +395,7 @@ impl TestCodexHarness {
pub async fn custom_tool_call_output(&self, call_id: &str) -> String {
let bodies = self.request_bodies().await;
custom_tool_call_output(&bodies, call_id)
.get("output")
.and_then(Value::as_str)
.expect("output string")
.to_string()
custom_tool_call_output_text(&bodies, call_id)
}
pub async fn apply_patch_output(
@ -433,6 +430,14 @@ fn custom_tool_call_output<'a>(bodies: &'a [Value], call_id: &str) -> &'a Value
panic!("custom_tool_call_output {call_id} not found");
}
/// Extracts the text of the `custom_tool_call_output` item matching `call_id`.
///
/// Looks the item up in `bodies`, reads its `output` field, and converts that
/// value to text with `output_value_to_text`.
///
/// # Panics
///
/// Panics when the item has no `output` field, or when `output_value_to_text`
/// yields no text for it.
fn custom_tool_call_output_text(bodies: &[Value], call_id: &str) -> String {
    let item = custom_tool_call_output(bodies, call_id);

    let Some(payload) = item.get("output") else {
        panic!("custom_tool_call_output {call_id} missing output");
    };

    match output_value_to_text(payload) {
        Some(text) => text,
        None => panic!("custom_tool_call_output {call_id} missing text output"),
    }
}
fn function_call_output<'a>(bodies: &'a [Value], call_id: &str) -> &'a Value {
for body in bodies {
if let Some(items) = body.get("input").and_then(Value::as_array) {
@ -456,3 +461,36 @@ pub fn test_codex() -> TestCodexBuilder {
home: None,
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use serde_json::json;

    /// A string-valued `output` comes back as that same text.
    #[test]
    fn custom_tool_call_output_text_returns_output_text() {
        let request_body = json!({
            "input": [{
                "type": "custom_tool_call_output",
                "call_id": "call-1",
                "output": "hello"
            }]
        });
        let bodies = [request_body];

        let text = custom_tool_call_output_text(&bodies, "call-1");

        assert_eq!(text, "hello");
    }

    /// An item without any `output` field triggers the "missing output" panic.
    #[test]
    #[should_panic(expected = "custom_tool_call_output call-2 missing output")]
    fn custom_tool_call_output_text_panics_when_output_is_missing() {
        let request_body = json!({
            "input": [{
                "type": "custom_tool_call_output",
                "call_id": "call-2"
            }]
        });
        let bodies = [request_body];

        let _ = custom_tool_call_output_text(&bodies, "call-2");
    }
}

View file

@ -35,6 +35,10 @@ use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::openai_models::ReasoningEffort;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::Op;
use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::RolloutLine;
use codex_protocol::protocol::SessionMeta;
use codex_protocol::protocol::SessionMetaLine;
use codex_protocol::protocol::SessionSource;
use codex_protocol::user_input::UserInput;
use core_test_support::apps_test_server::AppsTestServer;
@ -344,6 +348,144 @@ async fn resume_includes_initial_messages_and_sends_prior_items() {
assert!(pos_environment < pos_new_user);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn resume_replays_legacy_js_repl_image_rollout_shapes() {
skip_if_no_network!();
// Early js_repl builds persisted image tool results as two separate rollout items:
// a string-valued custom_tool_call_output plus a standalone user input_image message.
// Current image tests cover today's shapes; this keeps resume compatibility for that
// legacy rollout representation.
let legacy_custom_tool_call = ResponseItem::CustomToolCall {
id: None,
status: None,
call_id: "legacy-js-call".to_string(),
name: "js_repl".to_string(),
input: "console.log('legacy image flow')".to_string(),
};
let legacy_image_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
let rollout = vec![
RolloutLine {
timestamp: "2024-01-01T00:00:00.000Z".to_string(),
item: RolloutItem::SessionMeta(SessionMetaLine {
meta: SessionMeta {
id: ThreadId::default(),
timestamp: "2024-01-01T00:00:00Z".to_string(),
cwd: ".".into(),
originator: "test_originator".to_string(),
cli_version: "test_version".to_string(),
model_provider: Some("test-provider".to_string()),
..Default::default()
},
git: None,
}),
},
RolloutLine {
timestamp: "2024-01-01T00:00:01.000Z".to_string(),
item: RolloutItem::ResponseItem(legacy_custom_tool_call),
},
RolloutLine {
timestamp: "2024-01-01T00:00:02.000Z".to_string(),
item: RolloutItem::ResponseItem(ResponseItem::CustomToolCallOutput {
call_id: "legacy-js-call".to_string(),
output: FunctionCallOutputPayload::from_text("legacy js_repl stdout".to_string()),
}),
},
RolloutLine {
timestamp: "2024-01-01T00:00:03.000Z".to_string(),
item: RolloutItem::ResponseItem(ResponseItem::Message {
id: None,
role: "user".to_string(),
content: vec![ContentItem::InputImage {
image_url: legacy_image_url.to_string(),
}],
end_turn: None,
phase: None,
}),
},
];
let tmpdir = TempDir::new().unwrap();
let session_path = tmpdir
.path()
.join("resume-legacy-js-repl-image-rollout.jsonl");
let mut f = std::fs::File::create(&session_path).unwrap();
for line in rollout {
writeln!(f, "{}", serde_json::to_string(&line).unwrap()).unwrap();
}
let server = MockServer::start().await;
let resp_mock = mount_sse_once(
&server,
sse(vec![ev_response_created("resp1"), ev_completed("resp1")]),
)
.await;
let codex_home = Arc::new(TempDir::new().unwrap());
let mut builder = test_codex().with_model("gpt-5.1");
let test = builder
.resume(&server, codex_home, session_path.clone())
.await
.expect("resume conversation");
test.submit_turn("after resume").await.unwrap();
let input = resp_mock.single_request().input();
let legacy_output_index = input
.iter()
.position(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("custom_tool_call_output")
&& item.get("call_id").and_then(|value| value.as_str()) == Some("legacy-js-call")
})
.expect("legacy custom tool output should be replayed");
assert_eq!(
input[legacy_output_index]
.get("output")
.and_then(|value| value.as_str()),
Some("legacy js_repl stdout")
);
let legacy_image_index = input
.iter()
.position(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("message")
&& item.get("role").and_then(|value| value.as_str()) == Some("user")
&& item
.get("content")
.and_then(|value| value.as_array())
.is_some_and(|content| {
content.iter().any(|entry| {
entry.get("type").and_then(|value| value.as_str())
== Some("input_image")
&& entry.get("image_url").and_then(|value| value.as_str())
== Some(legacy_image_url)
})
})
})
.expect("legacy injected image message should be replayed");
let new_user_index = input
.iter()
.position(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("message")
&& item.get("role").and_then(|value| value.as_str()) == Some("user")
&& item
.get("content")
.and_then(|value| value.as_array())
.is_some_and(|content| {
content.iter().any(|entry| {
entry.get("type").and_then(|value| value.as_str()) == Some("input_text")
&& entry.get("text").and_then(|value| value.as_str())
== Some("after resume")
})
})
})
.expect("new user message should be present");
assert!(legacy_output_index < new_user_index);
assert!(legacy_image_index < new_user_index);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn includes_conversation_id_and_model_headers_in_request() {
skip_if_no_network!();
@ -1565,7 +1707,7 @@ async fn azure_responses_request_includes_store_and_reasoning_ids() {
});
prompt.input.push(ResponseItem::CustomToolCallOutput {
call_id: "custom-tool-call-id".into(),
output: "ok".into(),
output: FunctionCallOutputPayload::from_text("ok".into()),
});
let mut stream = client_session

View file

@ -358,40 +358,26 @@ console.log(out.output?.body?.text ?? "");
.await;
let req = mock.single_request();
let (js_repl_output, js_repl_success) = req
.custom_tool_call_output_content_and_success(call_id)
.expect("custom tool output present");
let js_repl_output = js_repl_output.expect("custom tool output text present");
assert_ne!(
js_repl_success,
Some(false),
"js_repl call failed unexpectedly: {js_repl_output}"
let body = req.body_json();
assert_eq!(
image_messages(&body).len(),
0,
"js_repl view_image should not inject a pending input image message"
);
let body = req.body_json();
let image_messages = image_messages(&body);
assert_eq!(
image_messages.len(),
1,
"js_repl view_image should inject exactly one pending input image message"
);
let image_message = image_messages
.into_iter()
.next()
.expect("pending input image message not included in request");
let image_url = image_message
.get("content")
let custom_output = req.custom_tool_call_output(call_id);
let output_items = custom_output
.get("output")
.and_then(Value::as_array)
.and_then(|content| {
content.iter().find_map(|span| {
if span.get("type").and_then(Value::as_str) == Some("input_image") {
span.get("image_url").and_then(Value::as_str)
} else {
None
}
})
.expect("custom_tool_call_output should be a content item array");
let image_url = output_items
.iter()
.find_map(|item| {
(item.get("type").and_then(Value::as_str) == Some("input_image"))
.then(|| item.get("image_url").and_then(Value::as_str))
.flatten()
})
.expect("image_url present");
.expect("image_url present in js_repl custom tool output");
assert!(
image_url.starts_with("data:image/png;base64,"),
"expected png data URL, got {image_url}"

View file

@ -161,7 +161,7 @@ pub enum ResponseInputItem {
},
CustomToolCallOutput {
call_id: String,
output: String,
output: FunctionCallOutputPayload,
},
}
@ -261,9 +261,12 @@ pub enum ResponseItem {
name: String,
input: String,
},
// `custom_tool_call_output.output` uses the same wire encoding as
// `function_call_output.output` so freeform tools can return either plain
// text or structured content items.
CustomToolCallOutput {
call_id: String,
output: String,
output: FunctionCallOutputPayload,
},
// Emitted by the Responses API when the agent triggers a web search.
// Example payload (from SSE `response.output_item.done`):
@ -1538,6 +1541,26 @@ mod tests {
Ok(())
}
/// A `CustomToolCallOutput` whose payload is built from content items must put a
/// JSON array in its serialized `output` field.
#[test]
fn serializes_custom_tool_image_outputs_as_array() -> Result<()> {
    let image = FunctionCallOutputContentItem::InputImage {
        image_url: "data:image/png;base64,BASE64".into(),
    };
    let payload = FunctionCallOutputPayload::from_content_items(vec![image]);
    let item = ResponseInputItem::CustomToolCallOutput {
        call_id: "call1".into(),
        output: payload,
    };

    // Round-trip through the string encoding so the check sees the wire form.
    let wire_text = serde_json::to_string(&item)?;
    let parsed: serde_json::Value = serde_json::from_str(&wire_text)?;

    let output = parsed.get("output").expect("output field");
    assert!(output.is_array(), "expected array output");

    Ok(())
}
#[test]
fn preserves_existing_image_data_urls() -> Result<()> {
let call_tool_result = CallToolResult {