image-gen-core (#13290)
Core tool-calling for image-gen: handles the logic for requesting and receiving images using the Responses API.
This commit is contained in:
parent
4f6c4bb143
commit
fa2306b303
22 changed files with 766 additions and 45 deletions
|
|
@ -1416,6 +1416,40 @@
|
|||
"title": "WebSearchCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"result": {
|
||||
"type": "string"
|
||||
},
|
||||
"revised_prompt": {
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"enum": [
|
||||
"image_generation_call"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItemType",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"result",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"ghost_commit": {
|
||||
|
|
|
|||
|
|
@ -4983,6 +4983,40 @@
|
|||
"title": "WebSearchCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"result": {
|
||||
"type": "string"
|
||||
},
|
||||
"revised_prompt": {
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"enum": [
|
||||
"image_generation_call"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItemType",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"result",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"ghost_commit": {
|
||||
|
|
|
|||
|
|
@ -11761,6 +11761,40 @@
|
|||
"title": "WebSearchCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"result": {
|
||||
"type": "string"
|
||||
},
|
||||
"revised_prompt": {
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"enum": [
|
||||
"image_generation_call"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItemType",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"result",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"ghost_commit": {
|
||||
|
|
|
|||
|
|
@ -9324,6 +9324,40 @@
|
|||
"title": "WebSearchCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"result": {
|
||||
"type": "string"
|
||||
},
|
||||
"revised_prompt": {
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"enum": [
|
||||
"image_generation_call"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItemType",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"result",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"ghost_commit": {
|
||||
|
|
|
|||
|
|
@ -641,6 +641,40 @@
|
|||
"title": "WebSearchCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"result": {
|
||||
"type": "string"
|
||||
},
|
||||
"revised_prompt": {
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"enum": [
|
||||
"image_generation_call"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItemType",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"result",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"ghost_commit": {
|
||||
|
|
|
|||
|
|
@ -691,6 +691,40 @@
|
|||
"title": "WebSearchCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"result": {
|
||||
"type": "string"
|
||||
},
|
||||
"revised_prompt": {
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"enum": [
|
||||
"image_generation_call"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItemType",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"result",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "ImageGenerationCallResponseItem",
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"ghost_commit": {
|
||||
|
|
|
|||
|
|
@ -15,4 +15,4 @@ export type ResponseItem = { "type": "message", role: string, content: Array<Con
|
|||
/**
|
||||
* Set when using the Responses API.
|
||||
*/
|
||||
call_id: string | null, status: LocalShellStatus, action: LocalShellAction, } | { "type": "function_call", name: string, arguments: string, call_id: string, } | { "type": "function_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "custom_tool_call", status?: string, call_id: string, name: string, input: string, } | { "type": "custom_tool_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "web_search_call", status?: string, action?: WebSearchAction, } | { "type": "ghost_snapshot", ghost_commit: GhostCommit, } | { "type": "compaction", encrypted_content: string, } | { "type": "other" };
|
||||
call_id: string | null, status: LocalShellStatus, action: LocalShellAction, } | { "type": "function_call", name: string, arguments: string, call_id: string, } | { "type": "function_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "custom_tool_call", status?: string, call_id: string, name: string, input: string, } | { "type": "custom_tool_call_output", call_id: string, output: FunctionCallOutputPayload, } | { "type": "web_search_call", status?: string, action?: WebSearchAction, } | { "type": "image_generation_call", id: string, status: string, revised_prompt?: string, result: string, } | { "type": "ghost_snapshot", ghost_commit: GhostCommit, } | { "type": "compaction", encrypted_content: string, } | { "type": "other" };
|
||||
|
|
|
|||
|
|
@ -365,6 +365,9 @@
|
|||
"image_detail_original": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"image_generation": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"include_apply_patch_tool": {
|
||||
"type": "boolean"
|
||||
},
|
||||
|
|
@ -1753,6 +1756,9 @@
|
|||
"image_detail_original": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"image_generation": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"include_apply_patch_tool": {
|
||||
"type": "boolean"
|
||||
},
|
||||
|
|
|
|||
|
|
@ -567,7 +567,6 @@ impl ModelClientSession {
|
|||
) -> ApiResponsesOptions {
|
||||
let turn_metadata_header = parse_turn_metadata_header(turn_metadata_header);
|
||||
let conversation_id = self.client.state.conversation_id.to_string();
|
||||
|
||||
ApiResponsesOptions {
|
||||
conversation_id: Some(conversation_id),
|
||||
session_source: Some(self.client.state.session_source.clone()),
|
||||
|
|
|
|||
|
|
@ -166,6 +166,8 @@ pub(crate) mod tools {
|
|||
Function(ResponsesApiTool),
|
||||
#[serde(rename = "local_shell")]
|
||||
LocalShell {},
|
||||
#[serde(rename = "image_generation")]
|
||||
ImageGeneration {},
|
||||
// TODO: Understand why we get an error on web_search although the API docs say it's supported.
|
||||
// https://platform.openai.com/docs/guides/tools-web-search?api-mode=responses#:~:text=%7B%20type%3A%20%22web_search%22%20%7D%2C
|
||||
// The `external_web_access` field determines whether the web search is over cached or live content.
|
||||
|
|
@ -184,6 +186,7 @@ pub(crate) mod tools {
|
|||
match self {
|
||||
ToolSpec::Function(tool) => tool.name.as_str(),
|
||||
ToolSpec::LocalShell {} => "local_shell",
|
||||
ToolSpec::ImageGeneration {} => "image_generation",
|
||||
ToolSpec::WebSearch { .. } => "web_search",
|
||||
ToolSpec::Freeform(tool) => tool.name.as_str(),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -206,6 +206,7 @@ fn should_keep_compacted_history_item(item: &ResponseItem) -> bool {
|
|||
| ResponseItem::CustomToolCall { .. }
|
||||
| ResponseItem::CustomToolCallOutput { .. }
|
||||
| ResponseItem::WebSearchCall { .. }
|
||||
| ResponseItem::ImageGenerationCall { .. }
|
||||
| ResponseItem::GhostSnapshot { .. }
|
||||
| ResponseItem::Other => false,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -344,6 +344,9 @@ impl ContextManager {
|
|||
// all outputs must have a corresponding function/tool call
|
||||
normalize::remove_orphan_outputs(&mut self.items);
|
||||
|
||||
// rewrite image-generation calls into messages to support stateless input
|
||||
normalize::rewrite_image_generation_calls_for_stateless_input(&mut self.items);
|
||||
|
||||
// strip images when model does not support them
|
||||
normalize::strip_images_when_unsupported(input_modalities, &mut self.items);
|
||||
}
|
||||
|
|
@ -374,6 +377,7 @@ impl ContextManager {
|
|||
| ResponseItem::LocalShellCall { .. }
|
||||
| ResponseItem::FunctionCall { .. }
|
||||
| ResponseItem::WebSearchCall { .. }
|
||||
| ResponseItem::ImageGenerationCall { .. }
|
||||
| ResponseItem::CustomToolCall { .. }
|
||||
| ResponseItem::Compaction { .. }
|
||||
| ResponseItem::GhostSnapshot { .. }
|
||||
|
|
@ -402,7 +406,8 @@ fn truncate_function_output_payload(
|
|||
}
|
||||
|
||||
/// API messages include every non-system item (user/assistant messages, reasoning,
|
||||
/// tool calls, tool outputs, shell calls, and web-search calls).
|
||||
/// tool calls, tool outputs, shell calls, web-search calls, and image-generation
|
||||
/// calls).
|
||||
fn is_api_message(message: &ResponseItem) -> bool {
|
||||
match message {
|
||||
ResponseItem::Message { role, .. } => role.as_str() != "system",
|
||||
|
|
@ -413,6 +418,7 @@ fn is_api_message(message: &ResponseItem) -> bool {
|
|||
| ResponseItem::LocalShellCall { .. }
|
||||
| ResponseItem::Reasoning { .. }
|
||||
| ResponseItem::WebSearchCall { .. }
|
||||
| ResponseItem::ImageGenerationCall { .. }
|
||||
| ResponseItem::Compaction { .. } => true,
|
||||
ResponseItem::GhostSnapshot { .. } => false,
|
||||
ResponseItem::Other => false,
|
||||
|
|
@ -600,6 +606,7 @@ fn is_model_generated_item(item: &ResponseItem) -> bool {
|
|||
ResponseItem::Reasoning { .. }
|
||||
| ResponseItem::FunctionCall { .. }
|
||||
| ResponseItem::WebSearchCall { .. }
|
||||
| ResponseItem::ImageGenerationCall { .. }
|
||||
| ResponseItem::CustomToolCall { .. }
|
||||
| ResponseItem::LocalShellCall { .. }
|
||||
| ResponseItem::Compaction { .. } => true,
|
||||
|
|
|
|||
|
|
@ -395,6 +395,97 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
|
|||
}
|
||||
}
|
||||
|
||||
/// With image input supported, an image-generation call in history is sent to the
/// model as a user message holding the image as a `data:` URL, and the other
/// history items are passed through unchanged.
#[test]
fn for_prompt_rewrites_image_generation_calls_when_images_are_supported() {
    // Wraps the given content items in a plain user message.
    let user_message = |content: Vec<ContentItem>| ResponseItem::Message {
        id: None,
        role: "user".to_string(),
        content,
        end_turn: None,
        phase: None,
    };
    // The trailing user text that must survive the rewrite untouched.
    let greeting = || {
        user_message(vec![ContentItem::InputText {
            text: "hi".to_string(),
        }])
    };

    let generated_image = ResponseItem::ImageGenerationCall {
        id: "ig_123".to_string(),
        status: "generating".to_string(),
        revised_prompt: Some("lobster".to_string()),
        result: "Zm9v".to_string(),
    };
    let history = create_history_with_items(vec![generated_image, greeting()]);

    let expected = vec![
        user_message(vec![ContentItem::InputImage {
            image_url: "data:image/png;base64,Zm9v".to_string(),
        }]),
        greeting(),
    ];

    assert_eq!(history.for_prompt(&default_input_modalities()), expected);
}
|
||||
|
||||
/// With a text-only model, a generated image in history ends up as a user text
/// message carrying the "image content omitted" placeholder.
#[test]
fn for_prompt_rewrites_image_generation_calls_when_images_are_unsupported() {
    // Builds a user message with a single text content item.
    let user_text = |text: &str| ResponseItem::Message {
        id: None,
        role: "user".to_string(),
        content: vec![ContentItem::InputText {
            text: text.to_string(),
        }],
        end_turn: None,
        phase: None,
    };

    let generated_image = ResponseItem::ImageGenerationCall {
        id: "ig_123".to_string(),
        status: "completed".to_string(),
        revised_prompt: Some("lobster".to_string()),
        result: "Zm9v".to_string(),
    };
    let history =
        create_history_with_items(vec![user_text("generate a lobster"), generated_image]);

    let text_only = [InputModality::Text];
    let expected = vec![
        user_text("generate a lobster"),
        user_text("image content omitted because you do not support image input"),
    ];

    assert_eq!(history.for_prompt(&text_only), expected);
}
|
||||
|
||||
#[test]
|
||||
fn get_history_for_prompt_drops_ghost_commits() {
|
||||
let items = vec![ResponseItem::GhostSnapshot {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::openai_models::InputModality;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use crate::util::error_or_panic;
|
||||
use tracing::info;
|
||||
|
|
@ -211,6 +210,31 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn rewrite_image_generation_calls_for_stateless_input(items: &mut Vec<ResponseItem>) {
|
||||
let original_items = std::mem::take(items);
|
||||
*items = original_items
|
||||
.into_iter()
|
||||
.map(|item| match item {
|
||||
ResponseItem::ImageGenerationCall { result, .. } => {
|
||||
let image_url = if result.starts_with("data:") {
|
||||
result
|
||||
} else {
|
||||
format!("data:image/png;base64,{result}")
|
||||
};
|
||||
|
||||
ResponseItem::Message {
|
||||
id: None,
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentItem::InputImage { image_url }],
|
||||
end_turn: None,
|
||||
phase: None,
|
||||
}
|
||||
}
|
||||
_ => item,
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
|
||||
/// Strip image content from messages and tool outputs when the model does not support images.
|
||||
/// When `input_modalities` contains `InputModality::Image`, no stripping is performed.
|
||||
pub(crate) fn strip_images_when_unsupported(
|
||||
|
|
|
|||
|
|
@ -131,6 +131,8 @@ pub enum Feature {
|
|||
Apps,
|
||||
/// Enable plugins.
|
||||
Plugins,
|
||||
/// Allow the model to invoke the built-in image generation tool.
|
||||
ImageGeneration,
|
||||
/// Route apps MCP calls through the configured gateway.
|
||||
AppsMcpGateway,
|
||||
/// Allow prompting and installing missing MCP dependencies.
|
||||
|
|
@ -649,6 +651,12 @@ pub const FEATURES: &[FeatureSpec] = &[
|
|||
stage: Stage::UnderDevelopment,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::ImageGeneration,
|
||||
key: "image_generation",
|
||||
stage: Stage::UnderDevelopment,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::AppsMcpGateway,
|
||||
key: "apps_mcp_gateway",
|
||||
|
|
@ -868,6 +876,12 @@ mod tests {
|
|||
assert_eq!(Feature::JsRepl.default_enabled(), false);
|
||||
}
|
||||
|
||||
/// The image-generation feature is still under development and off by default.
#[test]
fn image_generation_is_under_development() {
    let stage = Feature::ImageGeneration.stage();
    let enabled_by_default = Feature::ImageGeneration.default_enabled();

    assert_eq!(stage, Stage::UnderDevelopment);
    assert_eq!(enabled_by_default, false);
}
|
||||
|
||||
#[test]
|
||||
fn collab_is_legacy_alias_for_multi_agent() {
|
||||
assert_eq!(feature_for_key("multi_agent"), Some(Feature::Collab));
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ pub(crate) fn should_persist_response_item(item: &ResponseItem) -> bool {
|
|||
| ResponseItem::CustomToolCall { .. }
|
||||
| ResponseItem::CustomToolCallOutput { .. }
|
||||
| ResponseItem::WebSearchCall { .. }
|
||||
| ResponseItem::ImageGenerationCall { .. }
|
||||
| ResponseItem::GhostSnapshot { .. }
|
||||
| ResponseItem::Compaction { .. } => true,
|
||||
ResponseItem::Other => false,
|
||||
|
|
@ -53,6 +54,7 @@ pub(crate) fn should_persist_response_item_for_memories(item: &ResponseItem) ->
|
|||
| ResponseItem::CustomToolCallOutput { .. }
|
||||
| ResponseItem::WebSearchCall { .. } => true,
|
||||
ResponseItem::Reasoning { .. }
|
||||
| ResponseItem::ImageGenerationCall { .. }
|
||||
| ResponseItem::GhostSnapshot { .. }
|
||||
| ResponseItem::Compaction { .. }
|
||||
| ResponseItem::Other => false,
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ use codex_protocol::dynamic_tools::DynamicToolSpec;
|
|||
use codex_protocol::models::VIEW_IMAGE_TOOL_NAME;
|
||||
use codex_protocol::openai_models::ApplyPatchToolType;
|
||||
use codex_protocol::openai_models::ConfigShellToolType;
|
||||
use codex_protocol::openai_models::InputModality;
|
||||
use codex_protocol::openai_models::ModelInfo;
|
||||
use codex_protocol::protocol::SessionSource;
|
||||
use codex_protocol::protocol::SubAgentSource;
|
||||
|
|
@ -35,7 +36,6 @@ use std::collections::HashMap;
|
|||
|
||||
const SEARCH_TOOL_BM25_DESCRIPTION_TEMPLATE: &str =
|
||||
include_str!("../../templates/search_tool/tool_description.md");
|
||||
|
||||
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum ShellCommandBackendConfig {
|
||||
Classic,
|
||||
|
|
@ -49,6 +49,7 @@ pub(crate) struct ToolsConfig {
|
|||
pub allow_login_shell: bool,
|
||||
pub apply_patch_tool_type: Option<ApplyPatchToolType>,
|
||||
pub web_search_mode: Option<WebSearchMode>,
|
||||
pub image_gen_tool: bool,
|
||||
pub agent_roles: BTreeMap<String, AgentRoleConfig>,
|
||||
pub search_tool: bool,
|
||||
pub request_permission_enabled: bool,
|
||||
|
|
@ -86,6 +87,8 @@ impl ToolsConfig {
|
|||
features.enabled(Feature::DefaultModeRequestUserInput);
|
||||
let include_search_tool = features.enabled(Feature::Apps);
|
||||
let include_artifact_tools = features.enabled(Feature::Artifact);
|
||||
let include_image_gen_tool =
|
||||
features.enabled(Feature::ImageGeneration) && supports_image_generation(model_info);
|
||||
let include_agent_jobs = include_collab_tools && features.enabled(Feature::Sqlite);
|
||||
let request_permission_enabled = features.enabled(Feature::RequestPermissions);
|
||||
let shell_command_backend =
|
||||
|
|
@ -135,6 +138,7 @@ impl ToolsConfig {
|
|||
allow_login_shell: true,
|
||||
apply_patch_tool_type,
|
||||
web_search_mode: *web_search_mode,
|
||||
image_gen_tool: include_image_gen_tool,
|
||||
agent_roles: BTreeMap::new(),
|
||||
search_tool: include_search_tool,
|
||||
request_permission_enabled,
|
||||
|
|
@ -160,6 +164,10 @@ impl ToolsConfig {
|
|||
}
|
||||
}
|
||||
|
||||
fn supports_image_generation(model_info: &ModelInfo) -> bool {
|
||||
model_info.input_modalities.contains(&InputModality::Image)
|
||||
}
|
||||
|
||||
/// Generic JSON‑Schema subset needed for our tool definitions
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(tag = "type", rename_all = "lowercase")]
|
||||
|
|
@ -1917,6 +1925,10 @@ pub(crate) fn build_specs(
|
|||
Some(WebSearchMode::Disabled) | None => {}
|
||||
}
|
||||
|
||||
if config.image_gen_tool {
|
||||
builder.push_spec(ToolSpec::ImageGeneration {});
|
||||
}
|
||||
|
||||
builder.push_spec_with_parallel_support(create_view_image_tool(), true);
|
||||
builder.register_handler("view_image", view_image_handler);
|
||||
|
||||
|
|
@ -1995,6 +2007,7 @@ mod tests {
|
|||
use crate::models_manager::manager::ModelsManager;
|
||||
use crate::models_manager::model_info::with_config_overrides;
|
||||
use crate::tools::registry::ConfiguredToolSpec;
|
||||
use codex_protocol::openai_models::InputModality;
|
||||
use codex_protocol::openai_models::ModelInfo;
|
||||
use codex_protocol::openai_models::ModelsResponse;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
|
@ -2047,6 +2060,7 @@ mod tests {
|
|||
match tool {
|
||||
ToolSpec::Function(ResponsesApiTool { name, .. }) => name,
|
||||
ToolSpec::LocalShell {} => "local_shell",
|
||||
ToolSpec::ImageGeneration {} => "image_generation",
|
||||
ToolSpec::WebSearch { .. } => "web_search",
|
||||
ToolSpec::Freeform(FreeformTool { name, .. }) => name,
|
||||
}
|
||||
|
|
@ -2125,7 +2139,10 @@ mod tests {
|
|||
ToolSpec::Function(ResponsesApiTool { parameters, .. }) => {
|
||||
strip_descriptions_schema(parameters);
|
||||
}
|
||||
ToolSpec::Freeform(_) | ToolSpec::LocalShell {} | ToolSpec::WebSearch { .. } => {}
|
||||
ToolSpec::Freeform(_)
|
||||
| ToolSpec::LocalShell {}
|
||||
| ToolSpec::ImageGeneration {}
|
||||
| ToolSpec::WebSearch { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2374,6 +2391,56 @@ mod tests {
|
|||
assert_contains_tool_names(&tools, &["js_repl", "js_repl_reset"]);
|
||||
}
|
||||
|
||||
/// The `image_generation` tool is only offered when the feature flag is enabled
/// and the model accepts image input.
#[test]
fn image_generation_tools_require_feature_and_supported_model() {
    let config = test_config();

    let mut image_capable_model =
        ModelsManager::construct_model_info_offline_for_tests("gpt-5.2", &config);
    image_capable_model.slug = "custom/gpt-5.2-variant".to_string();
    let text_only_model = ModelInfo {
        input_modalities: vec![InputModality::Text],
        ..image_capable_model.clone()
    };

    let features_off = Features::with_defaults();
    let features_on = {
        let mut features = features_off.clone();
        features.enable(Feature::ImageGeneration);
        features
    };

    // Builds the tool specs for the given model / feature combination.
    let build_tools = |model_info: &ModelInfo, features: &Features| {
        let tools_config = ToolsConfig::new(&ToolsConfigParams {
            model_info,
            features,
            web_search_mode: Some(WebSearchMode::Cached),
            session_source: SessionSource::Cli,
        });
        let (tools, _) = build_specs(&tools_config, None, None, &[]).build();
        tools
    };

    // Feature flag off: the tool must be absent even for a capable model.
    let default_tools = build_tools(&image_capable_model, &features_off);
    assert!(
        default_tools
            .iter()
            .all(|tool| tool.spec.name() != "image_generation"),
        "image_generation should be disabled by default"
    );

    // Feature flag on and image-capable model: the tool is present.
    let supported_tools = build_tools(&image_capable_model, &features_on);
    assert_contains_tool_names(&supported_tools, &["image_generation"]);

    // Feature flag on but text-only model: the tool is absent again.
    let unsupported_tools = build_tools(&text_only_model, &features_on);
    assert!(
        unsupported_tools
            .iter()
            .all(|tool| tool.spec.name() != "image_generation"),
        "image_generation should be disabled for unsupported models"
    );
}
|
||||
|
||||
#[test]
|
||||
fn js_repl_freeform_grammar_blocks_common_non_js_prefixes() {
|
||||
let ToolSpec::Freeform(FreeformTool { format, .. }) = create_js_repl_tool() else {
|
||||
|
|
|
|||
|
|
@ -739,6 +739,24 @@ pub fn ev_web_search_call_done(id: &str, status: &str, query: &str) -> Value {
|
|||
})
|
||||
}
|
||||
|
||||
pub fn ev_image_generation_call(
|
||||
id: &str,
|
||||
status: &str,
|
||||
revised_prompt: &str,
|
||||
result: &str,
|
||||
) -> Value {
|
||||
serde_json::json!({
|
||||
"type": "response.output_item.done",
|
||||
"item": {
|
||||
"type": "image_generation_call",
|
||||
"id": id,
|
||||
"status": status,
|
||||
"revised_prompt": revised_prompt,
|
||||
"result": result,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn ev_function_call(call_id: &str, name: &str, arguments: &str) -> Value {
|
||||
serde_json::json!({
|
||||
"type": "response.output_item.done",
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ use codex_protocol::protocol::Op;
|
|||
use codex_protocol::protocol::SandboxPolicy;
|
||||
use codex_protocol::user_input::UserInput;
|
||||
use core_test_support::responses::ev_completed_with_tokens;
|
||||
use core_test_support::responses::ev_image_generation_call;
|
||||
use core_test_support::responses::ev_response_created;
|
||||
use core_test_support::responses::mount_models_once;
|
||||
use core_test_support::responses::mount_sse_once;
|
||||
|
|
@ -33,6 +34,47 @@ use core_test_support::wait_for_event;
|
|||
use pretty_assertions::assert_eq;
|
||||
use wiremock::MockServer;
|
||||
|
||||
fn test_model_info(
|
||||
slug: &str,
|
||||
display_name: &str,
|
||||
description: &str,
|
||||
input_modalities: Vec<InputModality>,
|
||||
) -> ModelInfo {
|
||||
ModelInfo {
|
||||
slug: slug.to_string(),
|
||||
display_name: display_name.to_string(),
|
||||
description: Some(description.to_string()),
|
||||
default_reasoning_level: Some(ReasoningEffort::Medium),
|
||||
supported_reasoning_levels: vec![ReasoningEffortPreset {
|
||||
effort: ReasoningEffort::Medium,
|
||||
description: ReasoningEffort::Medium.to_string(),
|
||||
}],
|
||||
shell_type: ConfigShellToolType::ShellCommand,
|
||||
visibility: ModelVisibility::List,
|
||||
supported_in_api: true,
|
||||
input_modalities,
|
||||
prefer_websockets: false,
|
||||
used_fallback_model_metadata: false,
|
||||
priority: 1,
|
||||
upgrade: None,
|
||||
base_instructions: "base instructions".to_string(),
|
||||
model_messages: None,
|
||||
supports_reasoning_summaries: false,
|
||||
default_reasoning_summary: ReasoningSummary::Auto,
|
||||
support_verbosity: false,
|
||||
default_verbosity: None,
|
||||
availability_nux: None,
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
experimental_supported_tools: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn model_change_appends_model_instructions_developer_message() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
|
@ -274,44 +316,18 @@ async fn model_change_from_image_to_text_strips_prior_image_content() -> Result<
|
|||
let server = MockServer::start().await;
|
||||
let image_model_slug = "test-image-model";
|
||||
let text_model_slug = "test-text-only-model";
|
||||
let image_model = ModelInfo {
|
||||
slug: image_model_slug.to_string(),
|
||||
display_name: "Test Image Model".to_string(),
|
||||
description: Some("supports image input".to_string()),
|
||||
default_reasoning_level: Some(ReasoningEffort::Medium),
|
||||
supported_reasoning_levels: vec![ReasoningEffortPreset {
|
||||
effort: ReasoningEffort::Medium,
|
||||
description: ReasoningEffort::Medium.to_string(),
|
||||
}],
|
||||
shell_type: ConfigShellToolType::ShellCommand,
|
||||
visibility: ModelVisibility::List,
|
||||
supported_in_api: true,
|
||||
input_modalities: default_input_modalities(),
|
||||
prefer_websockets: false,
|
||||
used_fallback_model_metadata: false,
|
||||
priority: 1,
|
||||
upgrade: None,
|
||||
base_instructions: "base instructions".to_string(),
|
||||
model_messages: None,
|
||||
supports_reasoning_summaries: false,
|
||||
default_reasoning_summary: ReasoningSummary::Auto,
|
||||
support_verbosity: false,
|
||||
default_verbosity: None,
|
||||
availability_nux: None,
|
||||
apply_patch_tool_type: None,
|
||||
truncation_policy: TruncationPolicyConfig::bytes(10_000),
|
||||
supports_parallel_tool_calls: false,
|
||||
supports_image_detail_original: false,
|
||||
context_window: Some(272_000),
|
||||
auto_compact_token_limit: None,
|
||||
effective_context_window_percent: 95,
|
||||
experimental_supported_tools: Vec::new(),
|
||||
};
|
||||
let mut text_model = image_model.clone();
|
||||
text_model.slug = text_model_slug.to_string();
|
||||
text_model.display_name = "Test Text Model".to_string();
|
||||
text_model.description = Some("text only".to_string());
|
||||
text_model.input_modalities = vec![InputModality::Text];
|
||||
let image_model = test_model_info(
|
||||
image_model_slug,
|
||||
"Test Image Model",
|
||||
"supports image input",
|
||||
default_input_modalities(),
|
||||
);
|
||||
let text_model = test_model_info(
|
||||
text_model_slug,
|
||||
"Test Text Model",
|
||||
"text only",
|
||||
vec![InputModality::Text],
|
||||
);
|
||||
mount_models_once(
|
||||
&server,
|
||||
ModelsResponse {
|
||||
|
|
@ -421,6 +437,213 @@ async fn model_change_from_image_to_text_strips_prior_image_content() -> Result<
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn generated_image_is_replayed_for_image_capable_models() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = MockServer::start().await;
|
||||
let image_model_slug = "test-image-model";
|
||||
let image_model = test_model_info(
|
||||
image_model_slug,
|
||||
"Test Image Model",
|
||||
"supports image input",
|
||||
default_input_modalities(),
|
||||
);
|
||||
mount_models_once(
|
||||
&server,
|
||||
ModelsResponse {
|
||||
models: vec![image_model],
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let responses = mount_sse_sequence(
|
||||
&server,
|
||||
vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_image_generation_call("ig_123", "completed", "lobster", "Zm9v"),
|
||||
ev_completed_with_tokens("resp-1", 10),
|
||||
]),
|
||||
sse_completed("resp-2"),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
let mut builder = test_codex()
|
||||
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
|
||||
.with_config(move |config| {
|
||||
config.model = Some(image_model_slug.to_string());
|
||||
});
|
||||
let test = builder.build(&server).await?;
|
||||
let models_manager = test.thread_manager.get_models_manager();
|
||||
let _ = models_manager
|
||||
.list_models(RefreshStrategy::OnlineIfUncached)
|
||||
.await;
|
||||
|
||||
test.codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "generate a lobster".to_string(),
|
||||
text_elements: Vec::new(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd_path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::new_read_only_policy(),
|
||||
model: image_model_slug.to_string(),
|
||||
effort: test.config.model_reasoning_effort,
|
||||
service_tier: None,
|
||||
summary: None,
|
||||
collaboration_mode: None,
|
||||
personality: None,
|
||||
})
|
||||
.await?;
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
|
||||
|
||||
test.codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "describe the generated image".to_string(),
|
||||
text_elements: Vec::new(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd_path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::new_read_only_policy(),
|
||||
model: image_model_slug.to_string(),
|
||||
effort: test.config.model_reasoning_effort,
|
||||
service_tier: None,
|
||||
summary: None,
|
||||
collaboration_mode: None,
|
||||
personality: None,
|
||||
})
|
||||
.await?;
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
|
||||
|
||||
let requests = responses.requests();
|
||||
assert_eq!(requests.len(), 2, "expected two model requests");
|
||||
|
||||
let second_request = requests.last().expect("expected second request");
|
||||
assert_eq!(
|
||||
second_request.message_input_image_urls("user"),
|
||||
vec!["data:image/png;base64,Zm9v".to_string()]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn model_change_from_generated_image_to_text_strips_prior_generated_image_content()
|
||||
-> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = MockServer::start().await;
|
||||
let image_model_slug = "test-image-model";
|
||||
let text_model_slug = "test-text-only-model";
|
||||
let image_model = test_model_info(
|
||||
image_model_slug,
|
||||
"Test Image Model",
|
||||
"supports image input",
|
||||
default_input_modalities(),
|
||||
);
|
||||
let text_model = test_model_info(
|
||||
text_model_slug,
|
||||
"Test Text Model",
|
||||
"text only",
|
||||
vec![InputModality::Text],
|
||||
);
|
||||
mount_models_once(
|
||||
&server,
|
||||
ModelsResponse {
|
||||
models: vec![image_model, text_model],
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let responses = mount_sse_sequence(
|
||||
&server,
|
||||
vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_image_generation_call("ig_123", "completed", "lobster", "Zm9v"),
|
||||
ev_completed_with_tokens("resp-1", 10),
|
||||
]),
|
||||
sse_completed("resp-2"),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
let mut builder = test_codex()
|
||||
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
|
||||
.with_config(move |config| {
|
||||
config.model = Some(image_model_slug.to_string());
|
||||
});
|
||||
let test = builder.build(&server).await?;
|
||||
let models_manager = test.thread_manager.get_models_manager();
|
||||
let _ = models_manager
|
||||
.list_models(RefreshStrategy::OnlineIfUncached)
|
||||
.await;
|
||||
|
||||
test.codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "generate a lobster".to_string(),
|
||||
text_elements: Vec::new(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd_path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::new_read_only_policy(),
|
||||
model: image_model_slug.to_string(),
|
||||
effort: test.config.model_reasoning_effort,
|
||||
service_tier: None,
|
||||
summary: None,
|
||||
collaboration_mode: None,
|
||||
personality: None,
|
||||
})
|
||||
.await?;
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
|
||||
|
||||
test.codex
|
||||
.submit(Op::UserTurn {
|
||||
items: vec![UserInput::Text {
|
||||
text: "describe the generated image".to_string(),
|
||||
text_elements: Vec::new(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd_path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::new_read_only_policy(),
|
||||
model: text_model_slug.to_string(),
|
||||
effort: test.config.model_reasoning_effort,
|
||||
service_tier: None,
|
||||
summary: None,
|
||||
collaboration_mode: None,
|
||||
personality: None,
|
||||
})
|
||||
.await?;
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
|
||||
|
||||
let requests = responses.requests();
|
||||
assert_eq!(requests.len(), 2, "expected two model requests");
|
||||
|
||||
let second_request = requests.last().expect("expected second request");
|
||||
assert!(
|
||||
second_request.message_input_image_urls("user").is_empty(),
|
||||
"second request should strip generated image content for text-only models"
|
||||
);
|
||||
assert!(
|
||||
second_request
|
||||
.message_input_texts("user")
|
||||
.iter()
|
||||
.any(|text| text == "image content omitted because you do not support image input"),
|
||||
"second request should include the image-omitted placeholder text"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn model_switch_to_smaller_model_updates_token_context_window() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
|
|
|||
|
|
@ -771,6 +771,7 @@ impl OtelManager {
|
|||
ResponseItem::CustomToolCall { .. } => "custom_tool_call".into(),
|
||||
ResponseItem::CustomToolCallOutput { .. } => "custom_tool_call_output".into(),
|
||||
ResponseItem::WebSearchCall { .. } => "web_search_call".into(),
|
||||
ResponseItem::ImageGenerationCall { .. } => "image_generation_call".into(),
|
||||
ResponseItem::GhostSnapshot { .. } => "ghost_snapshot".into(),
|
||||
ResponseItem::Compaction { .. } => "compaction".into(),
|
||||
ResponseItem::Other => "other".into(),
|
||||
|
|
|
|||
|
|
@ -313,6 +313,23 @@ pub enum ResponseItem {
|
|||
#[ts(optional)]
|
||||
action: Option<WebSearchAction>,
|
||||
},
|
||||
// Emitted by the Responses API when the agent triggers image generation.
|
||||
// Example payload:
|
||||
// {
|
||||
// "id":"ig_123",
|
||||
// "type":"image_generation_call",
|
||||
// "status":"completed",
|
||||
// "revised_prompt":"A gray tabby cat hugging an otter...",
|
||||
// "result":"..."
|
||||
// }
|
||||
ImageGenerationCall {
|
||||
id: String,
|
||||
status: String,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
#[ts(optional)]
|
||||
revised_prompt: Option<String>,
|
||||
result: String,
|
||||
},
|
||||
// Generated by the harness but considered exactly as a model response.
|
||||
GhostSnapshot {
|
||||
ghost_commit: GhostCommit,
|
||||
|
|
@ -1286,6 +1303,49 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// An `image_generation_call` payload that includes `revised_prompt`
/// deserializes into `ResponseItem::ImageGenerationCall` with every field set.
#[test]
fn response_item_parses_image_generation_call() {
    let payload = serde_json::json!({
        "id": "ig_123",
        "type": "image_generation_call",
        "status": "completed",
        "revised_prompt": "A small blue square",
        "result": "Zm9v",
    });

    let parsed: ResponseItem =
        serde_json::from_value(payload).expect("image generation item should deserialize");

    let expected = ResponseItem::ImageGenerationCall {
        id: "ig_123".to_string(),
        status: "completed".to_string(),
        revised_prompt: Some("A small blue square".to_string()),
        result: "Zm9v".to_string(),
    };
    assert_eq!(parsed, expected);
}
|
||||
|
||||
/// An `image_generation_call` payload that omits `revised_prompt` still
/// deserializes, yielding `revised_prompt: None`.
#[test]
fn response_item_parses_image_generation_call_without_revised_prompt() {
    let payload = serde_json::json!({
        "id": "ig_123",
        "type": "image_generation_call",
        "status": "completed",
        "result": "Zm9v",
    });

    let parsed: ResponseItem =
        serde_json::from_value(payload).expect("image generation item should deserialize");

    let expected = ResponseItem::ImageGenerationCall {
        id: "ig_123".to_string(),
        status: "completed".to_string(),
        revised_prompt: None,
        result: "Zm9v".to_string(),
    };
    assert_eq!(parsed, expected);
}
|
||||
|
||||
#[test]
|
||||
fn convert_mcp_content_to_items_builds_data_urls_when_missing_prefix() {
|
||||
let contents = vec![serde_json::json!({
|
||||
|
|
|
|||
|
|
@ -706,6 +706,7 @@ mod tests {
|
|||
"limit": 10000
|
||||
},
|
||||
"supports_parallel_tool_calls": false,
|
||||
"supports_image_detail_original": false,
|
||||
"context_window": null,
|
||||
"auto_compact_token_limit": null,
|
||||
"effective_context_window_percent": 95,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue