From ecb41fcb64f22445b051d3943fc970d2ce51d0cd Mon Sep 17 00:00:00 2001 From: Won Park Date: Thu, 28 May 2026 11:44:55 -0700 Subject: [PATCH] Add feature-gated standalone image generation extension (#24723) ## Why Add a standalone image generation path that can be exercised independently of hosted Responses image generation, while retaining the hosted tool as fallback unless the extension is actually available to the model. ## What changed - Added the `codex-image-generation-extension` crate with standalone generate/edit execution, prior-image selection for edits, model-visible image output, and local generated-image persistence. - Installed the extension in app-server behind the disabled-by-default `imagegenext` feature and backend eligibility checks. - Updated core tool planning so eligible `image_gen.imagegen` exposure replaces hosted `image_generation`, while unavailable configurations retain hosted fallback. - Added coverage for extension behavior, edit history reuse, feature gating, auth eligibility, and hosted-tool replacement. - The extension is installed through app-server only in this PR; other execution paths retain hosted image generation because hosted replacement occurs only when the standalone executor is actually registered and model-visible. - The initial extension contract intentionally fixes the image model to `gpt-image-2` and uses automatic image parameters. - Native generated-image history/card parity and rollout persistence cleanup are intentionally deferred follow-up work. 
## Validation - `just test -p codex-image-generation-extension` - `just test -p codex-features` - `just test -p codex-core hosted_tools_follow_provider_auth_model_and_config_gates` - `just test -p codex-app-server` - `just fix -p codex-image-generation-extension -p codex-features -p codex-core -p codex-app-server` - `just fmt` - `just bazel-lock-update` - `just bazel-lock-check` --------- Co-authored-by: jif-oai --- codex-rs/Cargo.lock | 26 ++ codex-rs/Cargo.toml | 2 + codex-rs/app-server/Cargo.toml | 1 + codex-rs/app-server/src/extensions.rs | 3 +- codex-rs/core/config.schema.json | 6 + codex-rs/core/src/tools/spec_plan.rs | 39 +- codex-rs/core/src/tools/spec_plan_tests.rs | 10 + codex-rs/ext/image-generation/BUILD.bazel | 9 + codex-rs/ext/image-generation/Cargo.toml | 37 ++ .../image-generation/imagegen_description.md | 11 + codex-rs/ext/image-generation/src/backend.rs | 60 +++ .../ext/image-generation/src/extension.rs | 99 +++++ codex-rs/ext/image-generation/src/lib.rs | 8 + codex-rs/ext/image-generation/src/tests.rs | 341 +++++++++++++++ codex-rs/ext/image-generation/src/tool.rs | 395 ++++++++++++++++++ codex-rs/features/src/lib.rs | 8 + codex-rs/features/src/tests.rs | 7 + 17 files changed, 1056 insertions(+), 6 deletions(-) create mode 100644 codex-rs/ext/image-generation/BUILD.bazel create mode 100644 codex-rs/ext/image-generation/Cargo.toml create mode 100644 codex-rs/ext/image-generation/imagegen_description.md create mode 100644 codex-rs/ext/image-generation/src/backend.rs create mode 100644 codex-rs/ext/image-generation/src/extension.rs create mode 100644 codex-rs/ext/image-generation/src/lib.rs create mode 100644 codex-rs/ext/image-generation/src/tests.rs create mode 100644 codex-rs/ext/image-generation/src/tool.rs diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index aac0d6f90c..1951db642f 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -1928,6 +1928,7 @@ dependencies = [ "codex-git-utils", "codex-guardian", "codex-hooks", + 
"codex-image-generation-extension", "codex-login", "codex-mcp", "codex-memories-extension", @@ -3039,6 +3040,31 @@ dependencies = [ "uuid", ] +[[package]] +name = "codex-image-generation-extension" +version = "0.0.0" +dependencies = [ + "async-trait", + "base64 0.22.1", + "codex-api", + "codex-core", + "codex-extension-api", + "codex-features", + "codex-login", + "codex-model-provider", + "codex-model-provider-info", + "codex-protocol", + "codex-tools", + "http 1.4.0", + "pretty_assertions", + "schemars 0.8.22", + "serde", + "serde_json", + "tempfile", + "tokio", + "tracing", +] + [[package]] name = "codex-install-context" version = "0.0.0" diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index b1f9e4c31a..808cdef1b9 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -47,6 +47,7 @@ members = [ "ext/extension-api", "ext/goal", "ext/guardian", + "ext/image-generation", "ext/memories", "ext/web-search", "external-agent-migration", @@ -165,6 +166,7 @@ codex-execpolicy = { path = "execpolicy" } codex-extension-api = { path = "ext/extension-api" } codex-goal-extension = { path = "ext/goal" } codex-guardian = { path = "ext/guardian" } +codex-image-generation-extension = { path = "ext/image-generation" } codex-external-agent-migration = { path = "external-agent-migration" } codex-external-agent-sessions = { path = "external-agent-sessions" } codex-experimental-api-macros = { path = "codex-experimental-api-macros" } diff --git a/codex-rs/app-server/Cargo.toml b/codex-rs/app-server/Cargo.toml index 09e3197b77..d1a69783ad 100644 --- a/codex-rs/app-server/Cargo.toml +++ b/codex-rs/app-server/Cargo.toml @@ -54,6 +54,7 @@ codex-backend-client = { workspace = true } codex-file-search = { workspace = true } codex-chatgpt = { workspace = true } codex-login = { workspace = true } +codex-image-generation-extension = { workspace = true } codex-memories-extension = { workspace = true } codex-web-search-extension = { workspace = true } codex-memories-write = { workspace 
= true } diff --git a/codex-rs/app-server/src/extensions.rs b/codex-rs/app-server/src/extensions.rs index 1246da7b37..7b2673ca06 100644 --- a/codex-rs/app-server/src/extensions.rs +++ b/codex-rs/app-server/src/extensions.rs @@ -31,7 +31,8 @@ where let mut builder = ExtensionRegistryBuilder::::with_event_sink(event_sink); codex_guardian::install(&mut builder, guardian_agent_spawner); codex_memories_extension::install(&mut builder, codex_otel::global()); - codex_web_search_extension::install(&mut builder, auth_manager); + codex_web_search_extension::install(&mut builder, auth_manager.clone()); + codex_image_generation_extension::install(&mut builder, auth_manager); Arc::new(builder.build()) } diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index ab855d733b..1dd6bda8a4 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -473,6 +473,9 @@ "image_generation": { "type": "boolean" }, + "imagegenext": { + "type": "boolean" + }, "in_app_browser": { "type": "boolean" }, @@ -4564,6 +4567,9 @@ "image_generation": { "type": "boolean" }, + "imagegenext": { + "type": "boolean" + }, "in_app_browser": { "type": "boolean" }, diff --git a/codex-rs/core/src/tools/spec_plan.rs b/codex-rs/core/src/tools/spec_plan.rs index 2610bee95c..bb7baf4bd8 100644 --- a/codex-rs/core/src/tools/spec_plan.rs +++ b/codex-rs/core/src/tools/spec_plan.rs @@ -85,6 +85,8 @@ use std::sync::Arc; use tracing::warn; const MULTI_AGENT_V2_NAMESPACE_DESCRIPTION: &str = "Tools for spawning and managing sub-agents."; +const IMAGE_GEN_NAMESPACE: &str = "image_gen"; +const IMAGEGEN_TOOL_NAME: &str = "imagegen"; type PlannedRuntime = Arc; @@ -257,7 +259,9 @@ fn hosted_model_tool_specs(context: &CoreToolPlanContext<'_>) -> Vec { }) { specs.push(web_search_tool); } - if image_generation_tool_enabled(turn_context) { + if image_generation_tool_enabled(turn_context) + && !standalone_image_generation_available(turn_context, context.extension_tool_executors) 
+ { specs.push(create_image_generation_tool("png")); } specs @@ -316,21 +320,41 @@ fn agent_jobs_worker_tools_enabled(turn_context: &TurnContext) -> bool { } fn image_generation_tool_enabled(turn_context: &TurnContext) -> bool { + image_generation_runtime_enabled(turn_context) + && turn_context + .features + .get() + .enabled(Feature::ImageGeneration) +} + +fn image_generation_runtime_enabled(turn_context: &TurnContext) -> bool { turn_context .auth_manager .as_deref() .is_some_and(AuthManager::current_auth_uses_codex_backend) && turn_context.provider.capabilities().image_generation - && turn_context - .features - .get() - .enabled(Feature::ImageGeneration) && turn_context .model_info .input_modalities .contains(&InputModality::Image) } +fn standalone_image_generation_model_visible(turn_context: &TurnContext) -> bool { + image_generation_runtime_enabled(turn_context) + && turn_context.features.get().enabled(Feature::ImageGenExt) + && namespace_tools_enabled(turn_context) +} + +fn standalone_image_generation_available( + turn_context: &TurnContext, + extension_tools: &[Arc>], +) -> bool { + standalone_image_generation_model_visible(turn_context) + && extension_tools.iter().any(|executor| { + executor.tool_name() == ToolName::namespaced(IMAGE_GEN_NAMESPACE, IMAGEGEN_TOOL_NAME) + }) +} + fn wait_agent_timeout_options(turn_context: &TurnContext) -> WaitAgentTimeoutOptions { if multi_agent_v2_enabled(turn_context) { return WaitAgentTimeoutOptions { @@ -839,6 +863,11 @@ fn append_extension_tool_executors( for executor in executors.iter().cloned() { let tool_name = executor.tool_name(); + if tool_name == ToolName::namespaced(IMAGE_GEN_NAMESPACE, IMAGEGEN_TOOL_NAME) + && !standalone_image_generation_model_visible(turn_context) + { + continue; + } if !reserved_tool_names.insert(tool_name.clone()) { warn!("Skipping extension tool `{tool_name}`: tool already registered"); continue; diff --git a/codex-rs/core/src/tools/spec_plan_tests.rs 
b/codex-rs/core/src/tools/spec_plan_tests.rs index 94587803b8..a274f934c4 100644 --- a/codex-rs/core/src/tools/spec_plan_tests.rs +++ b/codex-rs/core/src/tools/spec_plan_tests.rs @@ -960,6 +960,16 @@ async fn hosted_tools_follow_provider_auth_model_and_config_gates() { .await; image_generation.assert_visible_contains(&["image_generation"]); + let extension_flag_without_imagegen_tool = probe(|turn| { + use_chatgpt_auth(turn); + set_feature(turn, Feature::ImageGeneration, /*enabled*/ true); + set_feature(turn, Feature::ImageGenExt, /*enabled*/ true); + turn.model_info.input_modalities = vec![InputModality::Image]; + }) + .await; + extension_flag_without_imagegen_tool.assert_visible_contains(&["image_generation"]); + extension_flag_without_imagegen_tool.assert_visible_lacks(&["image_gen"]); + let live_web_search = probe(|turn| { set_web_search_mode(turn, WebSearchMode::Live); turn.model_info.web_search_tool_type = WebSearchToolType::TextAndImage; diff --git a/codex-rs/ext/image-generation/BUILD.bazel b/codex-rs/ext/image-generation/BUILD.bazel new file mode 100644 index 0000000000..5ed05a5dc8 --- /dev/null +++ b/codex-rs/ext/image-generation/BUILD.bazel @@ -0,0 +1,9 @@ +load("//:defs.bzl", "codex_rust_crate") + +codex_rust_crate( + name = "image-generation", + crate_name = "codex_image_generation_extension", + compile_data = [ + "imagegen_description.md", + ], +) diff --git a/codex-rs/ext/image-generation/Cargo.toml b/codex-rs/ext/image-generation/Cargo.toml new file mode 100644 index 0000000000..d4f2cb50de --- /dev/null +++ b/codex-rs/ext/image-generation/Cargo.toml @@ -0,0 +1,37 @@ +[package] +edition.workspace = true +license.workspace = true +name = "codex-image-generation-extension" +version.workspace = true + +[lib] +name = "codex_image_generation_extension" +path = "src/lib.rs" +doctest = false + +[lints] +workspace = true + +[dependencies] +async-trait = { workspace = true } +base64 = { workspace = true } +codex-api = { workspace = true } +codex-core = { 
workspace = true } +codex-extension-api = { workspace = true } +codex-features = { workspace = true } +codex-login = { workspace = true } +codex-model-provider = { workspace = true } +codex-model-provider-info = { workspace = true } +codex-protocol = { workspace = true } +codex-tools = { workspace = true } +http = { workspace = true } +schemars = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["fs"] } +tracing = { workspace = true } + +[dev-dependencies] +pretty_assertions = { workspace = true } +tempfile = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } diff --git a/codex-rs/ext/image-generation/imagegen_description.md b/codex-rs/ext/image-generation/imagegen_description.md new file mode 100644 index 0000000000..7ae6ecb5fa --- /dev/null +++ b/codex-rs/ext/image-generation/imagegen_description.md @@ -0,0 +1,11 @@ +The `image_gen.imagegen` tool enables image generation from descriptions and editing of existing images based on specific instructions. Use it when: + +- The user requests an image based on a scene description, such as a diagram, portrait, comic, meme, or any other visual. +- The user wants to modify an attached or previously generated image with specific changes, including adding or removing elements, altering colors, improving quality/resolution, or transforming the style (e.g., cartoon, oil painting). + +Guidelines: +- Set `action` to `generate` when the user asks for a brand new image. +- Set `action` to `edit` when the user asks to modify an existing image from the conversation history. +- Directly generate the image without reconfirmation or clarification. +- After each image generation, do not mention anything related to download. Do not summarize the image. Do not ask follow-up questions. Do not say ANYTHING after you generate an image. 
+- Always use this tool for image editing unless the user explicitly requests otherwise. Do not use the `python` tool for image editing unless specifically instructed. diff --git a/codex-rs/ext/image-generation/src/backend.rs b/codex-rs/ext/image-generation/src/backend.rs new file mode 100644 index 0000000000..9b837772c7 --- /dev/null +++ b/codex-rs/ext/image-generation/src/backend.rs @@ -0,0 +1,60 @@ +use codex_api::ImageEditRequest; +use codex_api::ImageGenerationRequest; +use codex_api::ImageResponse; +use codex_api::ImagesClient; +use codex_api::ReqwestTransport; +use codex_login::default_client::build_reqwest_client; +use codex_model_provider::SharedModelProvider; +use http::HeaderMap; + +#[derive(Clone)] +pub(crate) struct CodexImagesBackend { + provider: SharedModelProvider, +} + +impl CodexImagesBackend { + /// Creates a backend that sends image requests through the active model provider. + pub(crate) fn new(provider: SharedModelProvider) -> Self { + Self { provider } + } + + /// Resolves the provider and auth required for the current image API request. + async fn client(&self) -> Result, String> { + let provider = self + .provider + .api_provider() + .await + .map_err(|err| err.to_string())?; + let auth = self + .provider + .api_auth() + .await + .map_err(|err| err.to_string())?; + Ok(ImagesClient::new( + ReqwestTransport::new(build_reqwest_client()), + provider, + auth, + )) + } + + /// Sends a standalone image generation request through the configured Images client. + pub(crate) async fn generate( + &self, + request: ImageGenerationRequest, + ) -> Result { + self.client() + .await? + .generate(&request, HeaderMap::new()) + .await + .map_err(|err| err.to_string()) + } + + /// Sends a standalone image edit request through the configured Images client. + pub(crate) async fn edit(&self, request: ImageEditRequest) -> Result { + self.client() + .await? 
+ .edit(&request, HeaderMap::new()) + .await + .map_err(|err| err.to_string()) + } +} diff --git a/codex-rs/ext/image-generation/src/extension.rs b/codex-rs/ext/image-generation/src/extension.rs new file mode 100644 index 0000000000..8f0b09f8cb --- /dev/null +++ b/codex-rs/ext/image-generation/src/extension.rs @@ -0,0 +1,99 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use codex_core::config::Config; +use codex_extension_api::ConfigContributor; +use codex_extension_api::ExtensionData; +use codex_extension_api::ExtensionRegistryBuilder; +use codex_extension_api::ThreadLifecycleContributor; +use codex_extension_api::ThreadStartInput; +use codex_extension_api::ToolCall; +use codex_extension_api::ToolContributor; +use codex_extension_api::ToolExecutor; +use codex_features::Feature; +use codex_login::AuthManager; +use codex_model_provider::create_model_provider; +use codex_model_provider_info::ModelProviderInfo; + +use crate::backend::CodexImagesBackend; +use crate::tool::ImageGenerationTool; +use crate::tool::generated_image_output_dir; + +#[derive(Clone)] +struct ImageGenerationExtension { + auth_manager: Arc, +} + +#[derive(Clone)] +struct ImageGenerationExtensionConfig { + enabled: bool, + provider: ModelProviderInfo, + codex_home: PathBuf, +} + +impl From<&Config> for ImageGenerationExtensionConfig { + /// Resolves whether standalone image generation should be available for a thread. + fn from(config: &Config) -> Self { + Self { + enabled: config.features.enabled(Feature::ImageGenExt) + && config.model_provider.is_openai(), + provider: config.model_provider.clone(), + codex_home: config.codex_home.to_path_buf(), + } + } +} + +#[async_trait::async_trait] +impl ThreadLifecycleContributor for ImageGenerationExtension { + /// Seeds image-generation availability when a thread begins. 
+ async fn on_thread_start(&self, input: ThreadStartInput<'_, Config>) { + input + .thread_store + .insert(ImageGenerationExtensionConfig::from(input.config)); + } +} + +impl ConfigContributor for ImageGenerationExtension { + /// Refreshes image-generation availability after thread configuration changes. + fn on_config_changed( + &self, + _session_store: &ExtensionData, + thread_store: &ExtensionData, + _previous_config: &Config, + new_config: &Config, + ) { + thread_store.insert(ImageGenerationExtensionConfig::from(new_config)); + } +} + +impl ToolContributor for ImageGenerationExtension { + /// Creates the image-generation tool exposed by this installed extension. + fn tools( + &self, + _session_store: &ExtensionData, + thread_store: &ExtensionData, + ) -> Vec>> { + let Some(config) = thread_store.get::() else { + return Vec::new(); + }; + if !config.enabled || !self.auth_manager.current_auth_uses_codex_backend() { + return Vec::new(); + } + + vec![Arc::new(ImageGenerationTool::new( + CodexImagesBackend::new(create_model_provider( + config.provider.clone(), + Some(self.auth_manager.clone()), + )), + generated_image_output_dir(&config.codex_home, thread_store.level_id()), + ))] + } +} + +/// Installs the feature-gated standalone image-generation extension contributors. 
+pub fn install(registry: &mut ExtensionRegistryBuilder, auth_manager: Arc) { + let extension = Arc::new(ImageGenerationExtension { auth_manager }); + registry.thread_lifecycle_contributor(extension.clone()); + registry.config_contributor(extension.clone()); + registry.tool_contributor(extension); +} diff --git a/codex-rs/ext/image-generation/src/lib.rs b/codex-rs/ext/image-generation/src/lib.rs new file mode 100644 index 0000000000..63f1ab2482 --- /dev/null +++ b/codex-rs/ext/image-generation/src/lib.rs @@ -0,0 +1,8 @@ +mod backend; +mod extension; +mod tool; + +pub use extension::install; + +pub(crate) const IMAGE_GEN_NAMESPACE: &str = "image_gen"; +pub(crate) const IMAGEGEN_TOOL_NAME: &str = "imagegen"; diff --git a/codex-rs/ext/image-generation/src/tests.rs b/codex-rs/ext/image-generation/src/tests.rs new file mode 100644 index 0000000000..1b56270ed3 --- /dev/null +++ b/codex-rs/ext/image-generation/src/tests.rs @@ -0,0 +1,341 @@ +use codex_api::ImageBackground; +use codex_api::ImageEditRequest; +use codex_api::ImageGenerationRequest; +use codex_api::ImageQuality; +use codex_api::ImageUrl; +use codex_extension_api::ToolOutput; +use codex_extension_api::ToolPayload; +use codex_extension_api::ToolSpec; +use codex_protocol::models::ContentItem; +use codex_protocol::models::DEFAULT_IMAGE_DETAIL; +use codex_protocol::models::FunctionCallOutputBody; +use codex_protocol::models::FunctionCallOutputContentItem; +use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ResponseInputItem; +use codex_protocol::models::ResponseItem; +use codex_tools::ResponsesApiNamespaceTool; +use pretty_assertions::assert_eq; + +use super::GeneratedImageOutput; +use super::ImageRequest; +use super::ImagegenAction; +use super::ImagegenArgs; +use super::generated_image_output_dir; +use super::imagegen_tool_spec; +use super::persist_generated_image; +use super::request_for_action; +use crate::IMAGE_GEN_NAMESPACE; +use crate::IMAGEGEN_TOOL_NAME; + +const RESULT: 
&str = "cG5n"; + +#[test] +fn uses_reserved_image_gen_namespace() { + let ToolSpec::Namespace(spec) = imagegen_tool_spec() else { + panic!("imagegen should advertise a namespace tool"); + }; + assert_eq!(spec.name, IMAGE_GEN_NAMESPACE); + let ResponsesApiNamespaceTool::Function(function) = &spec.tools[0]; + assert_eq!(function.name, IMAGEGEN_TOOL_NAME); +} + +#[test] +fn generate_uses_fixed_request_defaults() { + assert_eq!( + request_for_action(&args(ImagegenAction::Generate, "paint a moonlit lake"), &[]) + .expect("generation request should build"), + ImageRequest::Generate(ImageGenerationRequest { + prompt: "paint a moonlit lake".to_string(), + background: Some(ImageBackground::Auto), + model: "gpt-image-2".to_string(), + n: None, + quality: Some(ImageQuality::Auto), + size: Some("auto".to_string()), + }) + ); +} + +#[tokio::test] +async fn generated_output_returns_image_input_and_persists_artifact() { + let tempdir = tempfile::tempdir().expect("tempdir"); + let output_hint = persist_generated_image(tempdir.path(), "call-1", RESULT) + .await + .expect("generated image should persist"); + let output = GeneratedImageOutput { + result: RESULT.to_string(), + output_hint: Some(output_hint), + }; + + let ResponseInputItem::FunctionCallOutput { + output: response_output, + .. 
+ } = output.to_response_item("call-1", &function_payload()) + else { + panic!("imagegen should return function tool output"); + }; + let FunctionCallOutputBody::ContentItems(content_items) = response_output.body else { + panic!("imagegen output should contain generated image bytes"); + }; + assert_eq!( + content_items, + vec![ + FunctionCallOutputContentItem::InputImage { + image_url: format!("data:image/png;base64,{RESULT}"), + detail: Some(DEFAULT_IMAGE_DETAIL), + }, + FunctionCallOutputContentItem::InputText { + text: format!( + "Generated images are saved to {} as {} by default.\n\ + If you need to use a generated image at another path, copy it and leave the original in place unless the user explicitly asks you to delete it.", + tempdir.path().display(), + tempdir.path().join("call-1.png").display(), + ), + }, + ] + ); + assert_eq!( + std::fs::read(tempdir.path().join("call-1.png")).expect("saved generated image"), + b"png" + ); +} + +#[test] +fn edit_matches_context_selector_for_generated_images_after_latest_user_anchor() { + let history = vec![ + generated_item("g1"), + generated_item("g2"), + generated_item("g3"), + ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ + ContentItem::InputImage { + image_url: "data:image/png;base64,u1".to_string(), + detail: None, + }, + ContentItem::InputImage { + image_url: "data:image/png;base64,u2".to_string(), + detail: None, + }, + ], + phase: None, + }, + generated_item("g4"), + generated_item("g5"), + generated_item("g6"), + generated_item("g7"), + ]; + + assert_eq!( + edit_request("change the lighting", &history), + expected_edit_request( + "change the lighting", + &[ + "data:image/png;base64,u1", + "data:image/png;base64,u2", + "data:image/png;base64,g5", + "data:image/png;base64,g6", + "data:image/png;base64,g7", + ] + ) + ); +} + +#[test] +fn edit_preserves_a_generated_image_when_user_anchor_fills_the_limit() { + let history = vec![ + ResponseItem::Message { + id: None, + role: 
"user".to_string(), + content: ["a", "b", "c", "d", "e"] + .into_iter() + .map(|image| ContentItem::InputImage { + image_url: format!("data:image/png;base64,{image}"), + detail: None, + }) + .collect(), + phase: None, + }, + generated_item("generated"), + ]; + + assert_eq!( + edit_request("edit the last generated image", &history), + expected_edit_request( + "edit the last generated image", + &[ + "data:image/png;base64,b", + "data:image/png;base64,c", + "data:image/png;base64,d", + "data:image/png;base64,e", + "data:image/png;base64,generated", + ] + ) + ); +} + +#[test] +fn edit_uses_latest_user_upload_before_a_text_only_follow_up() { + let history = vec![ + ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputImage { + image_url: "data:image/png;base64,user".to_string(), + detail: None, + }], + phase: None, + }, + ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputText { + text: "edit this image".to_string(), + }], + phase: None, + }, + ]; + + assert_eq!( + edit_request("change the lighting", &history), + expected_edit_request("change the lighting", &["data:image/png;base64,user"]) + ); +} + +#[test] +fn edit_reuses_images_from_prior_standalone_imagegen_calls() { + let history = vec![ + ResponseItem::FunctionCall { + id: None, + name: IMAGEGEN_TOOL_NAME.to_string(), + namespace: Some(IMAGE_GEN_NAMESPACE.to_string()), + arguments: "{}".to_string(), + call_id: "imagegen-1".to_string(), + }, + generated_function_output("imagegen-1", "standalone"), + ]; + + assert_eq!( + edit_request("change the lighting", &history), + expected_edit_request("change the lighting", &["data:image/png;base64,standalone"]) + ); +} + +#[test] +fn edit_keeps_newest_standalone_generated_images_when_over_limit() { + let history = (1..=6) + .flat_map(|index| { + let call_id = format!("imagegen-{index}"); + vec![ + ResponseItem::FunctionCall { + id: None, + name: IMAGEGEN_TOOL_NAME.to_string(), + 
namespace: Some(IMAGE_GEN_NAMESPACE.to_string()), + arguments: "{}".to_string(), + call_id: call_id.clone(), + }, + generated_function_output(&call_id, &index.to_string()), + ] + }) + .collect::>(); + + assert_eq!( + edit_request("change the lighting", &history), + expected_edit_request( + "change the lighting", + &[ + "data:image/png;base64,2", + "data:image/png;base64,3", + "data:image/png;base64,4", + "data:image/png;base64,5", + "data:image/png;base64,6", + ] + ) + ); +} + +#[test] +fn edit_without_image_history_returns_tool_error() { + let error = request_for_action(&args(ImagegenAction::Edit, "change the lighting"), &[]) + .expect_err("edit should require image context"); + + assert_eq!( + error.to_string(), + "image edit requested without any usable image in conversation history" + ); +} + +#[test] +fn generated_image_output_dir_is_scoped_to_sanitized_thread_id() { + assert_eq!( + generated_image_output_dir(std::path::Path::new("/tmp/codex-home"), "thread/1"), + std::path::PathBuf::from("/tmp/codex-home/generated_images/thread_1") + ); +} + +fn args(action: ImagegenAction, prompt: &str) -> ImagegenArgs { + ImagegenArgs { + prompt: prompt.to_string(), + action, + } +} + +fn edit_request(prompt: &str, history: &[ResponseItem]) -> ImageEditRequest { + let ImageRequest::Edit(request) = + request_for_action(&args(ImagegenAction::Edit, prompt), history) + .expect("edit request should build") + else { + panic!("expected edit request"); + }; + request +} + +fn expected_edit_request(prompt: &str, images: &[&str]) -> ImageEditRequest { + ImageEditRequest { + images: images + .iter() + .map(|image_url| ImageUrl { + image_url: (*image_url).to_string(), + }) + .collect(), + prompt: prompt.to_string(), + background: Some(ImageBackground::Auto), + model: "gpt-image-2".to_string(), + n: None, + quality: Some(ImageQuality::Auto), + size: Some("auto".to_string()), + } +} + +fn generated_item(result: &str) -> ResponseItem { + ResponseItem::ImageGenerationCall { + id: 
format!("id-{result}"), + status: "completed".to_string(), + revised_prompt: None, + result: result.to_string(), + } +} + +fn generated_function_output(call_id: &str, result: &str) -> ResponseItem { + ResponseItem::FunctionCallOutput { + call_id: call_id.to_string(), + output: FunctionCallOutputPayload { + body: FunctionCallOutputBody::ContentItems(vec![ + FunctionCallOutputContentItem::InputImage { + image_url: format!("data:image/png;base64,{result}"), + detail: Some(DEFAULT_IMAGE_DETAIL), + }, + FunctionCallOutputContentItem::InputText { + text: "generated image save hint".to_string(), + }, + ]), + success: Some(true), + }, + } +} + +fn function_payload() -> ToolPayload { + ToolPayload::Function { + arguments: "{}".to_string(), + } +} diff --git a/codex-rs/ext/image-generation/src/tool.rs b/codex-rs/ext/image-generation/src/tool.rs new file mode 100644 index 0000000000..fa1614cd47 --- /dev/null +++ b/codex-rs/ext/image-generation/src/tool.rs @@ -0,0 +1,395 @@ +use std::path::Path; +use std::path::PathBuf; + +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; +use codex_api::ImageBackground; +use codex_api::ImageEditRequest; +use codex_api::ImageGenerationRequest; +use codex_api::ImageQuality; +use codex_api::ImageUrl; +use codex_extension_api::FunctionCallError; +use codex_extension_api::ToolCall; +use codex_extension_api::ToolExecutor; +use codex_extension_api::ToolName; +use codex_extension_api::ToolOutput; +use codex_extension_api::ToolPayload; +use codex_extension_api::ToolSpec; +use codex_extension_api::parse_tool_input_schema; +use codex_protocol::models::ContentItem; +use codex_protocol::models::DEFAULT_IMAGE_DETAIL; +use codex_protocol::models::FunctionCallOutputBody; +use codex_protocol::models::FunctionCallOutputContentItem; +use codex_protocol::models::FunctionCallOutputPayload; +use codex_protocol::models::ResponseInputItem; +use codex_protocol::models::ResponseItem; +use codex_tools::ResponsesApiNamespace; +use 
codex_tools::ResponsesApiNamespaceTool; +use codex_tools::ResponsesApiTool; +use codex_tools::ToolExposure; +use codex_tools::default_namespace_description; +use schemars::JsonSchema; +use schemars::r#gen::SchemaSettings; +use serde::Deserialize; +use serde_json::Map; +use serde_json::Value; + +use crate::IMAGE_GEN_NAMESPACE; +use crate::IMAGEGEN_TOOL_NAME; +use crate::backend::CodexImagesBackend; + +const IMAGE_MODEL: &str = "gpt-image-2"; +const MAX_EDIT_IMAGES: usize = 5; +const IMAGEGEN_DESCRIPTION: &str = include_str!("../imagegen_description.md"); +const GENERATED_IMAGE_ARTIFACTS_DIR: &str = "generated_images"; + +#[derive(Clone)] +pub(crate) struct ImageGenerationTool { + backend: CodexImagesBackend, + output_dir: PathBuf, +} + +impl ImageGenerationTool { + /// Creates an image-generation tool backed by an image API executor. + pub(crate) fn new(backend: CodexImagesBackend, output_dir: PathBuf) -> Self { + Self { + backend, + output_dir, + } + } +} + +#[derive(Debug, Deserialize, JsonSchema)] +#[serde(deny_unknown_fields)] +struct ImagegenArgs { + prompt: String, + action: ImagegenAction, +} + +#[derive(Debug, Deserialize, JsonSchema)] +#[serde(rename_all = "lowercase")] +enum ImagegenAction { + Generate, + Edit, +} + +#[async_trait::async_trait] +impl ToolExecutor for ImageGenerationTool { + /// Keeps the tool in the existing image-generation Responses namespace. + fn tool_name(&self) -> ToolName { + ToolName::namespaced(IMAGE_GEN_NAMESPACE, IMAGEGEN_TOOL_NAME) + } + + /// Advertises the model contract: a rewritten prompt and semantic action. + fn spec(&self) -> ToolSpec { + imagegen_tool_spec() + } + + /// Keeps this model-facing tool out of the nested code-mode tool surface. + fn exposure(&self) -> ToolExposure { + ToolExposure::DirectModelOnly + } + + /// Executes the selected image operation and returns the completed image result. 
+ async fn handle(&self, call: ToolCall) -> Result, FunctionCallError> { + let args = parse_args(&call)?; + let request = request_for_action(&args, call.conversation_history.items())?; + + let response = match request { + ImageRequest::Generate(request) => self.backend.generate(request).await, + ImageRequest::Edit(request) => self.backend.edit(request).await, + } + .map_err(|err| { + FunctionCallError::RespondToModel(format!("image generation failed: {err}")) + })?; + let Some(result) = response.data.into_iter().next().map(|data| data.b64_json) else { + return Err(FunctionCallError::RespondToModel( + "image generation returned no image data".to_string(), + )); + }; + let output_hint = + match persist_generated_image(&self.output_dir, &call.call_id, &result).await { + Ok(output_hint) => Some(output_hint), + Err(err) => { + tracing::warn!( + call_id = %call.call_id, + output_dir = %self.output_dir.display(), + "failed to save generated image: {err}" + ); + None + } + }; + Ok(Box::new(GeneratedImageOutput { + result, + output_hint, + })) + } +} + +#[derive(Debug, PartialEq)] +enum ImageRequest { + Generate(ImageGenerationRequest), + Edit(ImageEditRequest), +} + +/// Maps the model-selected action to the fixed image API request parameters. 
+fn request_for_action( + args: &ImagegenArgs, + history: &[ResponseItem], +) -> Result { + match args.action { + ImagegenAction::Generate => Ok(ImageRequest::Generate(ImageGenerationRequest { + prompt: args.prompt.clone(), + background: Some(ImageBackground::Auto), + model: IMAGE_MODEL.to_string(), + n: None, + quality: Some(ImageQuality::Auto), + size: Some("auto".to_string()), + })), + ImagegenAction::Edit => { + let images = edit_images(history); + if images.is_empty() { + return Err(FunctionCallError::RespondToModel( + "image edit requested without any usable image in conversation history" + .to_string(), + )); + } + Ok(ImageRequest::Edit(ImageEditRequest { + images, + prompt: args.prompt.clone(), + background: Some(ImageBackground::Auto), + model: IMAGE_MODEL.to_string(), + n: None, + quality: Some(ImageQuality::Auto), + size: Some("auto".to_string()), + })) + } + } +} + +/// Selects edit context using the hosted imagegen anchor and truncation behavior. +fn edit_images(history: &[ResponseItem]) -> Vec { + let latest_uploaded_images = history.iter().enumerate().rev().find_map(|(index, item)| { + let ResponseItem::Message { role, content, .. } = item else { + return None; + }; + if role != "user" { + return None; + } + let images = content + .iter() + .filter_map(|item| match item { + ContentItem::InputImage { image_url, .. } => Some(ImageUrl { + image_url: image_url.clone(), + }), + ContentItem::InputText { .. } | ContentItem::OutputText { .. } => None, + }) + .collect::>(); + (!images.is_empty()).then_some((index, images)) + }); + let (user_images, follow_up_start) = latest_uploaded_images + .map_or_else(|| (Vec::new(), 0), |(index, images)| (images, index + 1)); + let mut generated_images = Vec::new(); + for item in &history[follow_up_start..] { + match item { + ResponseItem::ImageGenerationCall { result, .. 
} if !result.is_empty() => { + generated_images.push(ImageUrl { + image_url: format!("data:image/png;base64,{result}"), + }); + } + ResponseItem::FunctionCallOutput { call_id, output } + if history.iter().any(|item| { + matches!( + item, + ResponseItem::FunctionCall { + name, + namespace: Some(namespace), + call_id: function_call_id, + .. + } if function_call_id == call_id + && name == IMAGEGEN_TOOL_NAME + && namespace == IMAGE_GEN_NAMESPACE + ) + }) => + { + generated_images.extend(output.content_items().into_iter().flatten().filter_map( + |item| match item { + FunctionCallOutputContentItem::InputImage { image_url, .. } => { + Some(ImageUrl { + image_url: image_url.clone(), + }) + } + FunctionCallOutputContentItem::InputText { .. } + | FunctionCallOutputContentItem::EncryptedContent { .. } => None, + }, + )); + } + ResponseItem::Message { .. } + | ResponseItem::Reasoning { .. } + | ResponseItem::LocalShellCall { .. } + | ResponseItem::FunctionCall { .. } + | ResponseItem::ToolSearchCall { .. } + | ResponseItem::FunctionCallOutput { .. } + | ResponseItem::CustomToolCall { .. } + | ResponseItem::CustomToolCallOutput { .. } + | ResponseItem::ToolSearchOutput { .. } + | ResponseItem::WebSearchCall { .. } + | ResponseItem::ImageGenerationCall { .. } + | ResponseItem::Compaction { .. } + | ResponseItem::CompactionTrigger + | ResponseItem::ContextCompaction { .. } + | ResponseItem::Other => {} + } + } + truncate_images(user_images, generated_images) +} + +/// Truncates edit inputs while preserving the newest generated image when possible. 
+fn truncate_images( + mut user_images: Vec, + mut generated_images: Vec, +) -> Vec { + let mut excess = (user_images.len() + generated_images.len()).saturating_sub(MAX_EDIT_IMAGES); + let drop_generated = excess.min(generated_images.len().saturating_sub(1)); + generated_images.drain(..drop_generated); + excess -= drop_generated; + let drop_user = excess.min(user_images.len()); + user_images.drain(..drop_user); + excess -= drop_user; + generated_images.drain(..excess); + + user_images.extend(generated_images); + user_images +} + +/// Parses the strict model-facing arguments for an image-generation call. +fn parse_args(call: &ToolCall) -> Result { + serde_json::from_str(call.function_arguments()?) + .map_err(|err| FunctionCallError::RespondToModel(err.to_string())) +} + +/// Resolves where generated images for one thread are persisted by the extension. +pub(crate) fn generated_image_output_dir(codex_home: &Path, thread_id: &str) -> PathBuf { + codex_home + .join(GENERATED_IMAGE_ARTIFACTS_DIR) + .join(sanitize_path_component(thread_id)) +} + +fn generated_image_output_path(output_dir: &Path, call_id: &str) -> PathBuf { + output_dir.join(format!("{}.png", sanitize_path_component(call_id))) +} + +fn sanitize_path_component(value: &str) -> String { + let sanitized: String = value + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { + ch + } else { + '_' + } + }) + .collect(); + if sanitized.is_empty() { + "generated_image".to_string() + } else { + sanitized + } +} + +async fn persist_generated_image( + output_dir: &Path, + call_id: &str, + result: &str, +) -> Result { + let bytes = BASE64_STANDARD + .decode(result.trim().as_bytes()) + .map_err(|err| format!("invalid image generation payload: {err}"))?; + tokio::fs::create_dir_all(output_dir) + .await + .map_err(|err| err.to_string())?; + tokio::fs::write(generated_image_output_path(output_dir, call_id), bytes) + .await + .map_err(|err| err.to_string())?; + + Ok(format!( + "Generated 
images are saved to {} as {} by default.\n\ + If you need to use a generated image at another path, copy it and leave the original in place unless the user explicitly asks you to delete it.", + output_dir.display(), + generated_image_output_path(output_dir, call_id).display(), + )) +} + +/// Builds the namespace function schema exposed to the model. +fn imagegen_tool_spec() -> ToolSpec { + let mut schema_value = serde_json::to_value( + SchemaSettings::draft2019_09() + .with(|settings| settings.inline_subschemas = true) + .into_generator() + .into_root_schema_for::(), + ) + .unwrap_or_else(|err| panic!("imagegen schema should serialize: {err}")); + let Value::Object(ref mut schema) = schema_value else { + unreachable!("imagegen root schema must be an object"); + }; + let mut input_schema = Map::new(); + for key in ["properties", "required", "type", "additionalProperties"] { + if let Some(value) = schema.remove(key) { + input_schema.insert(key.to_string(), value); + } + } + ToolSpec::Namespace(ResponsesApiNamespace { + name: IMAGE_GEN_NAMESPACE.to_string(), + description: default_namespace_description(IMAGE_GEN_NAMESPACE), + tools: vec![ResponsesApiNamespaceTool::Function(ResponsesApiTool { + name: IMAGEGEN_TOOL_NAME.to_string(), + description: IMAGEGEN_DESCRIPTION.to_string(), + strict: false, + parameters: parse_tool_input_schema(&Value::Object(input_schema)) + .unwrap_or_else(|err| panic!("imagegen input schema should parse: {err}")), + output_schema: None, + defer_loading: None, + })], + }) +} + +struct GeneratedImageOutput { + result: String, + output_hint: Option, +} + +impl ToolOutput for GeneratedImageOutput { + /// Avoids copying image bytes into tool-call telemetry. + fn log_preview(&self) -> String { + "[generated image]".to_string() + } + + /// Reports a completed images request as successful tool execution. 
+ fn success_for_logging(&self) -> bool { + true + } + + /// Returns generated bytes and persisted-artifact context for the model's follow-up response. + fn to_response_item(&self, call_id: &str, _payload: &ToolPayload) -> ResponseInputItem { + let mut content = vec![FunctionCallOutputContentItem::InputImage { + image_url: format!("data:image/png;base64,{}", self.result), + detail: Some(DEFAULT_IMAGE_DETAIL), + }]; + if let Some(output_hint) = &self.output_hint { + content.push(FunctionCallOutputContentItem::InputText { + text: output_hint.clone(), + }); + } + ResponseInputItem::FunctionCallOutput { + call_id: call_id.to_string(), + output: FunctionCallOutputPayload { + body: FunctionCallOutputBody::ContentItems(content), + success: Some(true), + }, + } + } +} + +#[cfg(test)] +#[path = "tests.rs"] +mod tests; diff --git a/codex-rs/features/src/lib.rs b/codex-rs/features/src/lib.rs index 1b0bd15486..116db0a5b5 100644 --- a/codex-rs/features/src/lib.rs +++ b/codex-rs/features/src/lib.rs @@ -170,6 +170,8 @@ pub enum Feature { ExternalMigration, /// Allow the model to invoke the built-in image generation tool. ImageGeneration, + /// Replace hosted image generation with the standalone image-generation extension. + ImageGenExt, /// Allow prompting and installing missing MCP dependencies. SkillMcpDependencyInstall, /// Removed compatibility flag for deleted skill env var dependency prompting. 
@@ -1053,6 +1055,12 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::Stable, default_enabled: true, }, + FeatureSpec { + id: Feature::ImageGenExt, + key: "imagegenext", + stage: Stage::UnderDevelopment, + default_enabled: false, + }, FeatureSpec { id: Feature::SkillMcpDependencyInstall, key: "skill_mcp_dependency_install", diff --git a/codex-rs/features/src/tests.rs b/codex-rs/features/src/tests.rs index b8cd2fd5c6..5d7087e90b 100644 --- a/codex-rs/features/src/tests.rs +++ b/codex-rs/features/src/tests.rs @@ -244,6 +244,13 @@ fn image_generation_is_stable_and_enabled_by_default() { assert_eq!(Feature::ImageGeneration.default_enabled(), true); } +#[test] +fn image_generation_extension_is_under_development_and_disabled_by_default() { + assert_eq!(Feature::ImageGenExt.stage(), Stage::UnderDevelopment); + assert_eq!(Feature::ImageGenExt.default_enabled(), false); + assert_eq!(feature_for_key("imagegenext"), Some(Feature::ImageGenExt)); +} + #[test] fn use_legacy_landlock_config_records_deprecation_notice() { let mut entries = BTreeMap::new();