Route extension image generation through the native image completion pipeline (#24972)

## Why

The standalone `image_gen.imagegen` extension should behave like native
image generation for artifact persistence and UI completion, while
returning its save-location guidance as part of the tool result instead
of injecting a developer message.

## What Changed

- Added an image-generation completion hook for extension tools so core
can persist generated images and emit the existing `ImageGeneration`
lifecycle events.
- Reused core image artifact persistence for extension output and
removed extension-local save-path/file-writing logic.
- Split shared image persistence from built-in finalization so native
image generation keeps its existing developer-message instruction
behavior.
- Returned the generated image save-location instruction through the
extension `FunctionCallOutput`, alongside the generated image input for
model follow-up.
- Preserved the existing image-generation event shape for current UI and
replay compatibility.
- Avoided cloning the full generated-image base64 payload when emitting
the in-progress image item.
- Removed dependencies no longer needed after moving persistence out of
the extension crate.

## Fast Follow
- Adjust the existing Extension API and add a general `TurnItem`
finalization path so the finalization code can be reused.

## Validation

- Ran `just fmt`.
- Ran `just bazel-lock-update`.
- Ran `just bazel-lock-check`.
- Ran `just test -p codex-tools -p codex-extension-api -p
codex-image-generation-extension`.
- Ran `just test -p codex-core
image_generation_publication_is_finalized_by_core`.
- Ran `just test -p codex-core
handle_output_item_done_records_image_save_history_message`.
- Ran `just fix -p codex-tools -p codex-extension-api -p codex-core -p
codex-image-generation-extension`.
This commit is contained in:
Won Park
2026-05-29 10:33:13 -07:00
committed by GitHub
parent 3e666dd32a
commit 10b0399034
10 changed files with 272 additions and 168 deletions

4
codex-rs/Cargo.lock generated
View File

@@ -3033,7 +3033,6 @@ name = "codex-image-generation-extension"
version = "0.0.0"
dependencies = [
"async-trait",
"base64 0.22.1",
"codex-api",
"codex-core",
"codex-extension-api",
@@ -3048,9 +3047,6 @@ dependencies = [
"schemars 0.8.22",
"serde",
"serde_json",
"tempfile",
"tokio",
"tracing",
]
[[package]]

View File

@@ -5,6 +5,7 @@ use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_extension_api::ExtensionData;
use codex_protocol::config_types::ModeKind;
use codex_protocol::items::ImageGenerationItem;
use codex_protocol::items::TurnItem;
use codex_utils_stream_parser::strip_citations;
use tokio_util::sync::CancellationToken;
@@ -125,6 +126,68 @@ async fn save_image_generation_result(
Ok(path)
}
/// Saves the payload in `image_item.result` as an image artifact for this session.
///
/// On success the saved path is stored in `image_item.saved_path` and also
/// returned. On failure a warning is logged with the call id and the directory
/// the artifact would have been written to, `image_item` is left unchanged,
/// and `None` is returned.
pub(crate) async fn persist_image_generation_item(
    sess: &Session,
    turn_context: &TurnContext,
    image_item: &mut ImageGenerationItem,
) -> Option<AbsolutePathBuf> {
    let codex_home = &turn_context.config.codex_home;
    let session_id = sess.conversation_id.to_string();

    let save_outcome =
        save_image_generation_result(codex_home, &session_id, &image_item.id, &image_item.result)
            .await;
    let err = match save_outcome {
        Ok(saved_path) => {
            image_item.saved_path = Some(saved_path.clone());
            return Some(saved_path);
        }
        Err(err) => err,
    };

    // Saving failed: recompute the intended artifact location purely so the
    // warning can report which directory was being written to.
    let attempted_path = image_generation_artifact_path(codex_home, &session_id, &image_item.id);
    let output_dir = attempted_path
        .parent()
        .unwrap_or_else(|| codex_home.clone());
    tracing::warn!(
        call_id = %image_item.id,
        output_dir = %output_dir.display(),
        "failed to save generated image: {err}"
    );
    None
}
/// Persists a generated image and, if that succeeds, records the save-location
/// instructions as a conversation item.
///
/// Nothing is recorded when persistence fails. The recorded instructions use
/// the literal `<image_id>` placeholder as the artifact id, so they describe
/// the path pattern rather than the path of this particular image.
pub(crate) async fn finalize_image_generation_item(
    sess: &Session,
    turn_context: &TurnContext,
    image_item: &mut ImageGenerationItem,
) {
    let persisted = persist_image_generation_item(sess, turn_context, image_item).await;
    if persisted.is_none() {
        return;
    }

    let codex_home = &turn_context.config.codex_home;
    let session_id = sess.conversation_id.to_string();
    let template_path = image_generation_artifact_path(codex_home, &session_id, "<image_id>");
    let template_dir = template_path
        .parent()
        .unwrap_or_else(|| codex_home.clone());

    let instructions =
        ImageGenerationInstructions::new(template_dir.display(), template_path.display());
    let message: ResponseItem = ContextualUserFragment::into(instructions);
    sess.record_conversation_items(turn_context, &[message])
        .await;
}
/// Persist a completed model response item and record any cited memory usage.
pub(crate) async fn record_completed_response_item(
sess: &Session,
@@ -487,49 +550,7 @@ pub(crate) async fn handle_non_tool_response_item(
}
}
if let TurnItem::ImageGeneration(image_item) = &mut turn_item {
let session_id = sess.conversation_id.to_string();
match save_image_generation_result(
&turn_context.config.codex_home,
&session_id,
&image_item.id,
&image_item.result,
)
.await
{
Ok(path) => {
image_item.saved_path = Some(path);
let image_output_path = image_generation_artifact_path(
&turn_context.config.codex_home,
&session_id,
"<image_id>",
);
let image_output_dir = image_output_path
.parent()
.unwrap_or_else(|| turn_context.config.codex_home.clone());
let message: ResponseItem =
ContextualUserFragment::into(ImageGenerationInstructions::new(
image_output_dir.display(),
image_output_path.display(),
));
sess.record_conversation_items(turn_context, &[message])
.await;
}
Err(err) => {
let output_path = image_generation_artifact_path(
&turn_context.config.codex_home,
&session_id,
&image_item.id,
);
let output_dir = output_path
.parent()
.unwrap_or_else(|| turn_context.config.codex_home.clone());
tracing::warn!(
call_id = %image_item.id,
output_dir = %output_dir.display(),
"failed to save generated image: {err}"
);
}
}
finalize_image_generation_item(sess, turn_context, image_item).await;
}
Some(turn_item)
}

View File

@@ -4,15 +4,19 @@ use std::sync::Weak;
use codex_protocol::items::TurnItem;
use codex_tools::ConversationHistory;
use codex_tools::ExtensionTurnItem;
use codex_tools::ImageGenerationCompletionFuture;
use codex_tools::ToolCall as ExtensionToolCall;
use codex_tools::ToolName;
use codex_tools::ToolSpec;
use codex_tools::TurnItemEmissionFuture;
use codex_tools::TurnItemEmitter;
use crate::context::ContextualUserFragment;
use crate::context::ImageGenerationInstructions;
use crate::function_tool::FunctionCallError;
use crate::session::session::Session;
use crate::session::turn_context::TurnContext;
use crate::stream_events_utils::persist_image_generation_item;
use crate::tools::context::ToolInvocation;
use crate::tools::context::ToolOutput;
use crate::tools::context::ToolPayload;
@@ -90,6 +94,50 @@ impl TurnItemEmitter for CoreTurnItemEmitter {
session.emit_turn_item_completed(turn.as_ref(), item).await;
})
}
/// Persists extension-generated image bytes and emits the image-generation
/// turn item lifecycle (started, then completed).
///
/// Yields `None` without emitting anything when the session or turn has
/// already been dropped. Otherwise yields the body of the save-location
/// instructions when the image was saved, or `None` when saving failed; the
/// lifecycle events are emitted in both cases.
fn image_generation_completed<'a>(
    &'a self,
    call_id: String,
    prompt: String,
    result: String,
) -> ImageGenerationCompletionFuture<'a> {
    Box::pin(async move {
        let (Some(session), Some(turn)) = (self.session.upgrade(), self.turn.upgrade()) else {
            return None;
        };

        let mut completed_item = codex_protocol::items::ImageGenerationItem {
            id: call_id,
            status: "completed".to_string(),
            revised_prompt: Some(prompt),
            result,
            saved_path: None,
        };

        // Persist first so the completed item carries its saved path.
        let saved_path =
            persist_image_generation_item(session.as_ref(), turn.as_ref(), &mut completed_item)
                .await;
        let output_hint = saved_path.map(|path| {
            let dir = path
                .parent()
                .unwrap_or_else(|| turn.config.codex_home.clone());
            ImageGenerationInstructions::new(dir.display(), path.display()).body()
        });

        // The in-progress item is built from scratch with an empty result, so
        // the image payload itself is never cloned.
        let in_progress_item = codex_protocol::items::ImageGenerationItem {
            id: completed_item.id.clone(),
            status: "in_progress".to_string(),
            revised_prompt: None,
            result: String::new(),
            saved_path: None,
        };
        let started = TurnItem::ImageGeneration(in_progress_item);
        session.emit_turn_item_started(turn.as_ref(), &started).await;

        let completed = TurnItem::ImageGeneration(completed_item);
        session
            .emit_turn_item_completed(turn.as_ref(), completed)
            .await;

        output_hint
    })
}
}
async fn to_extension_call(invocation: &ToolInvocation) -> ExtensionToolCall {
@@ -352,4 +400,130 @@ mod tests {
assert_eq!(end.query, expected.query);
assert_eq!(end.action, expected.action);
}
/// Test-only extension executor that publishes a fixed generated image through
/// the call's turn-item emitter and records the hint it gets back.
struct ImageGenerationExtensionExecutor {
// Shared slot where `handle` stores the value returned by
// `image_generation_completed`, so the test can assert on it afterwards.
output_hint: Arc<Mutex<Option<String>>>,
}
#[async_trait::async_trait]
impl codex_extension_api::ToolExecutor<codex_tools::ToolCall> for ImageGenerationExtensionExecutor {
    /// Registers under the `image_gen` namespace as `imagegen`.
    fn tool_name(&self) -> codex_tools::ToolName {
        codex_tools::ToolName::namespaced("image_gen", "imagegen")
    }

    /// Minimal function-tool spec; only the name and description are meaningful here.
    fn spec(&self) -> codex_tools::ToolSpec {
        let tool = codex_tools::ResponsesApiTool {
            name: "imagegen".to_string(),
            description: "Generates an image.".to_string(),
            strict: false,
            parameters: codex_tools::JsonSchema::default(),
            output_schema: None,
            defer_loading: None,
        };
        codex_tools::ToolSpec::Function(tool)
    }

    /// Publishes a fixed prompt and base64 payload via the emitter, stores the
    /// returned hint in `output_hint`, and answers with `{ "ok": true }`.
    async fn handle(
        &self,
        call: codex_tools::ToolCall,
    ) -> Result<Box<dyn codex_tools::ToolOutput>, codex_tools::FunctionCallError> {
        let prompt = "A tiny blue square".to_string();
        let image_base64 = "cG5n".to_string();
        let hint = call
            .turn_item_emitter
            .image_generation_completed(call.call_id, prompt, image_base64)
            .await;
        *self.output_hint.lock().await = hint;

        let output = codex_tools::JsonToolOutput::new(json!({ "ok": true }));
        Ok(Box::new(output))
    }
}
/// Runs the image-generation test executor through `ExtensionToolAdapter` and
/// checks that core emits the image-generation lifecycle events, saves the
/// artifact at the expected path, and returns the save-location hint.
#[tokio::test]
async fn image_generation_publication_is_finalized_by_core() {
// Slot shared with the executor so the hint it receives can be inspected below.
let output_hint = Arc::new(Mutex::new(None));
let handler = ExtensionToolAdapter::new(Arc::new(ImageGenerationExtensionExecutor {
output_hint: Arc::clone(&output_hint),
}));
let (session, turn, rx) = crate::session::tests::make_session_and_context_with_rx().await;
// Path the artifact is expected to be saved at for call id "call-image".
let expected_path = crate::stream_events_utils::image_generation_artifact_path(
&turn.config.codex_home,
&session.conversation_id.to_string(),
"call-image",
);
let invocation = ToolInvocation {
session,
turn,
cancellation_token: tokio_util::sync::CancellationToken::new(),
tracker: Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new())),
call_id: "call-image".to_string(),
tool_name: codex_tools::ToolName::namespaced("image_gen", "imagegen"),
source: ToolCallSource::Direct,
payload: ToolPayload::Function {
arguments: "{}".to_string(),
},
};
crate::tools::registry::ToolExecutor::handle(&handler, invocation)
.await
.expect("extension call should succeed");
// Events are read in the order the test expects them:
// ItemStarted, ImageGenerationBegin, ItemCompleted, ImageGenerationEnd.
let started = rx.recv().await.expect("item started event");
let EventMsg::ItemStarted(started) = started.msg else {
panic!("expected item started event");
};
let TurnItem::ImageGeneration(started_item) = started.item else {
panic!("expected image generation item");
};
let begin = rx.recv().await.expect("legacy image start event");
assert!(matches!(begin.msg, EventMsg::ImageGenerationBegin(_)));
let completed = rx.recv().await.expect("item completed event");
let EventMsg::ItemCompleted(completed) = completed.msg else {
panic!("expected item completed event");
};
let TurnItem::ImageGeneration(completed_item) = completed.item else {
panic!("expected image generation item");
};
let end = rx.recv().await.expect("legacy image end event");
assert!(matches!(end.msg, EventMsg::ImageGenerationEnd(_)));
// The started item carries only the id and status: no prompt, payload, or path.
assert_eq!(
started_item,
codex_protocol::items::ImageGenerationItem {
id: "call-image".to_string(),
status: "in_progress".to_string(),
revised_prompt: None,
result: String::new(),
saved_path: None,
}
);
// The completed item carries the prompt, the payload, and the saved path.
assert_eq!(
completed_item,
codex_protocol::items::ImageGenerationItem {
id: "call-image".to_string(),
status: "completed".to_string(),
revised_prompt: Some("A tiny blue square".to_string()),
result: "cG5n".to_string(),
saved_path: Some(expected_path.clone()),
}
);
// The file on disk holds the decoded payload ("cG5n" is base64 for "png").
assert_eq!(
std::fs::read(&expected_path).expect("generated artifact should be saved"),
b"png"
);
// The hint handed back to the executor names the saved file and its directory.
assert_eq!(
*output_hint.lock().await,
Some(format!(
"Generated images are saved to {} as {} by default.\n\
If you need to use a generated image at another path, copy it and leave the original in place unless the user explicitly asks you to delete it.",
expected_path
.parent()
.expect("generated image path should have a parent")
.display(),
expected_path.display(),
))
);
}
}

View File

@@ -13,6 +13,7 @@ pub use capabilities::ResponseItemInjector;
pub use codex_tools::ConversationHistory;
pub use codex_tools::ExtensionTurnItem;
pub use codex_tools::FunctionCallError;
pub use codex_tools::ImageGenerationCompletionFuture;
pub use codex_tools::JsonToolOutput;
pub use codex_tools::NoopTurnItemEmitter;
pub use codex_tools::ResponsesApiTool;

View File

@@ -14,7 +14,6 @@ workspace = true
[dependencies]
async-trait = { workspace = true }
base64 = { workspace = true }
codex-api = { workspace = true }
codex-core = { workspace = true }
codex-extension-api = { workspace = true }
@@ -28,10 +27,6 @@ http = { workspace = true }
schemars = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
tokio = { workspace = true, features = ["fs"] }
tracing = { workspace = true }
[dev-dependencies]
pretty_assertions = { workspace = true }
tempfile = { workspace = true }
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }

View File

@@ -1,4 +1,3 @@
use std::path::PathBuf;
use std::sync::Arc;
use codex_core::config::Config;
@@ -17,7 +16,6 @@ use codex_model_provider_info::ModelProviderInfo;
use crate::backend::CodexImagesBackend;
use crate::tool::ImageGenerationTool;
use crate::tool::generated_image_output_dir;
#[derive(Clone)]
struct ImageGenerationExtension {
@@ -28,7 +26,6 @@ struct ImageGenerationExtension {
struct ImageGenerationExtensionConfig {
enabled: bool,
provider: ModelProviderInfo,
codex_home: PathBuf,
}
impl From<&Config> for ImageGenerationExtensionConfig {
@@ -38,7 +35,6 @@ impl From<&Config> for ImageGenerationExtensionConfig {
enabled: config.features.enabled(Feature::ImageGenExt)
&& config.model_provider.is_openai(),
provider: config.model_provider.clone(),
codex_home: config.codex_home.to_path_buf(),
}
}
}
@@ -80,13 +76,9 @@ impl ToolContributor for ImageGenerationExtension {
return Vec::new();
}
vec![Arc::new(ImageGenerationTool::new(
CodexImagesBackend::new(create_model_provider(
config.provider.clone(),
Some(self.auth_manager.clone()),
)),
generated_image_output_dir(&config.codex_home, thread_store.level_id()),
))]
vec![Arc::new(ImageGenerationTool::new(CodexImagesBackend::new(
create_model_provider(config.provider.clone(), Some(self.auth_manager.clone())),
)))]
}
}

View File

@@ -20,14 +20,13 @@ use super::GeneratedImageOutput;
use super::ImageRequest;
use super::ImagegenAction;
use super::ImagegenArgs;
use super::generated_image_output_dir;
use super::imagegen_tool_spec;
use super::persist_generated_image;
use super::request_for_action;
use crate::IMAGE_GEN_NAMESPACE;
use crate::IMAGEGEN_TOOL_NAME;
const RESULT: &str = "cG5n";
const OUTPUT_HINT: &str = "Generated images are saved to /tmp as /tmp/call-1.png by default.";
#[test]
fn uses_reserved_image_gen_namespace() {
@@ -55,15 +54,11 @@ fn generate_uses_fixed_request_defaults() {
);
}
#[tokio::test]
async fn generated_output_returns_image_input_and_persists_artifact() {
let tempdir = tempfile::tempdir().expect("tempdir");
let output_hint = persist_generated_image(tempdir.path(), "call-1", RESULT)
.await
.expect("generated image should persist");
#[test]
fn generated_output_returns_image_input_and_output_hint() {
let output = GeneratedImageOutput {
result: RESULT.to_string(),
output_hint: Some(output_hint),
output_hint: Some(OUTPUT_HINT.to_string()),
};
let ResponseInputItem::FunctionCallOutput {
@@ -84,19 +79,10 @@ async fn generated_output_returns_image_input_and_persists_artifact() {
detail: Some(DEFAULT_IMAGE_DETAIL),
},
FunctionCallOutputContentItem::InputText {
text: format!(
"Generated images are saved to {} as {} by default.\n\
If you need to use a generated image at another path, copy it and leave the original in place unless the user explicitly asks you to delete it.",
tempdir.path().display(),
tempdir.path().join("call-1.png").display(),
),
text: OUTPUT_HINT.to_string(),
},
]
);
assert_eq!(
std::fs::read(tempdir.path().join("call-1.png")).expect("saved generated image"),
b"png"
);
}
#[test]
@@ -265,14 +251,6 @@ fn edit_without_image_history_returns_tool_error() {
);
}
#[test]
fn generated_image_output_dir_is_scoped_to_sanitized_thread_id() {
assert_eq!(
generated_image_output_dir(std::path::Path::new("/tmp/codex-home"), "thread/1"),
std::path::PathBuf::from("/tmp/codex-home/generated_images/thread_1")
);
}
fn args(action: ImagegenAction, prompt: &str) -> ImagegenArgs {
ImagegenArgs {
prompt: prompt.to_string(),

View File

@@ -1,8 +1,3 @@
use std::path::Path;
use std::path::PathBuf;
use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_api::ImageBackground;
use codex_api::ImageEditRequest;
use codex_api::ImageGenerationRequest;
@@ -41,21 +36,16 @@ use crate::backend::CodexImagesBackend;
const IMAGE_MODEL: &str = "gpt-image-2";
const MAX_EDIT_IMAGES: usize = 5;
const IMAGEGEN_DESCRIPTION: &str = include_str!("../imagegen_description.md");
const GENERATED_IMAGE_ARTIFACTS_DIR: &str = "generated_images";
#[derive(Clone)]
pub(crate) struct ImageGenerationTool {
backend: CodexImagesBackend,
output_dir: PathBuf,
}
impl ImageGenerationTool {
/// Creates an image-generation tool backed by an image API executor.
pub(crate) fn new(backend: CodexImagesBackend, output_dir: PathBuf) -> Self {
Self {
backend,
output_dir,
}
pub(crate) fn new(backend: CodexImagesBackend) -> Self {
Self { backend }
}
}
@@ -94,7 +84,6 @@ impl ToolExecutor<ToolCall> for ImageGenerationTool {
async fn handle(&self, call: ToolCall) -> Result<Box<dyn ToolOutput>, FunctionCallError> {
let args = parse_args(&call)?;
let request = request_for_action(&args, call.conversation_history.items())?;
let response = match request {
ImageRequest::Generate(request) => self.backend.generate(request).await,
ImageRequest::Edit(request) => self.backend.edit(request).await,
@@ -107,18 +96,10 @@ impl ToolExecutor<ToolCall> for ImageGenerationTool {
"image generation returned no image data".to_string(),
));
};
let output_hint =
match persist_generated_image(&self.output_dir, &call.call_id, &result).await {
Ok(output_hint) => Some(output_hint),
Err(err) => {
tracing::warn!(
call_id = %call.call_id,
output_dir = %self.output_dir.display(),
"failed to save generated image: {err}"
);
None
}
};
let output_hint = call
.turn_item_emitter
.image_generation_completed(call.call_id.clone(), args.prompt, result.clone())
.await;
Ok(Box::new(GeneratedImageOutput {
result,
output_hint,
@@ -268,58 +249,6 @@ fn parse_args(call: &ToolCall) -> Result<ImagegenArgs, FunctionCallError> {
.map_err(|err| FunctionCallError::RespondToModel(err.to_string()))
}
/// Returns the directory in which the extension stores one thread's generated
/// images: `codex_home`, then the generated-images artifacts directory, then
/// the sanitized `thread_id`.
pub(crate) fn generated_image_output_dir(codex_home: &Path, thread_id: &str) -> PathBuf {
    let mut output_dir = codex_home.join(GENERATED_IMAGE_ARTIFACTS_DIR);
    output_dir.push(sanitize_path_component(thread_id));
    output_dir
}
/// Returns the `.png` file path inside `output_dir` for the sanitized `call_id`.
fn generated_image_output_path(output_dir: &Path, call_id: &str) -> PathBuf {
    let file_name = format!("{}.png", sanitize_path_component(call_id));
    output_dir.join(file_name)
}
/// Makes `value` safe to use as a single path component.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character is
/// replaced by `_`. An empty input yields `"generated_image"`.
fn sanitize_path_component(value: &str) -> String {
    // Each input character maps to exactly one output character, so the
    // result is empty only when the input is empty.
    if value.is_empty() {
        return "generated_image".to_string();
    }

    let mut sanitized = String::with_capacity(value.len());
    for ch in value.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_');
        sanitized.push(if keep { ch } else { '_' });
    }
    sanitized
}
/// Decodes a base64 image payload and writes it into `output_dir` as a `.png`
/// named after the sanitized `call_id`.
///
/// Returns the save-location hint text on success. Returns a message string
/// when the payload is not valid base64, or when creating the directory or
/// writing the file fails.
async fn persist_generated_image(
    output_dir: &Path,
    call_id: &str,
    result: &str,
) -> Result<String, String> {
    let output_path = generated_image_output_path(output_dir, call_id);

    // Surrounding whitespace is trimmed before decoding.
    let bytes = BASE64_STANDARD
        .decode(result.trim().as_bytes())
        .map_err(|err| format!("invalid image generation payload: {err}"))?;

    tokio::fs::create_dir_all(output_dir)
        .await
        .map_err(|err| err.to_string())?;
    tokio::fs::write(&output_path, bytes)
        .await
        .map_err(|err| err.to_string())?;

    Ok(format!(
        "Generated images are saved to {} as {} by default.\n\
         If you need to use a generated image at another path, copy it and leave the original in place unless the user explicitly asks you to delete it.",
        output_dir.display(),
        output_path.display(),
    ))
}
/// Builds the namespace function schema exposed to the model.
fn imagegen_tool_spec() -> ToolSpec {
let mut schema_value = serde_json::to_value(
@@ -369,7 +298,7 @@ impl ToolOutput for GeneratedImageOutput {
true
}
/// Returns generated bytes and persisted-artifact context for the model's follow-up response.
/// Returns generated bytes and persisted-artifact context for model follow-up.
fn to_response_item(&self, call_id: &str, _payload: &ToolPayload) -> ResponseInputItem {
let mut content = vec![FunctionCallOutputContentItem::InputImage {
image_url: format!("data:image/png;base64,{}", self.result),

View File

@@ -63,6 +63,7 @@ pub use responses_api::mcp_tool_to_responses_api_tool;
pub use responses_api::tool_definition_to_responses_api_tool;
pub use tool_call::ConversationHistory;
pub use tool_call::ExtensionTurnItem;
pub use tool_call::ImageGenerationCompletionFuture;
pub use tool_call::NoopTurnItemEmitter;
pub use tool_call::ToolCall;
pub use tool_call::TurnItemEmissionFuture;

View File

@@ -29,6 +29,10 @@ impl ConversationHistory {
/// Future returned when an extension tool emits a visible turn-item lifecycle event.
pub type TurnItemEmissionFuture<'a> = Pin<Box<dyn Future<Output = ()> + Send + 'a>>;
/// Future returned when an image-generation extension publishes completed image bytes.
///
/// It resolves to an optional string. As described on
/// `TurnItemEmitter::image_generation_completed`, that string is the
/// persisted-artifact context for the extension's model-facing function
/// output; the trait's default implementation resolves to `None`.
pub type ImageGenerationCompletionFuture<'a> =
Pin<Box<dyn Future<Output = Option<String>> + Send + 'a>>;
/// Visible turn items that an extension fully owns and may emit as-is.
///
/// Add only item kinds that require no additional host finalization before
@@ -48,6 +52,19 @@ pub trait TurnItemEmitter: Send + Sync {
/// Emits the completion of one visible turn item.
fn emit_completed<'a>(&'a self, item: ExtensionTurnItem) -> TurnItemEmissionFuture<'a>;
/// Hands completed image bytes to the host for persistence and visible completion.
///
/// When the host saves the generated image successfully, the returned future
/// yields persisted-artifact context for the extension's model-facing
/// function output. This default implementation ignores its arguments and
/// yields `None`.
fn image_generation_completed<'a>(
    &'a self,
    _call_id: String,
    _prompt: String,
    _result: String,
) -> ImageGenerationCompletionFuture<'a> {
    Box::pin(async { None })
}
}
/// Turn-item emitter used when a caller does not expose visible item emission.