Support multimodal custom tool outputs (#12948)

## Summary

This changes `custom_tool_call_output` to use the same output payload
shape as `function_call_output`, so freeform tools can return either
plain text or structured content items.

The main goal is to let `js_repl` return image content from nested
`view_image` calls in its own `custom_tool_call_output`, instead of
relying on a separate injected message.

## What changed

- Changed `custom_tool_call_output.output` from `string` to
`FunctionCallOutputPayload`
- Updated freeform tool plumbing to preserve structured output bodies
- Updated `js_repl` to aggregate nested tool content items and attach
them to the outer `js_repl` result
- Removed the old `js_repl` special case that injected `view_image`
results as a separate pending user image message
- Updated normalization/history/truncation paths to handle multimodal
`custom_tool_call_output`
- Regenerated app-server protocol schema artifacts

## Behavior

Direct `view_image` calls still return a `function_call_output` with
image content.

When `view_image` is called inside `js_repl`, the outer `js_repl`
`custom_tool_call_output` now carries:
- an `input_text` item if the JS produced text output
- one or more `input_image` items from nested tool results

So the nested image result now stays inside the `js_repl` tool output
instead of being injected as a separate message.

## Compatibility

This is intended to be backward-compatible for resumed conversations.

Older histories that stored `custom_tool_call_output.output` as a plain
string still deserialize correctly, and older histories that used the
previous injected-image-message flow also continue to resume.

Added regression coverage for resuming a pre-change rollout containing:
- string-valued `custom_tool_call_output`
- legacy injected image message history


#### [git stack](https://github.com/magus/git-stack-cli)
- 👉 `1` https://github.com/openai/codex/pull/12948
This commit is contained in:
Curtis 'Fjord' Hawthorne
2026-02-26 18:17:46 -08:00
committed by GitHub
parent f90e97e414
commit 7e980d7db6
20 changed files with 688 additions and 177 deletions

View File

@@ -35,6 +35,10 @@ use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::openai_models::ReasoningEffort;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::Op;
use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::RolloutLine;
use codex_protocol::protocol::SessionMeta;
use codex_protocol::protocol::SessionMetaLine;
use codex_protocol::protocol::SessionSource;
use codex_protocol::user_input::UserInput;
use core_test_support::apps_test_server::AppsTestServer;
@@ -344,6 +348,144 @@ async fn resume_includes_initial_messages_and_sends_prior_items() {
assert!(pos_environment < pos_new_user);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn resume_replays_legacy_js_repl_image_rollout_shapes() {
skip_if_no_network!();
// Early js_repl builds persisted image tool results as two separate rollout items:
// a string-valued custom_tool_call_output plus a standalone user input_image message.
// Current image tests cover today's shapes; this keeps resume compatibility for that
// legacy rollout representation.
let legacy_custom_tool_call = ResponseItem::CustomToolCall {
id: None,
status: None,
call_id: "legacy-js-call".to_string(),
name: "js_repl".to_string(),
input: "console.log('legacy image flow')".to_string(),
};
let legacy_image_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
let rollout = vec![
RolloutLine {
timestamp: "2024-01-01T00:00:00.000Z".to_string(),
item: RolloutItem::SessionMeta(SessionMetaLine {
meta: SessionMeta {
id: ThreadId::default(),
timestamp: "2024-01-01T00:00:00Z".to_string(),
cwd: ".".into(),
originator: "test_originator".to_string(),
cli_version: "test_version".to_string(),
model_provider: Some("test-provider".to_string()),
..Default::default()
},
git: None,
}),
},
RolloutLine {
timestamp: "2024-01-01T00:00:01.000Z".to_string(),
item: RolloutItem::ResponseItem(legacy_custom_tool_call),
},
RolloutLine {
timestamp: "2024-01-01T00:00:02.000Z".to_string(),
item: RolloutItem::ResponseItem(ResponseItem::CustomToolCallOutput {
call_id: "legacy-js-call".to_string(),
output: FunctionCallOutputPayload::from_text("legacy js_repl stdout".to_string()),
}),
},
RolloutLine {
timestamp: "2024-01-01T00:00:03.000Z".to_string(),
item: RolloutItem::ResponseItem(ResponseItem::Message {
id: None,
role: "user".to_string(),
content: vec![ContentItem::InputImage {
image_url: legacy_image_url.to_string(),
}],
end_turn: None,
phase: None,
}),
},
];
let tmpdir = TempDir::new().unwrap();
let session_path = tmpdir
.path()
.join("resume-legacy-js-repl-image-rollout.jsonl");
let mut f = std::fs::File::create(&session_path).unwrap();
for line in rollout {
writeln!(f, "{}", serde_json::to_string(&line).unwrap()).unwrap();
}
let server = MockServer::start().await;
let resp_mock = mount_sse_once(
&server,
sse(vec![ev_response_created("resp1"), ev_completed("resp1")]),
)
.await;
let codex_home = Arc::new(TempDir::new().unwrap());
let mut builder = test_codex().with_model("gpt-5.1");
let test = builder
.resume(&server, codex_home, session_path.clone())
.await
.expect("resume conversation");
test.submit_turn("after resume").await.unwrap();
let input = resp_mock.single_request().input();
let legacy_output_index = input
.iter()
.position(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("custom_tool_call_output")
&& item.get("call_id").and_then(|value| value.as_str()) == Some("legacy-js-call")
})
.expect("legacy custom tool output should be replayed");
assert_eq!(
input[legacy_output_index]
.get("output")
.and_then(|value| value.as_str()),
Some("legacy js_repl stdout")
);
let legacy_image_index = input
.iter()
.position(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("message")
&& item.get("role").and_then(|value| value.as_str()) == Some("user")
&& item
.get("content")
.and_then(|value| value.as_array())
.is_some_and(|content| {
content.iter().any(|entry| {
entry.get("type").and_then(|value| value.as_str())
== Some("input_image")
&& entry.get("image_url").and_then(|value| value.as_str())
== Some(legacy_image_url)
})
})
})
.expect("legacy injected image message should be replayed");
let new_user_index = input
.iter()
.position(|item| {
item.get("type").and_then(|value| value.as_str()) == Some("message")
&& item.get("role").and_then(|value| value.as_str()) == Some("user")
&& item
.get("content")
.and_then(|value| value.as_array())
.is_some_and(|content| {
content.iter().any(|entry| {
entry.get("type").and_then(|value| value.as_str()) == Some("input_text")
&& entry.get("text").and_then(|value| value.as_str())
== Some("after resume")
})
})
})
.expect("new user message should be present");
assert!(legacy_output_index < new_user_index);
assert!(legacy_image_index < new_user_index);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn includes_conversation_id_and_model_headers_in_request() {
skip_if_no_network!();
@@ -1565,7 +1707,7 @@ async fn azure_responses_request_includes_store_and_reasoning_ids() {
});
prompt.input.push(ResponseItem::CustomToolCallOutput {
call_id: "custom-tool-call-id".into(),
output: "ok".into(),
output: FunctionCallOutputPayload::from_text("ok".into()),
});
let mut stream = client_session

View File

@@ -358,40 +358,26 @@ console.log(out.output?.body?.text ?? "");
.await;
let req = mock.single_request();
let (js_repl_output, js_repl_success) = req
.custom_tool_call_output_content_and_success(call_id)
.expect("custom tool output present");
let js_repl_output = js_repl_output.expect("custom tool output text present");
assert_ne!(
js_repl_success,
Some(false),
"js_repl call failed unexpectedly: {js_repl_output}"
let body = req.body_json();
assert_eq!(
image_messages(&body).len(),
0,
"js_repl view_image should not inject a pending input image message"
);
let body = req.body_json();
let image_messages = image_messages(&body);
assert_eq!(
image_messages.len(),
1,
"js_repl view_image should inject exactly one pending input image message"
);
let image_message = image_messages
.into_iter()
.next()
.expect("pending input image message not included in request");
let image_url = image_message
.get("content")
let custom_output = req.custom_tool_call_output(call_id);
let output_items = custom_output
.get("output")
.and_then(Value::as_array)
.and_then(|content| {
content.iter().find_map(|span| {
if span.get("type").and_then(Value::as_str) == Some("input_image") {
span.get("image_url").and_then(Value::as_str)
} else {
None
}
})
.expect("custom_tool_call_output should be a content item array");
let image_url = output_items
.iter()
.find_map(|item| {
(item.get("type").and_then(Value::as_str) == Some("input_image"))
.then(|| item.get("image_url").and_then(Value::as_str))
.flatten()
})
.expect("image_url present");
.expect("image_url present in js_repl custom tool output");
assert!(
image_url.starts_with("data:image/png;base64,"),
"expected png data URL, got {image_url}"