Support multimodal custom tool outputs (#12948)

## Summary

This changes `custom_tool_call_output` to use the same output payload shape as `function_call_output`, so freeform tools can return either plain text or structured content items.

The main goal is to let `js_repl` return image content from nested `view_image` calls in its own `custom_tool_call_output`, instead of relying on a separate injected message.

## What changed

- Changed `custom_tool_call_output.output` from `string` to `FunctionCallOutputPayload`
- Updated freeform tool plumbing to preserve structured output bodies
- Updated `js_repl` to aggregate nested tool content items and attach them to the outer `js_repl` result
- Removed the old `js_repl` special case that injected `view_image` results as a separate pending user image message
- Updated normalization/history/truncation paths to handle multimodal `custom_tool_call_output`
- Regenerated app-server protocol schema artifacts

## Behavior

Direct `view_image` calls still return a `function_call_output` with image content.

When `view_image` is called inside `js_repl`, the outer `js_repl` `custom_tool_call_output` now carries:

- an `input_text` item if the JS produced text output
- one or more `input_image` items from nested tool results

So the nested image result now stays inside the `js_repl` tool output instead of being injected as a separate message.

## Compatibility

This is intended to be backward-compatible for resumed conversations. Older histories that stored `custom_tool_call_output.output` as a plain string still deserialize correctly, and older histories that used the previous injected-image-message flow also continue to resume.

Added regression coverage for resuming a pre-change rollout containing:

- string-valued `custom_tool_call_output`
- legacy injected image message history

#### [git stack](https://github.com/magus/git-stack-cli)

- 👉 `1` https://github.com/openai/codex/pull/12948
2026-04-28 16:45:54 +00:00 · 2026-02-26 18:17:46 -08:00
parent f90e97e414
commit 7e980d7db6
20 changed files with 688 additions and 177 deletions
--- a/codex-rs/core/tests/suite/client.rs
+++ b/codex-rs/core/tests/suite/client.rs
@@ -35,6 +35,10 @@ use codex_protocol::openai_models::ModelsResponse;
 use codex_protocol::openai_models::ReasoningEffort;
 use codex_protocol::protocol::EventMsg;
 use codex_protocol::protocol::Op;
+use codex_protocol::protocol::RolloutItem;
+use codex_protocol::protocol::RolloutLine;
+use codex_protocol::protocol::SessionMeta;
+use codex_protocol::protocol::SessionMetaLine;
 use codex_protocol::protocol::SessionSource;
 use codex_protocol::user_input::UserInput;
 use core_test_support::apps_test_server::AppsTestServer;
@@ -344,6 +348,144 @@ async fn resume_includes_initial_messages_and_sends_prior_items() {
    assert!(pos_environment < pos_new_user);
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn resume_replays_legacy_js_repl_image_rollout_shapes() {
+    skip_if_no_network!();
+
+    // Early js_repl builds persisted image tool results as two separate rollout items:
+    // a string-valued custom_tool_call_output plus a standalone user input_image message.
+    // Current image tests cover today's shapes; this keeps resume compatibility for that
+    // legacy rollout representation.
+    let legacy_custom_tool_call = ResponseItem::CustomToolCall {
+        id: None,
+        status: None,
+        call_id: "legacy-js-call".to_string(),
+        name: "js_repl".to_string(),
+        input: "console.log('legacy image flow')".to_string(),
+    };
+    let legacy_image_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
+    let rollout = vec![
+        RolloutLine {
+            timestamp: "2024-01-01T00:00:00.000Z".to_string(),
+            item: RolloutItem::SessionMeta(SessionMetaLine {
+                meta: SessionMeta {
+                    id: ThreadId::default(),
+                    timestamp: "2024-01-01T00:00:00Z".to_string(),
+                    cwd: ".".into(),
+                    originator: "test_originator".to_string(),
+                    cli_version: "test_version".to_string(),
+                    model_provider: Some("test-provider".to_string()),
+                    ..Default::default()
+                },
+                git: None,
+            }),
+        },
+        RolloutLine {
+            timestamp: "2024-01-01T00:00:01.000Z".to_string(),
+            item: RolloutItem::ResponseItem(legacy_custom_tool_call),
+        },
+        RolloutLine {
+            timestamp: "2024-01-01T00:00:02.000Z".to_string(),
+            item: RolloutItem::ResponseItem(ResponseItem::CustomToolCallOutput {
+                call_id: "legacy-js-call".to_string(),
+                output: FunctionCallOutputPayload::from_text("legacy js_repl stdout".to_string()),
+            }),
+        },
+        RolloutLine {
+            timestamp: "2024-01-01T00:00:03.000Z".to_string(),
+            item: RolloutItem::ResponseItem(ResponseItem::Message {
+                id: None,
+                role: "user".to_string(),
+                content: vec![ContentItem::InputImage {
+                    image_url: legacy_image_url.to_string(),
+                }],
+                end_turn: None,
+                phase: None,
+            }),
+        },
+    ];
+
+    let tmpdir = TempDir::new().unwrap();
+    let session_path = tmpdir
+        .path()
+        .join("resume-legacy-js-repl-image-rollout.jsonl");
+    let mut f = std::fs::File::create(&session_path).unwrap();
+    for line in rollout {
+        writeln!(f, "{}", serde_json::to_string(&line).unwrap()).unwrap();
+    }
+
+    let server = MockServer::start().await;
+    let resp_mock = mount_sse_once(
+        &server,
+        sse(vec![ev_response_created("resp1"), ev_completed("resp1")]),
+    )
+    .await;
+
+    let codex_home = Arc::new(TempDir::new().unwrap());
+    let mut builder = test_codex().with_model("gpt-5.1");
+    let test = builder
+        .resume(&server, codex_home, session_path.clone())
+        .await
+        .expect("resume conversation");
+    test.submit_turn("after resume").await.unwrap();
+
+    let input = resp_mock.single_request().input();
+
+    let legacy_output_index = input
+        .iter()
+        .position(|item| {
+            item.get("type").and_then(|value| value.as_str()) == Some("custom_tool_call_output")
+                && item.get("call_id").and_then(|value| value.as_str()) == Some("legacy-js-call")
+        })
+        .expect("legacy custom tool output should be replayed");
+    assert_eq!(
+        input[legacy_output_index]
+            .get("output")
+            .and_then(|value| value.as_str()),
+        Some("legacy js_repl stdout")
+    );
+
+    let legacy_image_index = input
+        .iter()
+        .position(|item| {
+            item.get("type").and_then(|value| value.as_str()) == Some("message")
+                && item.get("role").and_then(|value| value.as_str()) == Some("user")
+                && item
+                    .get("content")
+                    .and_then(|value| value.as_array())
+                    .is_some_and(|content| {
+                        content.iter().any(|entry| {
+                            entry.get("type").and_then(|value| value.as_str())
+                                == Some("input_image")
+                                && entry.get("image_url").and_then(|value| value.as_str())
+                                    == Some(legacy_image_url)
+                        })
+                    })
+        })
+        .expect("legacy injected image message should be replayed");
+
+    let new_user_index = input
+        .iter()
+        .position(|item| {
+            item.get("type").and_then(|value| value.as_str()) == Some("message")
+                && item.get("role").and_then(|value| value.as_str()) == Some("user")
+                && item
+                    .get("content")
+                    .and_then(|value| value.as_array())
+                    .is_some_and(|content| {
+                        content.iter().any(|entry| {
+                            entry.get("type").and_then(|value| value.as_str()) == Some("input_text")
+                                && entry.get("text").and_then(|value| value.as_str())
+                                    == Some("after resume")
+                        })
+                    })
+        })
+        .expect("new user message should be present");
+
+    assert!(legacy_output_index < new_user_index);
+    assert!(legacy_image_index < new_user_index);
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn includes_conversation_id_and_model_headers_in_request() {
    skip_if_no_network!();
@@ -1565,7 +1707,7 @@ async fn azure_responses_request_includes_store_and_reasoning_ids() {
    });
    prompt.input.push(ResponseItem::CustomToolCallOutput {
        call_id: "custom-tool-call-id".into(),
-        output: "ok".into(),
+        output: FunctionCallOutputPayload::from_text("ok".into()),
    });

    let mut stream = client_session
--- a/codex-rs/core/tests/suite/view_image.rs
+++ b/codex-rs/core/tests/suite/view_image.rs
@@ -358,40 +358,26 @@ console.log(out.output?.body?.text ?? "");
    .await;

    let req = mock.single_request();
-    let (js_repl_output, js_repl_success) = req
-        .custom_tool_call_output_content_and_success(call_id)
-        .expect("custom tool output present");
-    let js_repl_output = js_repl_output.expect("custom tool output text present");
-    assert_ne!(
-        js_repl_success,
-        Some(false),
-        "js_repl call failed unexpectedly: {js_repl_output}"
+    let body = req.body_json();
+    assert_eq!(
+        image_messages(&body).len(),
+        0,
+        "js_repl view_image should not inject a pending input image message"
    );

-    let body = req.body_json();
-    let image_messages = image_messages(&body);
-    assert_eq!(
-        image_messages.len(),
-        1,
-        "js_repl view_image should inject exactly one pending input image message"
-    );
-    let image_message = image_messages
-        .into_iter()
-        .next()
-        .expect("pending input image message not included in request");
-    let image_url = image_message
-        .get("content")
+    let custom_output = req.custom_tool_call_output(call_id);
+    let output_items = custom_output
+        .get("output")
        .and_then(Value::as_array)
-        .and_then(|content| {
-            content.iter().find_map(|span| {
-                if span.get("type").and_then(Value::as_str) == Some("input_image") {
-                    span.get("image_url").and_then(Value::as_str)
-                } else {
-                    None
-                }
-            })
+        .expect("custom_tool_call_output should be a content item array");
+    let image_url = output_items
+        .iter()
+        .find_map(|item| {
+            (item.get("type").and_then(Value::as_str) == Some("input_image"))
+                .then(|| item.get("image_url").and_then(Value::as_str))
+                .flatten()
        })
-        .expect("image_url present");
+        .expect("image_url present in js_repl custom tool output");
    assert!(
        image_url.starts_with("data:image/png;base64,"),
        "expected png data URL, got {image_url}"