Support multimodal custom tool outputs (#12948)

## Summary This changes `custom_tool_call_output` to use the same output payload shape as `function_call_output`, so freeform tools can return either plain text or structured content items. The main goal is to let `js_repl` return image content from nested `view_image` calls in its own `custom_tool_call_output`, instead of relying on a separate injected message. ## What changed - Changed `custom_tool_call_output.output` from `string` to `FunctionCallOutputPayload` - Updated freeform tool plumbing to preserve structured output bodies - Updated `js_repl` to aggregate nested tool content items and attach them to the outer `js_repl` result - Removed the old `js_repl` special case that injected `view_image` results as a separate pending user image message - Updated normalization/history/truncation paths to handle multimodal `custom_tool_call_output` - Regenerated app-server protocol schema artifacts ## Behavior Direct `view_image` calls still return a `function_call_output` with image content. When `view_image` is called inside `js_repl`, the outer `js_repl` `custom_tool_call_output` now carries: - an `input_text` item if the JS produced text output - one or more `input_image` items from nested tool results So the nested image result now stays inside the `js_repl` tool output instead of being injected as a separate message. ## Compatibility This is intended to be backward-compatible for resumed conversations. Older histories that stored `custom_tool_call_output.output` as a plain string still deserialize correctly, and older histories that used the previous injected-image-message flow also continue to resume. Added regression coverage for resuming a pre-change rollout containing: - string-valued `custom_tool_call_output` - legacy injected image message history #### [git stack](https://github.com/magus/git-stack-cli) - 👉 `1` https://github.com/openai/codex/pull/12948
2026-04-29 08:56:38 +00:00 · 2026-02-26 18:17:46 -08:00
parent f90e97e414
commit 7e980d7db6
20 changed files with 688 additions and 177 deletions
--- a/codex-rs/core/src/client_common.rs
+++ b/codex-rs/core/src/client_common.rs
@@ -84,19 +84,13 @@ fn reserialize_shell_outputs(items: &mut [ResponseItem]) {
                shell_call_ids.insert(call_id.clone());
            }
        }
-        ResponseItem::CustomToolCallOutput { call_id, output } => {
-            if shell_call_ids.remove(call_id)
-                && let Some(structured) = parse_structured_shell_output(output)
-            {
-                *output = structured
-            }
-        }
        ResponseItem::FunctionCall { name, call_id, .. }
            if is_shell_tool_name(name) || name == "apply_patch" =>
        {
            shell_call_ids.insert(call_id.clone());
        }
-        ResponseItem::FunctionCallOutput { call_id, output } => {
+        ResponseItem::FunctionCallOutput { call_id, output }
+        | ResponseItem::CustomToolCallOutput { call_id, output } => {
            if shell_call_ids.remove(call_id)
                && let Some(structured) = output
                    .text_content()
@@ -240,6 +234,7 @@ mod tests {
    use codex_api::common::OpenAiVerbosity;
    use codex_api::common::TextControls;
    use codex_api::create_text_param_for_request;
+    use codex_protocol::models::FunctionCallOutputPayload;
    use pretty_assertions::assert_eq;

    use super::*;
@@ -343,4 +338,62 @@ mod tests {
        let v = serde_json::to_value(&req).expect("json");
        assert!(v.get("text").is_none());
    }
+
+    #[test]
+    fn reserializes_shell_outputs_for_function_and_custom_tool_calls() {
+        let raw_output = r#"{"output":"hello","metadata":{"exit_code":0,"duration_seconds":0.5}}"#;
+        let expected_output = "Exit code: 0\nWall time: 0.5 seconds\nOutput:\nhello";
+        let mut items = vec![
+            ResponseItem::FunctionCall {
+                id: None,
+                name: "shell".to_string(),
+                arguments: "{}".to_string(),
+                call_id: "call-1".to_string(),
+            },
+            ResponseItem::FunctionCallOutput {
+                call_id: "call-1".to_string(),
+                output: FunctionCallOutputPayload::from_text(raw_output.to_string()),
+            },
+            ResponseItem::CustomToolCall {
+                id: None,
+                status: None,
+                call_id: "call-2".to_string(),
+                name: "apply_patch".to_string(),
+                input: "*** Begin Patch".to_string(),
+            },
+            ResponseItem::CustomToolCallOutput {
+                call_id: "call-2".to_string(),
+                output: FunctionCallOutputPayload::from_text(raw_output.to_string()),
+            },
+        ];
+
+        reserialize_shell_outputs(&mut items);
+
+        assert_eq!(
+            items,
+            vec![
+                ResponseItem::FunctionCall {
+                    id: None,
+                    name: "shell".to_string(),
+                    arguments: "{}".to_string(),
+                    call_id: "call-1".to_string(),
+                },
+                ResponseItem::FunctionCallOutput {
+                    call_id: "call-1".to_string(),
+                    output: FunctionCallOutputPayload::from_text(expected_output.to_string()),
+                },
+                ResponseItem::CustomToolCall {
+                    id: None,
+                    status: None,
+                    call_id: "call-2".to_string(),
+                    name: "apply_patch".to_string(),
+                    input: "*** Begin Patch".to_string(),
+                },
+                ResponseItem::CustomToolCallOutput {
+                    call_id: "call-2".to_string(),
+                    output: FunctionCallOutputPayload::from_text(expected_output.to_string()),
+                },
+            ]
+        );
+    }
 }