Fix: Render MCP image outputs regardless of ordering (#9815)

## What? - Render an MCP image output cell whenever a decodable image block exists in `CallToolResult.content` (including text-before-image or malformed image before valid image). ## Why? - Tool results that include caption text before the image currently drop the image output cell. - A malformed image block can also suppress later valid image output. ## How? - Iterate `content` and return the first successfully decoded image instead of only checking the first block. - Add unit tests that cover text-before-image ordering and invalid-image-before-valid. ## Before ```rust let image = match result { Ok(mcp_types::CallToolResult { content, .. }) => { if let Some(mcp_types::ContentBlock::ImageContent(image)) = content.first() { // decode image (fails -> None) } else { None } } _ => None, }?; ``` ## After ```rust let image = result .as_ref() .ok()? .content .iter() .find_map(decode_mcp_image)?; ``` ## Risk / Impact - Low: only affects image cell creation for MCP tool results; no change for non-image outputs. ## Tests - [x] `just fmt` - [x] `cargo test -p codex-tui` - [x] Rerun after branch update (2026-01-27): `just fmt`, `cargo test -p codex-tui` Manual testing # Manual testing: MCP image tool result rendering (Codex TUI) # Build the rmcp stdio test server binary: cd codex-rs cargo build -p codex-rmcp-client --bin test_stdio_server # Register the server as an MCP server (absolute path to the built binary): codex mcp add mcpimg -- /Users/joshka/code/codex-pr-review/codex-rs/target/debug/test_stdio_server # Then in Codex TUI, ask it to call: - mcpimg.image_scenario({"scenario":"image_only"}) - mcpimg.image_scenario({"scenario":"text_then_image","caption":"Here is the image:"}) - mcpimg.image_scenario({"scenario":"invalid_base64_then_image"}) - mcpimg.image_scenario({"scenario":"invalid_image_bytes_then_image"}) - mcpimg.image_scenario({"scenario":"multiple_valid_images"}) - mcpimg.image_scenario({"scenario":"image_then_text","caption":"Here is the image:"}) - mcpimg.image_scenario({"scenario":"text_only","caption":"Here is the image:"}) # Expected: # - You should see an extra history cell: "tool result (image output)" when the # tool result contains at least one decodable image block (even if earlier # blocks are text or invalid images). Fixes #9814 --------- Co-authored-by: Josh McKinney <joshka@openai.com>
2026-02-01 22:47:52 +00:00 · 2026-01-28 08:14:08 +11:00
parent 28051d18c6
commit 337643b00a
2 changed files with 296 additions and 42 deletions
--- a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
+++ b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
@@ -35,12 +35,19 @@ struct TestToolServer {

 const MEMO_URI: &str = "memo://codex/example-note";
 const MEMO_CONTENT: &str = "This is a sample MCP resource served by the rmcp test server.";
+const SMALL_PNG_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
+
 pub fn stdio() -> (tokio::io::Stdin, tokio::io::Stdout) {
    (tokio::io::stdin(), tokio::io::stdout())
 }
+
 impl TestToolServer {
    fn new() -> Self {
-        let tools = vec![Self::echo_tool(), Self::image_tool()];
+        let tools = vec![
+            Self::echo_tool(),
+            Self::image_tool(),
+            Self::image_scenario_tool(),
+        ];
        let resources = vec![Self::memo_resource()];
        let resource_templates = vec![Self::memo_template()];
        Self {
@@ -86,6 +93,61 @@ impl TestToolServer {
        )
    }

+    /// Tool intended for manual testing of Codex TUI rendering for MCP image tool results.
+    ///
+    /// This exists to exercise edge cases where a `CallToolResult.content` includes image blocks
+    /// that aren't the first item (or includes invalid image blocks before a valid image).
+    ///
+    /// Manual testing approach (Codex TUI):
+    /// - Build this binary: `cargo build -p codex-rmcp-client --bin test_stdio_server`
+    /// - Register it:
+    ///   - `codex mcp add mcpimg -- /abs/path/to/test_stdio_server`
+    /// - Then in Codex TUI, ask it to call:
+    ///   - `mcpimg.image_scenario({"scenario":"image_only"})`
+    ///   - `mcpimg.image_scenario({"scenario":"text_then_image","caption":"Here is the image:"})`
+    ///   - `mcpimg.image_scenario({"scenario":"invalid_base64_then_image"})`
+    ///   - `mcpimg.image_scenario({"scenario":"invalid_image_bytes_then_image"})`
+    ///   - `mcpimg.image_scenario({"scenario":"multiple_valid_images"})`
+    ///   - `mcpimg.image_scenario({"scenario":"image_then_text","caption":"Here is the image:"})`
+    ///   - `mcpimg.image_scenario({"scenario":"text_only","caption":"Here is the image:"})`
+    /// - You should see an extra history cell: `tool result (image output)`.
+    fn image_scenario_tool() -> Tool {
+        #[expect(clippy::expect_used)]
+        let schema: JsonObject = serde_json::from_value(serde_json::json!({
+            "type": "object",
+            "properties": {
+                "scenario": {
+                    "type": "string",
+                    "enum": [
+                        "image_only",
+                        "text_then_image",
+                        "invalid_base64_then_image",
+                        "invalid_image_bytes_then_image",
+                        "multiple_valid_images",
+                        "image_then_text",
+                        "text_only"
+                    ]
+                },
+                "caption": { "type": "string" },
+                "data_url": {
+                    "type": "string",
+                    "description": "Optional data URL like data:image/png;base64,AAAA...; if omitted, uses a built-in tiny PNG."
+                }
+            },
+            "required": ["scenario"],
+            "additionalProperties": false
+        }))
+        .expect("image_scenario tool schema should deserialize");
+
+        Tool::new(
+            Cow::Borrowed("image_scenario"),
+            Cow::Borrowed(
+                "Return content blocks for manual testing of MCP image rendering scenarios.",
+            ),
+            Arc::new(schema),
+        )
+    }
+
    fn memo_resource() -> Resource {
        let raw = RawResource {
            uri: MEMO_URI.to_string(),
@@ -125,6 +187,32 @@ struct EchoArgs {
    env_var: Option<String>,
 }

+#[derive(Deserialize, Debug)]
+#[serde(rename_all = "snake_case")]
+/// Scenarios for `image_scenario`, intended to exercise Codex TUI handling of MCP image outputs.
+///
+/// The key behavior under test is that the TUI should render an image output cell if *any*
+/// decodable image block exists in the tool result content, even if the first block is text or an
+/// invalid image.
+enum ImageScenario {
+    ImageOnly,
+    TextThenImage,
+    InvalidBase64ThenImage,
+    InvalidImageBytesThenImage,
+    MultipleValidImages,
+    ImageThenText,
+    TextOnly,
+}
+
+#[derive(Deserialize, Debug)]
+struct ImageScenarioArgs {
+    scenario: ImageScenario,
+    #[serde(default)]
+    caption: Option<String>,
+    #[serde(default)]
+    data_url: Option<String>,
+}
+
 impl ServerHandler for TestToolServer {
    fn get_info(&self) -> ServerInfo {
        ServerInfo {
@@ -244,14 +332,6 @@ impl ServerHandler for TestToolServer {
                    )
                })?;

-                fn parse_data_url(url: &str) -> Option<(String, String)> {
-                    let rest = url.strip_prefix("data:")?;
-                    let (mime_and_opts, data) = rest.split_once(',')?;
-                    let (mime, _opts) =
-                        mime_and_opts.split_once(';').unwrap_or((mime_and_opts, ""));
-                    Some((mime.to_string(), data.to_string()))
-                }
-
                let (mime_type, data_b64) = parse_data_url(&data_url).ok_or_else(|| {
                    McpError::invalid_params(
                        format!("invalid data URL for image tool: {data_url}"),
@@ -263,6 +343,10 @@ impl ServerHandler for TestToolServer {
                    data_b64, mime_type,
                )]))
            }
+            "image_scenario" => {
+                let args = Self::parse_call_args::<ImageScenarioArgs>(&request, "image_scenario")?;
+                Self::image_scenario_result(args)
+            }
            other => Err(McpError::invalid_params(
                format!("unknown tool: {other}"),
                None,
@@ -271,6 +355,89 @@ impl ServerHandler for TestToolServer {
    }
 }

+impl TestToolServer {
+    fn parse_call_args<T: for<'de> Deserialize<'de>>(
+        request: &CallToolRequestParam,
+        tool_name: &'static str,
+    ) -> Result<T, McpError> {
+        match request.arguments.as_ref() {
+            Some(arguments) => serde_json::from_value(serde_json::Value::Object(
+                arguments.clone().into_iter().collect(),
+            ))
+            .map_err(|err| McpError::invalid_params(err.to_string(), None)),
+            None => Err(McpError::invalid_params(
+                format!("missing arguments for {tool_name} tool"),
+                None,
+            )),
+        }
+    }
+
+    fn image_scenario_result(args: ImageScenarioArgs) -> Result<CallToolResult, McpError> {
+        let (mime_type, valid_data_b64) = if let Some(data_url) = &args.data_url {
+            parse_data_url(data_url).ok_or_else(|| {
+                McpError::invalid_params(
+                    format!("invalid data_url for image_scenario tool: {data_url}"),
+                    None,
+                )
+            })?
+        } else {
+            ("image/png".to_string(), SMALL_PNG_BASE64.to_string())
+        };
+
+        let caption = args
+            .caption
+            .unwrap_or_else(|| "Here is the image:".to_string());
+
+        let mut content = Vec::new();
+        match args.scenario {
+            ImageScenario::ImageOnly => {
+                content.push(rmcp::model::Content::image(valid_data_b64, mime_type));
+            }
+            ImageScenario::TextThenImage => {
+                content.push(rmcp::model::Content::text(caption));
+                content.push(rmcp::model::Content::image(valid_data_b64, mime_type));
+            }
+            ImageScenario::InvalidBase64ThenImage => {
+                content.push(rmcp::model::Content::image(
+                    "not-base64".to_string(),
+                    "image/png".to_string(),
+                ));
+                content.push(rmcp::model::Content::image(valid_data_b64, mime_type));
+            }
+            ImageScenario::InvalidImageBytesThenImage => {
+                content.push(rmcp::model::Content::image(
+                    "bm90IGFuIGltYWdl".to_string(),
+                    "image/png".to_string(),
+                ));
+                content.push(rmcp::model::Content::image(valid_data_b64, mime_type));
+            }
+            ImageScenario::MultipleValidImages => {
+                content.push(rmcp::model::Content::image(
+                    valid_data_b64.clone(),
+                    mime_type.clone(),
+                ));
+                content.push(rmcp::model::Content::image(valid_data_b64, mime_type));
+            }
+            ImageScenario::ImageThenText => {
+                content.push(rmcp::model::Content::image(valid_data_b64, mime_type));
+                content.push(rmcp::model::Content::text(caption));
+            }
+            ImageScenario::TextOnly => {
+                content.push(rmcp::model::Content::text(caption));
+            }
+        }
+
+        Ok(CallToolResult::success(content))
+    }
+}
+
+fn parse_data_url(url: &str) -> Option<(String, String)> {
+    let rest = url.strip_prefix("data:")?;
+    let (mime_and_opts, data) = rest.split_once(',')?;
+    let (mime, _opts) = mime_and_opts.split_once(';').unwrap_or((mime_and_opts, ""));
+    Some((mime.to_string(), data.to_string()))
+}
+
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
    eprintln!("starting rmcp test server");
--- a/codex-rs/tui/src/history_cell.rs
+++ b/codex-rs/tui/src/history_cell.rs
@@ -1429,44 +1429,62 @@ pub(crate) fn new_web_search_call(
    cell
 }

-/// If the first content is an image, return a new cell with the image.
-/// TODO(rgwood-dd): Handle images properly even if they're not the first result.
+/// Returns an additional history cell if an MCP tool result includes a decodable image.
+///
+/// This intentionally returns at most one cell: the first image in `CallToolResult.content` that
+/// successfully base64-decodes and parses as an image. This is used as a lightweight “image output
+/// exists” affordance separate from the main MCP tool call cell.
+///
+/// Manual testing tip:
+/// - Run the rmcp stdio test server (`codex-rs/rmcp-client/src/bin/test_stdio_server.rs`) and
+///   register it as an MCP server via `codex mcp add`.
+/// - Use its `image_scenario` tool with cases like `text_then_image`,
+///   `invalid_base64_then_image`, or `invalid_image_bytes_then_image` to ensure this path triggers
+///   even when the first block is not a valid image.
 fn try_new_completed_mcp_tool_call_with_image_output(
    result: &Result<mcp_types::CallToolResult, String>,
 ) -> Option<CompletedMcpToolCallWithImageOutput> {
-    match result {
-        Ok(mcp_types::CallToolResult { content, .. }) => {
-            if let Some(mcp_types::ContentBlock::ImageContent(image)) = content.first() {
-                let raw_data = match base64::engine::general_purpose::STANDARD.decode(&image.data) {
-                    Ok(data) => data,
-                    Err(e) => {
-                        error!("Failed to decode image data: {e}");
-                        return None;
-                    }
-                };
-                let reader = match ImageReader::new(Cursor::new(raw_data)).with_guessed_format() {
-                    Ok(reader) => reader,
-                    Err(e) => {
-                        error!("Failed to guess image format: {e}");
-                        return None;
-                    }
-                };
+    let image = result
+        .as_ref()
+        .ok()?
+        .content
+        .iter()
+        .find_map(decode_mcp_image)?;

-                let image = match reader.decode() {
-                    Ok(image) => image,
-                    Err(e) => {
-                        error!("Image decoding failed: {e}");
-                        return None;
-                    }
-                };
+    Some(CompletedMcpToolCallWithImageOutput { _image: image })
+}

-                Some(CompletedMcpToolCallWithImageOutput { _image: image })
-            } else {
-                None
-            }
-        }
-        _ => None,
-    }
+/// Decodes an MCP `ImageContent` block into an in-memory image.
+///
+/// Returns `None` when the block is not an image, when base64 decoding fails, when the format
+/// cannot be inferred, or when the image decoder rejects the bytes.
+fn decode_mcp_image(block: &mcp_types::ContentBlock) -> Option<DynamicImage> {
+    let image = match block {
+        mcp_types::ContentBlock::ImageContent(image) => image,
+        _ => return None,
+    };
+    let raw_data = base64::engine::general_purpose::STANDARD
+        .decode(&image.data)
+        .map_err(|e| {
+            error!("Failed to decode image data: {e}");
+            e
+        })
+        .ok()?;
+    let reader = ImageReader::new(Cursor::new(raw_data))
+        .with_guessed_format()
+        .map_err(|e| {
+            error!("Failed to guess image format: {e}");
+            e
+        })
+        .ok()?;
+
+    reader
+        .decode()
+        .map_err(|e| {
+            error!("Image decoding failed: {e}");
+            e
+        })
+        .ok()
 }

 #[allow(clippy::disallowed_methods)]
@@ -1929,9 +1947,12 @@ mod tests {
    use codex_core::protocol::ExecCommandSource;
    use mcp_types::CallToolResult;
    use mcp_types::ContentBlock;
+    use mcp_types::ImageContent;
    use mcp_types::TextContent;
    use mcp_types::Tool;
    use mcp_types::ToolInputSchema;
+
+    const SMALL_PNG_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
    async fn test_config() -> Config {
        let codex_home = std::env::temp_dir();
        ConfigBuilder::default()
@@ -1957,6 +1978,15 @@ mod tests {
        render_lines(&cell.transcript_lines(u16::MAX))
    }

+    fn image_block(data: &str) -> ContentBlock {
+        ContentBlock::ImageContent(ImageContent {
+            annotations: None,
+            data: data.to_string(),
+            mime_type: "image/png".into(),
+            r#type: "image".into(),
+        })
+    }
+
    #[test]
    fn unified_exec_interaction_cell_renders_input() {
        let cell =
@@ -2251,6 +2281,63 @@ mod tests {
        insta::assert_snapshot!(rendered);
    }

+    #[test]
+    fn completed_mcp_tool_call_image_after_text_returns_extra_cell() {
+        let invocation = McpInvocation {
+            server: "image".into(),
+            tool: "generate".into(),
+            arguments: Some(json!({
+                "prompt": "tiny image",
+            })),
+        };
+
+        let result = CallToolResult {
+            content: vec![
+                ContentBlock::TextContent(TextContent {
+                    annotations: None,
+                    text: "Here is the image:".into(),
+                    r#type: "text".into(),
+                }),
+                image_block(SMALL_PNG_BASE64),
+            ],
+            is_error: None,
+            structured_content: None,
+        };
+
+        let mut cell = new_active_mcp_tool_call("call-image".into(), invocation, true);
+        let extra_cell = cell
+            .complete(Duration::from_millis(25), Ok(result))
+            .expect("expected image cell");
+
+        let rendered = render_lines(&extra_cell.display_lines(80));
+        assert_eq!(rendered, vec!["tool result (image output)"]);
+    }
+
+    #[test]
+    fn completed_mcp_tool_call_skips_invalid_image_blocks() {
+        let invocation = McpInvocation {
+            server: "image".into(),
+            tool: "generate".into(),
+            arguments: Some(json!({
+                "prompt": "tiny image",
+            })),
+        };
+
+        let result = CallToolResult {
+            content: vec![image_block("not-base64"), image_block(SMALL_PNG_BASE64)],
+            is_error: None,
+            structured_content: None,
+        };
+
+        let mut cell = new_active_mcp_tool_call("call-image-2".into(), invocation, true);
+        let extra_cell = cell
+            .complete(Duration::from_millis(25), Ok(result))
+            .expect("expected image cell");
+
+        let rendered = render_lines(&extra_cell.display_lines(80));
+        assert_eq!(rendered, vec!["tool result (image output)"]);
+    }
+
    #[test]
    fn completed_mcp_tool_call_error_snapshot() {
        let invocation = McpInvocation {