Compare commits

...

1 Commits

Author SHA1 Message Date
Aismit Das
3b7f478f10 Materialize MCP tool images into local refs 2026-03-16 17:58:42 -04:00
3 changed files with 270 additions and 33 deletions

View File

@@ -1,12 +1,16 @@
use std::collections::BTreeMap;
use std::path::Path;
use std::time::Duration;
use std::time::Instant;
use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_app_server_protocol::McpElicitationObjectType;
use codex_app_server_protocol::McpElicitationSchema;
use codex_app_server_protocol::McpServerElicitationRequest;
use codex_app_server_protocol::McpServerElicitationRequestParams;
use tracing::error;
use uuid::Uuid;
use crate::analytics_client::AppInvocation;
use crate::analytics_client::InvocationType;
@@ -47,10 +51,11 @@ use codex_rmcp_client::ElicitationAction;
use codex_rmcp_client::ElicitationResponse;
use rmcp::model::ToolAnnotations;
use serde::Serialize;
use std::path::Path;
use std::sync::Arc;
use toml_edit::value;
const MCP_TOOL_IMAGE_DIR_NAME: &str = "codex-mcp-tool-images";
/// Handles the specified tool call and dispatches the appropriate
/// `McpToolCallBegin` and `McpToolCallEnd` events to the `Session`.
pub(crate) async fn handle_mcp_tool_call(
@@ -80,6 +85,7 @@ pub(crate) async fn handle_mcp_tool_call(
tool: tool_name.clone(),
arguments: arguments_value.clone(),
};
let image_dir = std::env::temp_dir().join(MCP_TOOL_IMAGE_DIR_NAME);
let metadata =
lookup_mcp_tool_metadata(sess.as_ref(), turn_context.as_ref(), &server, &tool_name).await;
@@ -152,6 +158,8 @@ pub(crate) async fn handle_mcp_tool_call(
.await
.map_err(|e| format!("tool call error: {e:?}"));
let result = sanitize_mcp_tool_result_for_model(
&image_dir,
&call_id,
turn_context
.model_info
.input_modalities
@@ -236,6 +244,8 @@ pub(crate) async fn handle_mcp_tool_call(
.await
.map_err(|e| format!("tool call error: {e:?}"));
let result = sanitize_mcp_tool_result_for_model(
&image_dir,
&call_id,
turn_context
.model_info
.input_modalities
@@ -285,36 +295,175 @@ async fn maybe_mark_thread_memory_mode_polluted(sess: &Session, turn_context: &T
}
fn sanitize_mcp_tool_result_for_model(
image_dir: &Path,
call_id: &str,
supports_image_input: bool,
result: Result<CallToolResult, String>,
) -> Result<CallToolResult, String> {
if supports_image_input {
return result;
}
result.map(|call_tool_result| CallToolResult {
content: call_tool_result
.content
.iter()
.map(|block| {
if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
&& content_type == "image"
{
return serde_json::json!({
"type": "text",
"text": "<image content omitted because you do not support image input>",
});
}
content: if supports_image_input {
call_tool_result
.content
.iter()
.enumerate()
.map(|(index, block)| {
materialize_mcp_image_block(image_dir, call_id, index, block)
.unwrap_or_else(|| block.clone())
})
.collect()
} else {
call_tool_result
.content
.iter()
.map(|block| {
if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
&& content_type == "image"
{
return serde_json::json!({
"type": "text",
"text": "<image content omitted because you do not support image input>",
});
}
block.clone()
})
.collect::<Vec<_>>(),
block.clone()
})
.collect()
},
structured_content: call_tool_result.structured_content,
is_error: call_tool_result.is_error,
meta: call_tool_result.meta,
})
}
/// Writes the payload of an MCP `image` content block to a file under
/// `image_dir` and returns a replacement `local_image` block holding the path.
///
/// * `call_id` is sanitized (anything other than ASCII alphanumerics, `-` and
///   `_` becomes `_`) and used as the file-name stem.
/// * `index` is the block's zero-based position; the file name carries it
///   one-based, followed by a fresh UUID and a MIME-derived extension.
///
/// Returns `None` when `block` is not an `image` block, has no string `data`
/// field, or when decoding, directory creation, or the file write fails; each
/// of those failures is logged as a warning.
fn materialize_mcp_image_block(
    image_dir: &Path,
    call_id: &str,
    index: usize,
    block: &serde_json::Value,
) -> Option<serde_json::Value> {
    /// Reads `key` from `block` as a string slice, if present and a string.
    fn string_field<'a>(block: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        block.get(key).and_then(serde_json::Value::as_str)
    }

    if string_field(block, "type")? != "image" {
        return None;
    }
    let data = string_field(block, "data")?;
    // Accept both the camelCase and snake_case spellings of the MIME field.
    let mime_type = string_field(block, "mimeType").or_else(|| string_field(block, "mime_type"));

    let (bytes, resolved_mime_type) = decode_mcp_image_payload(data, mime_type)
        .inspect_err(|err| {
            tracing::warn!(
                call_id,
                index,
                "failed to decode MCP image payload for local materialization: {err}"
            );
        })
        .ok()?;

    std::fs::create_dir_all(image_dir)
        .inspect_err(|err| {
            tracing::warn!(
                call_id,
                index,
                image_dir = %image_dir.display(),
                "failed to create MCP image directory: {err}"
            );
        })
        .ok()?;

    // Keep only characters that are safe in a file name.
    let mut file_stem: String = call_id
        .chars()
        .map(|ch| match ch {
            '-' | '_' => ch,
            _ if ch.is_ascii_alphanumeric() => ch,
            _ => '_',
        })
        .collect();
    if file_stem.is_empty() {
        file_stem.push_str("mcp_tool_image");
    }

    let file_name = format!(
        "{file_stem}-{ordinal}-{unique}.{extension}",
        ordinal = index + 1,
        unique = Uuid::new_v4(),
        extension = image_extension_for_mime(&resolved_mime_type),
    );
    let path = image_dir.join(file_name);

    match std::fs::write(&path, bytes) {
        Ok(()) => Some(serde_json::json!({
            "type": "local_image",
            "path": path.to_string_lossy(),
        })),
        Err(err) => {
            tracing::warn!(
                call_id,
                index,
                path = %path.display(),
                "failed to persist MCP image payload: {err}"
            );
            None
        }
    }
}
fn decode_mcp_image_payload(
data: &str,
mime_type: Option<&str>,
) -> Result<(Vec<u8>, String), String> {
if let Some(comma_index) = data.find(',')
&& data
.get(..5)
.is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:"))
{
let metadata = &data[5..comma_index];
let payload = &data[comma_index + 1..];
let mut metadata_parts = metadata.split(';');
let data_mime_type = metadata_parts.next().unwrap_or_default();
let has_base64_marker = metadata_parts.any(|part| part.eq_ignore_ascii_case("base64"));
if !has_base64_marker {
return Err("non-base64 data URLs are not supported".to_string());
}
let bytes = BASE64_STANDARD
.decode(payload.trim().as_bytes())
.map_err(|err| format!("invalid base64 image data: {err}"))?;
let resolved_mime_type = if data_mime_type.is_empty() {
mime_type.unwrap_or("application/octet-stream").to_string()
} else {
data_mime_type.to_string()
};
return Ok((bytes, resolved_mime_type));
}
let bytes = BASE64_STANDARD
.decode(data.trim().as_bytes())
.map_err(|err| format!("invalid base64 image data: {err}"))?;
Ok((
bytes,
mime_type.unwrap_or("application/octet-stream").to_string(),
))
}
/// Maps an image MIME type to the file extension used for materialized images.
///
/// Media types are case-insensitive and may carry parameters, so the input is
/// reduced to its `type/subtype` part (everything before the first `;`,
/// surrounding whitespace trimmed) and compared ignoring ASCII case. This way
/// `IMAGE/PNG` or `image/png; charset=binary` still resolve to `png`.
///
/// Returns `"bin"` for any type that is not in the known table.
fn image_extension_for_mime(mime_type: &str) -> &'static str {
    const KNOWN_EXTENSIONS: &[(&str, &str)] = &[
        ("image/jpeg", "jpg"),
        ("image/jpg", "jpg"),
        ("image/png", "png"),
        ("image/webp", "webp"),
        ("image/gif", "gif"),
        ("image/svg+xml", "svg"),
        ("image/bmp", "bmp"),
        ("image/tiff", "tiff"),
    ];

    // Drop any `;param=value` suffix and padding before comparing.
    let essence = mime_type.split(';').next().unwrap_or(mime_type).trim();

    KNOWN_EXTENSIONS
        .iter()
        .find(|&&(known, _)| essence.eq_ignore_ascii_case(known))
        .map_or("bin", |&(_, extension)| extension)
}
/// Forwards `event` to the session by awaiting `sess.send_event` with the
/// given turn context.
async fn notify_mcp_tool_call_event(sess: &Session, turn_context: &TurnContext, event: EventMsg) {
sess.send_event(turn_context, event).await;
}

View File

@@ -386,6 +386,7 @@ fn codex_apps_connectors_support_persistent_approval() {
#[test]
fn sanitize_mcp_tool_result_for_model_rewrites_image_content() {
let temp = tempdir().expect("tempdir");
let result = Ok(CallToolResult {
content: vec![
serde_json::json!({
@@ -403,7 +404,8 @@ fn sanitize_mcp_tool_result_for_model_rewrites_image_content() {
meta: None,
});
let got = sanitize_mcp_tool_result_for_model(false, result).expect("sanitized result");
let got = sanitize_mcp_tool_result_for_model(temp.path(), "call-1", false, result)
.expect("sanitized result");
assert_eq!(
got.content,
@@ -421,11 +423,12 @@ fn sanitize_mcp_tool_result_for_model_rewrites_image_content() {
}
#[test]
fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
fn sanitize_mcp_tool_result_for_model_materializes_image_when_supported() {
let temp = tempdir().expect("tempdir");
let original = CallToolResult {
content: vec![serde_json::json!({
"type": "image",
"data": "Zm9v",
"data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII=",
"mimeType": "image/png",
})],
structured_content: Some(serde_json::json!({"x": 1})),
@@ -433,10 +436,24 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
meta: Some(serde_json::json!({"k": "v"})),
};
let got =
sanitize_mcp_tool_result_for_model(true, Ok(original.clone())).expect("unsanitized result");
let got = sanitize_mcp_tool_result_for_model(temp.path(), "call:1", true, Ok(original))
.expect("sanitized result");
assert_eq!(got, original);
assert_eq!(got.structured_content, Some(serde_json::json!({"x": 1})));
assert_eq!(got.is_error, Some(false));
assert_eq!(got.meta, Some(serde_json::json!({"k": "v"})));
let path = got.content[0]
.get("path")
.and_then(serde_json::Value::as_str)
.expect("local image path");
assert_eq!(
got.content[0]
.get("type")
.and_then(serde_json::Value::as_str),
Some("local_image")
);
assert!(path.starts_with(temp.path().to_string_lossy().as_ref()));
assert!(std::path::Path::new(path).is_file());
}
#[test]

View File

@@ -1439,6 +1439,8 @@ fn convert_mcp_content_to_items(
#[serde(rename = "mimeType", alias = "mime_type")]
mime_type: Option<String>,
},
#[serde(rename = "local_image", alias = "localImage")]
LocalImage { path: String },
#[serde(other)]
Unknown,
}
@@ -1447,8 +1449,10 @@ fn convert_mcp_content_to_items(
let mut items = Vec::with_capacity(contents.len());
for content in contents {
let item = match serde_json::from_value::<McpContent>(content.clone()) {
Ok(McpContent::Text { text }) => FunctionCallOutputContentItem::InputText { text },
let content_items = match serde_json::from_value::<McpContent>(content.clone()) {
Ok(McpContent::Text { text }) => {
vec![FunctionCallOutputContentItem::InputText { text }]
}
Ok(McpContent::Image { data, mime_type }) => {
saw_image = true;
let image_url = if data.starts_with("data:") {
@@ -1457,16 +1461,22 @@ fn convert_mcp_content_to_items(
let mime_type = mime_type.unwrap_or_else(|| "application/octet-stream".into());
format!("data:{mime_type};base64,{data}")
};
FunctionCallOutputContentItem::InputImage {
vec![FunctionCallOutputContentItem::InputImage {
image_url,
detail: None,
}
}]
}
Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText {
Ok(McpContent::LocalImage { path }) => {
saw_image = true;
vec![FunctionCallOutputContentItem::InputText {
text: format!("Image available at local path `{path}`."),
}]
}
Ok(McpContent::Unknown) | Err(_) => vec![FunctionCallOutputContentItem::InputText {
text: serde_json::to_string(content).unwrap_or_else(|_| "<content>".to_string()),
},
}],
};
items.push(item);
items.extend(content_items);
}
if saw_image { Some(items) } else { None }
@@ -2424,6 +2434,67 @@ mod tests {
Ok(())
}
/// A `local_image` block that points at an existing file is surfaced to the
/// model as a single text item naming the local path.
#[test]
fn serializes_local_image_outputs_as_array() -> Result<()> {
    // A complete 1x1 RGBA PNG, so the referenced file is a real image.
    const TINY_PNG_BYTES: &[u8] = &[
        137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82, 0, 0, 0, 1, 0, 0, 0, 1,
        8, 6, 0, 0, 0, 31, 21, 196, 137, 0, 0, 0, 11, 73, 68, 65, 84, 120, 156, 99, 96, 0, 2,
        0, 0, 5, 0, 1, 122, 94, 171, 63, 0, 0, 0, 0, 73, 69, 78, 68, 174, 66, 96, 130,
    ];

    let dir = tempdir()?;
    let local_path = dir.path().join("local.png");
    std::fs::write(&local_path, TINY_PNG_BYTES)?;

    let payload = CallToolResult {
        content: vec![serde_json::json!({
            "type": "local_image",
            "path": local_path,
        })],
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    let items = match payload.content_items() {
        Some(items) => items,
        None => panic!("expected content items"),
    };
    assert!(matches!(
        items,
        [FunctionCallOutputContentItem::InputText { text }]
            if text.contains("Image available at local path")
                && text.contains("local.png")
    ));
    Ok(())
}
/// A `local_image` block is passed through as a path reference even when the
/// file it names does not exist on disk.
#[test]
fn local_image_output_preserves_local_path_reference() -> Result<()> {
    let payload = CallToolResult {
        content: vec![serde_json::json!({
            "type": "local_image",
            "path": "/tmp/does-not-exist.png",
        })],
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    let items = match payload.content_items() {
        Some(items) => items,
        None => panic!("expected content items"),
    };
    assert!(matches!(
        items,
        [FunctionCallOutputContentItem::InputText { text }]
            if text.contains("Image available at local path")
                && text.contains("/tmp/does-not-exist.png")
    ));
    Ok(())
}
#[test]
fn deserializes_array_payload_into_items() -> Result<()> {
let json = r#"[