Sanitize MCP image output for text-only models (#11346)

- Replace image blocks in MCP tool results with a text placeholder when
the active model does not accept image input.
- Add an e2e rmcp test to verify sanitized tool output is what gets sent
back to the model.
This commit is contained in:
Ahmed Ibrahim
2026-02-10 11:25:32 -08:00
committed by GitHub
parent 6e96e4837e
commit 9c4656000f
2 changed files with 274 additions and 2 deletions

View File

@@ -8,8 +8,11 @@ use std::time::Duration;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;
use codex_core::CodexAuth;
use codex_core::config::types::McpServerConfig;
use codex_core::config::types::McpServerTransportConfig;
use codex_core::features::Feature;
use codex_core::models_manager::manager::RefreshStrategy;
use codex_core::protocol::AskForApproval;
use codex_core::protocol::EventMsg;
@@ -18,9 +21,17 @@ use codex_core::protocol::McpToolCallBeginEvent;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
use codex_protocol::openai_models::ConfigShellToolType;
use codex_protocol::openai_models::InputModality;
use codex_protocol::openai_models::ModelInfo;
use codex_protocol::openai_models::ModelVisibility;
use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::openai_models::ReasoningEffortPreset;
use codex_protocol::openai_models::TruncationPolicyConfig;
use codex_protocol::user_input::UserInput;
use codex_utils_cargo_bin::cargo_bin;
use core_test_support::responses;
use core_test_support::responses::mount_models_once;
use core_test_support::responses::mount_sse_once;
use core_test_support::skip_if_no_network;
use core_test_support::stdio_server_bin;
@@ -356,6 +367,166 @@ async fn stdio_image_responses_round_trip() -> anyhow::Result<()> {
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let call_id = "img-text-only-1";
let server_name = "rmcp";
let tool_name = format!("mcp__{server_name}__image");
let text_only_model_slug = "rmcp-text-only-model";
let models_mock = mount_models_once(
&server,
ModelsResponse {
models: vec![ModelInfo {
slug: text_only_model_slug.to_string(),
display_name: "RMCP Text Only".to_string(),
description: Some("Test model without image input support".to_string()),
default_reasoning_level: None,
supported_reasoning_levels: vec![ReasoningEffortPreset {
effort: codex_protocol::openai_models::ReasoningEffort::Medium,
description: "Medium".to_string(),
}],
shell_type: ConfigShellToolType::Default,
visibility: ModelVisibility::List,
supported_in_api: true,
priority: 1,
upgrade: None,
base_instructions: "base instructions".to_string(),
model_messages: None,
supports_reasoning_summaries: false,
support_verbosity: false,
default_verbosity: None,
apply_patch_tool_type: None,
truncation_policy: TruncationPolicyConfig::bytes(10_000),
supports_parallel_tool_calls: false,
context_window: Some(272_000),
auto_compact_token_limit: None,
effective_context_window_percent: 95,
experimental_supported_tools: Vec::new(),
input_modalities: vec![InputModality::Text],
}],
},
)
.await;
// First stream: model decides to call the image tool.
mount_sse_once(
&server,
responses::sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, &tool_name, "{}"),
responses::ev_completed("resp-1"),
]),
)
.await;
// Second stream: after tool execution, assistant emits a message and completes.
let final_mock = mount_sse_once(
&server,
responses::sse(vec![
responses::ev_assistant_message("msg-1", "rmcp image tool completed successfully."),
responses::ev_completed("resp-2"),
]),
)
.await;
let rmcp_test_server_bin = stdio_server_bin()?;
let fixture = test_codex()
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
.with_config(move |config| {
config.features.enable(Feature::RemoteModels);
let mut servers = config.mcp_servers.get().clone();
servers.insert(
server_name.to_string(),
McpServerConfig {
transport: McpServerTransportConfig::Stdio {
command: rmcp_test_server_bin,
args: Vec::new(),
env: Some(HashMap::from([(
"MCP_TEST_IMAGE_DATA_URL".to_string(),
OPENAI_PNG.to_string(),
)])),
env_vars: Vec::new(),
cwd: None,
},
enabled: true,
required: false,
disabled_reason: None,
startup_timeout_sec: Some(Duration::from_secs(10)),
tool_timeout_sec: None,
enabled_tools: None,
disabled_tools: None,
scopes: None,
},
);
config
.mcp_servers
.set(servers)
.expect("test mcp servers should accept any configuration");
})
.build(&server)
.await?;
fixture
.thread_manager
.get_models_manager()
.list_models(&fixture.config, RefreshStrategy::Online)
.await;
assert_eq!(models_mock.requests().len(), 1);
fixture
.codex
.submit(Op::UserTurn {
items: vec![UserInput::Text {
text: "call the rmcp image tool".into(),
text_elements: Vec::new(),
}],
final_output_json_schema: None,
cwd: fixture.cwd.path().to_path_buf(),
approval_policy: AskForApproval::Never,
sandbox_policy: SandboxPolicy::ReadOnly,
model: text_only_model_slug.to_string(),
effort: None,
summary: ReasoningSummary::Auto,
collaboration_mode: None,
personality: None,
})
.await?;
wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallBegin(_))
})
.await;
wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallEnd(_))
})
.await;
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
let output_item = final_mock.single_request().function_call_output(call_id);
let output_text = output_item
.get("output")
.and_then(Value::as_str)
.expect("function_call_output output should be a JSON string");
let output_json: Value = serde_json::from_str(output_text)
.expect("function_call_output output should be valid JSON");
assert_eq!(
output_json,
json!([{
"type": "text",
"text": "<image content omitted because you do not support image input>"
}])
);
server.verify().await;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {