mirror of
https://github.com/openai/codex.git
synced 2026-05-03 02:46:39 +00:00
Terminate stdio MCP servers on shutdown to avoid process leaks (#19753)
## Why

Several bug reports describe thread shutdown (including subagent threads) leaving stdio MCP server processes behind. These reports all point at the same lifecycle gap: Codex launches stdio MCP servers, but the session-level shutdown path does not explicitly close MCP clients or terminate the server process tree.

Fixes #12491
Fixes #12976
Fixes #18881
Fixes #19469

## History

This is best understood as a regression/coverage gap in MCP session lifecycle management, not as stdio MCP cleanup being absent all along. #10710 added process-group cleanup for stdio MCP servers, but that cleanup only runs when the `RmcpClient`/transport is dropped. The older reports (#12491 and #12976) came after that cleanup existed, which suggests the remaining problem was that some higher-level shutdown paths kept the MCP manager alive or replaced it without explicitly draining clients. The newer reports (#18881 and #19469) exposed the same family of issues around manager replacement and shutdown.

## What changed

- Added an explicit stdio MCP process handle in `codex-rmcp-client` so local MCP servers terminate their process group and executor-backed MCP servers call the executor process terminator.
- Added `RmcpClient::shutdown()` and manager-level MCP shutdown draining so session shutdown, channel-close fallback, MCP refresh, and connector probing stop owned MCP clients.
- Added regression coverage that starts a stdio MCP server, begins an in-flight blocking tool call, shuts down the client, and asserts the server process exits.

## Verification

- `cargo test -p codex-rmcp-client`
- `cargo test -p codex-mcp`
- `just fix -p codex-rmcp-client`
- `just fix -p codex-mcp`
- `just fix -p codex-core`
- Manual before/after validation with a temporary repro script:
  - Pre-fix binary from `HEAD^` (`fed0a8f4fa`): reproduced the leak with surviving MCP server and child PIDs, `survivors=[77583, 77592]`, `leaked=true`.
  - Post-fix binary from this branch (`67e318148b`): verified both MCP processes were gone after interrupting `codex exec`, `survivors=[]`, `leaked=false`.
This commit is contained in:
@@ -9,8 +9,43 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use codex_rmcp_client::ElicitationAction;
|
||||
use codex_rmcp_client::ElicitationResponse;
|
||||
use codex_rmcp_client::LocalStdioServerLauncher;
|
||||
use codex_rmcp_client::RmcpClient;
|
||||
use futures::FutureExt as _;
|
||||
use rmcp::model::ClientCapabilities;
|
||||
use rmcp::model::Implementation;
|
||||
use rmcp::model::InitializeRequestParams;
|
||||
use rmcp::model::ProtocolVersion;
|
||||
use serde_json::json;
|
||||
|
||||
fn stdio_server_bin() -> Result<std::path::PathBuf> {
|
||||
codex_utils_cargo_bin::cargo_bin("test_stdio_server").map_err(Into::into)
|
||||
}
|
||||
|
||||
fn init_params() -> InitializeRequestParams {
|
||||
InitializeRequestParams {
|
||||
meta: None,
|
||||
capabilities: ClientCapabilities {
|
||||
experimental: None,
|
||||
extensions: None,
|
||||
roots: None,
|
||||
sampling: None,
|
||||
elicitation: None,
|
||||
tasks: None,
|
||||
},
|
||||
client_info: Implementation {
|
||||
name: "codex-test".into(),
|
||||
version: "0.0.0-test".into(),
|
||||
title: Some("Codex rmcp shutdown test".into()),
|
||||
description: None,
|
||||
icons: None,
|
||||
website_url: None,
|
||||
},
|
||||
protocol_version: ProtocolVersion::V_2025_06_18,
|
||||
}
|
||||
}
|
||||
|
||||
fn process_exists(pid: u32) -> bool {
|
||||
std::process::Command::new("kill")
|
||||
@@ -94,3 +129,67 @@ async fn drop_kills_wrapper_process_group() -> Result<()> {
|
||||
|
||||
wait_for_process_exit(grandchild_pid).await
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shutdown_kills_initialized_stdio_server_with_in_flight_operation() -> Result<()> {
|
||||
let temp_dir = tempfile::tempdir()?;
|
||||
let server_pid_file = temp_dir.path().join("server.pid");
|
||||
let server_pid_file_str = server_pid_file.to_string_lossy().into_owned();
|
||||
|
||||
let client = Arc::new(
|
||||
RmcpClient::new_stdio_client(
|
||||
stdio_server_bin()?.into(),
|
||||
Vec::<OsString>::new(),
|
||||
Some(HashMap::from([(
|
||||
OsString::from("MCP_TEST_PID_FILE"),
|
||||
OsString::from(server_pid_file_str),
|
||||
)])),
|
||||
&[],
|
||||
/*cwd*/ None,
|
||||
Arc::new(LocalStdioServerLauncher::new(std::env::current_dir()?)),
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
client
|
||||
.initialize(
|
||||
init_params(),
|
||||
Some(Duration::from_secs(5)),
|
||||
Box::new(|_, _| {
|
||||
async {
|
||||
Ok(ElicitationResponse {
|
||||
action: ElicitationAction::Accept,
|
||||
content: Some(json!({})),
|
||||
meta: None,
|
||||
})
|
||||
}
|
||||
.boxed()
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let server_pid = wait_for_pid_file(&server_pid_file).await?;
|
||||
assert!(
|
||||
process_exists(server_pid),
|
||||
"expected MCP server process {server_pid} to be running before shutdown"
|
||||
);
|
||||
|
||||
let call_client = Arc::clone(&client);
|
||||
let call_task = tokio::spawn(async move {
|
||||
call_client
|
||||
.call_tool(
|
||||
"sync".to_string(),
|
||||
Some(json!({ "sleep_after_ms": 300_000 })),
|
||||
/*meta*/ None,
|
||||
Some(Duration::from_secs(300)),
|
||||
)
|
||||
.await
|
||||
});
|
||||
tokio::time::sleep(Duration::from_millis(200)).await;
|
||||
|
||||
client.shutdown().await;
|
||||
|
||||
wait_for_process_exit(server_pid).await?;
|
||||
let _ = tokio::time::timeout(Duration::from_secs(5), call_task).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user