## Why

Several bug reports describe thread shutdown (including subagent threads) leaving stdio MCP server processes behind. These reports all point at the same lifecycle gap: Codex launches stdio MCP servers, but the session-level shutdown path does not explicitly close MCP clients or terminate the server process tree.

Fixes #12491
Fixes #12976
Fixes #18881
Fixes #19469

## History

This is best understood as a regression/coverage gap in MCP session lifecycle management, not as stdio MCP cleanup being absent all along. #10710 added process-group cleanup for stdio MCP servers, but that cleanup only runs when the `RmcpClient`/transport is dropped. The older reports (#12491 and #12976) came after that cleanup existed, which suggests the remaining problem was that some higher-level shutdown paths kept the MCP manager alive or replaced it without explicitly draining clients. The newer reports (#18881 and #19469) surfaced the same family of leaks around manager replacement and shutdown.

## What changed

- Added an explicit stdio MCP process handle in `codex-rmcp-client` so local MCP servers terminate their process group and executor-backed MCP servers call the executor process terminator.
- Added `RmcpClient::shutdown()` and manager-level MCP shutdown draining so session shutdown, channel-close fallback, MCP refresh, and connector probing stop owned MCP clients.
- Added regression coverage that starts a stdio MCP server, begins an in-flight blocking tool call, shuts down the client, and asserts the server process exits.

## Verification

- `cargo test -p codex-rmcp-client`
- `cargo test -p codex-mcp`
- `just fix -p codex-rmcp-client`
- `just fix -p codex-mcp`
- `just fix -p codex-core`
- Manual before/after validation with a temporary repro script:
  - Pre-fix binary from `HEAD^` (`fed0a8f4fa`): reproduced the leak with surviving MCP server and child PIDs, `survivors=[77583, 77592]`, `leaked=true`.
  - Post-fix binary from this branch (`67e318148b`): verified both MCP processes were gone after interrupting `codex exec`, `survivors=[]`, `leaked=false`.
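For reviewers, a minimal sketch of what the manager-level drain does conceptually. Only `RmcpClient::shutdown()` is API from this change; the manager type and field names below are illustrative assumptions, not the real `codex-core` types.

```rust
use std::collections::HashMap;
use std::sync::Arc;

use codex_rmcp_client::RmcpClient;

// Hypothetical manager shape, for illustration only.
struct McpConnectionManager {
    clients: HashMap<String, Arc<RmcpClient>>,
}

impl McpConnectionManager {
    /// Drain every owned client so each stdio server's process group is
    /// terminated before session shutdown (or manager replacement) proceeds.
    async fn shutdown_all(&mut self) {
        for (_name, client) in self.clients.drain() {
            // In the regression test below, shutdown() is awaited with no
            // error to handle, so a plain await suffices here.
            client.shutdown().await;
        }
    }
}
```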
#![cfg(unix)]

use std::collections::HashMap;
use std::ffi::OsString;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use codex_rmcp_client::ElicitationAction;
use codex_rmcp_client::ElicitationResponse;
use codex_rmcp_client::LocalStdioServerLauncher;
use codex_rmcp_client::RmcpClient;
use futures::FutureExt as _;
use rmcp::model::ClientCapabilities;
use rmcp::model::Implementation;
use rmcp::model::InitializeRequestParams;
use rmcp::model::ProtocolVersion;
use serde_json::json;

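/// Locates the `test_stdio_server` helper binary that Cargo builds for these
/// tests.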
fn stdio_server_bin() -> Result<std::path::PathBuf> {
    codex_utils_cargo_bin::cargo_bin("test_stdio_server").map_err(Into::into)
}

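/// Minimal MCP initialize parameters: every optional client capability is
/// left unset.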
fn init_params() -> InitializeRequestParams {
    InitializeRequestParams {
        meta: None,
        capabilities: ClientCapabilities {
            experimental: None,
            extensions: None,
            roots: None,
            sampling: None,
            elicitation: None,
            tasks: None,
        },
        client_info: Implementation {
            name: "codex-test".into(),
            version: "0.0.0-test".into(),
            title: Some("Codex rmcp shutdown test".into()),
            description: None,
            icons: None,
            website_url: None,
        },
        protocol_version: ProtocolVersion::V_2025_06_18,
    }
}

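/// `kill -0` delivers no signal; its exit status only reports whether `pid`
/// still exists (and can be signaled).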
fn process_exists(pid: u32) -> bool {
    std::process::Command::new("kill")
        .arg("-0")
        .arg(pid.to_string())
        .stderr(std::process::Stdio::null())
        .status()
        .map(|status| status.success())
        .unwrap_or(false)
}

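/// Polls `path` until the spawned process has written its pid, giving up
/// after roughly five seconds.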
async fn wait_for_pid_file(path: &Path) -> Result<u32> {
    for _ in 0..50 {
        match fs::read_to_string(path) {
            Ok(content) => {
                let trimmed = content.trim();
                if trimmed.is_empty() {
                    tokio::time::sleep(Duration::from_millis(100)).await;
                    continue;
                }

                let pid = trimmed
                    .parse::<u32>()
                    .with_context(|| format!("failed to parse pid from {}", path.display()))?;
                return Ok(pid);
            }
            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
                tokio::time::sleep(Duration::from_millis(100)).await;
            }
            Err(error) => {
                return Err(error).with_context(|| format!("failed to read {}", path.display()));
            }
        }
    }

    anyhow::bail!("timed out waiting for child pid file at {}", path.display());
}

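/// Polls for `pid` to disappear from the process table, giving up after
/// roughly five seconds.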
async fn wait_for_process_exit(pid: u32) -> Result<()> {
    for _ in 0..50 {
        if !process_exists(pid) {
            return Ok(());
        }
        tokio::time::sleep(Duration::from_millis(100)).await;
    }

    anyhow::bail!("process {pid} still running after timeout");
}

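/// Dropping the client must tear down the server's whole process group: the
/// `/bin/sh` wrapper backgrounds a long `sleep`, records that grandchild's
/// pid, then blocks on stdin, and the test asserts the grandchild dies too.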
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
async fn drop_kills_wrapper_process_group() -> Result<()> {
    let temp_dir = tempfile::tempdir()?;
    let child_pid_file = temp_dir.path().join("child.pid");
    let child_pid_file_str = child_pid_file.to_string_lossy().into_owned();

    let client = RmcpClient::new_stdio_client(
        OsString::from("/bin/sh"),
        vec![
            OsString::from("-c"),
            OsString::from(
                "sleep 300 & child_pid=$!; echo \"$child_pid\" > \"$CHILD_PID_FILE\"; cat >/dev/null",
            ),
        ],
        Some(HashMap::from([(
            OsString::from("CHILD_PID_FILE"),
            OsString::from(child_pid_file_str),
        )])),
        &[],
        /*cwd*/ None,
        Arc::new(LocalStdioServerLauncher::new(std::env::current_dir()?)),
    )
    .await?;

    let grandchild_pid = wait_for_pid_file(&child_pid_file).await?;
    assert!(
        process_exists(grandchild_pid),
        "expected grandchild process {grandchild_pid} to be running before dropping client"
    );

    drop(client);

    wait_for_process_exit(grandchild_pid).await
}

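/// Regression test for the explicit shutdown path: with a blocking `sync`
/// tool call still in flight, `shutdown()` must terminate the initialized
/// stdio server instead of leaving it orphaned.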
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn shutdown_kills_initialized_stdio_server_with_in_flight_operation() -> Result<()> {
    let temp_dir = tempfile::tempdir()?;
    let server_pid_file = temp_dir.path().join("server.pid");
    let server_pid_file_str = server_pid_file.to_string_lossy().into_owned();

    let client = Arc::new(
        RmcpClient::new_stdio_client(
            stdio_server_bin()?.into(),
            Vec::<OsString>::new(),
            Some(HashMap::from([(
                OsString::from("MCP_TEST_PID_FILE"),
                OsString::from(server_pid_file_str),
            )])),
            &[],
            /*cwd*/ None,
            Arc::new(LocalStdioServerLauncher::new(std::env::current_dir()?)),
        )
        .await?,
    );

    client
        .initialize(
            init_params(),
            Some(Duration::from_secs(5)),
            Box::new(|_, _| {
                async {
                    Ok(ElicitationResponse {
                        action: ElicitationAction::Accept,
                        content: Some(json!({})),
                        meta: None,
                    })
                }
                .boxed()
            }),
        )
        .await?;

    let server_pid = wait_for_pid_file(&server_pid_file).await?;
    assert!(
        process_exists(server_pid),
        "expected MCP server process {server_pid} to be running before shutdown"
    );

    let call_client = Arc::clone(&client);
    let call_task = tokio::spawn(async move {
        call_client
            .call_tool(
                "sync".to_string(),
                Some(json!({ "sleep_after_ms": 300_000 })),
                /*meta*/ None,
                Some(Duration::from_secs(300)),
            )
            .await
    });
    tokio::time::sleep(Duration::from_millis(200)).await;

    client.shutdown().await;

    wait_for_process_exit(server_pid).await?;
    let _ = tokio::time::timeout(Duration::from_secs(5), call_task).await?;
    Ok(())
}