fix(core): instrument stalled tool-listing handoff (#24667)

## Why

When a turn needs a follow-up request after tool output is recorded,
Codex can still appear stuck in `Thinking` before the next `/responses`
request is opened. The existing local trace showed the last completed
response and the absence of a new backend request, but it did not show
whether the stall was in tool-router preparation or later request setup.

Issue: N/A (internal incident investigation)

## What Changed

Added trace spans around the pre-stream tool-router handoff in
`core/src/session/turn.rs`, including the `built_tools` phase and the
MCP manager read lock.

Added per-server MCP tool-listing spans and trace breadcrumbs in
`codex-mcp/src/connection_manager.rs` with startup snapshot /
startup-complete state so a pending MCP client is visible in feedback
logs instead of looking like a silent hang.

## Verification

- `just fmt`
- `just test -p codex-mcp`
- `just test -p codex-core` (prior full rerun fails in this workspace on
unrelated integration tests: code-mode output length expectations, one
shell timeout formatting assertion, and shell snapshot timeouts; latest
review-fix rerun compiled and passed 1160 tests before I stopped the
abnormally slow unrelated suite)
This commit is contained in:
Anton Panasenko
2026-05-26 19:00:40 -07:00
committed by GitHub
parent 9fe55d68e6
commit 64e340ad28
2 changed files with 44 additions and 4 deletions

View File

@@ -64,7 +64,10 @@ use rmcp::model::Resource;
use rmcp::model::ResourceTemplate;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::instrument;
use tracing::trace;
use tracing::trace_span;
use tracing::warn;
/// A thin wrapper around a set of running [`RmcpClient`] instances.
@@ -378,13 +381,37 @@ impl McpConnectionManager {
}
/// Returns all tools with model-visible names normalized.
#[instrument(level = "trace", skip_all)]
#[instrument(level = "trace", skip_all, fields(mcp_server_count = self.clients.len()))]
pub async fn list_all_tools(&self) -> Vec<ToolInfo> {
let mut tools = Vec::new();
for managed_client in self.clients.values() {
let Some(server_tools) = managed_client.listed_tools().await else {
for (server_name, managed_client) in &self.clients {
let has_startup_snapshot = managed_client.startup_snapshot.is_some();
let startup_complete = managed_client
.startup_complete
.load(std::sync::atomic::Ordering::Acquire);
trace!(
server_name = %server_name,
has_startup_snapshot,
startup_complete,
"waiting for MCP server tools while building tool list"
);
let Some(server_tools) = managed_client
.listed_tools()
.instrument(trace_span!(
"list_tools_for_server",
server_name = %server_name,
has_startup_snapshot,
startup_complete
))
.await
else {
continue;
};
trace!(
server_name = %server_name,
tool_count = server_tools.len(),
"listed MCP server tools while building tool list"
);
tools.extend(
server_tools
.into_iter()

View File

@@ -1006,12 +1006,25 @@ async fn run_sampling_request(
clippy::await_holding_invalid_type,
reason = "tool router construction reads through the session-owned manager guard"
)]
#[instrument(level = "trace",
skip_all,
fields(
turn_id = %turn_context.sub_id,
model = %turn_context.model_info.slug,
apps_enabled = turn_context.apps_enabled()
)
)]
pub(crate) async fn built_tools(
sess: &Session,
turn_context: &TurnContext,
cancellation_token: &CancellationToken,
) -> CodexResult<Arc<ToolRouter>> {
let mcp_connection_manager = sess.services.mcp_connection_manager.read().await;
let mcp_connection_manager = sess
.services
.mcp_connection_manager
.read()
.instrument(trace_span!("read_mcp_connection_manager"))
.await;
let has_mcp_servers = mcp_connection_manager.has_servers();
let all_mcp_tools = mcp_connection_manager
.list_all_tools()