[code mode] defer mcp tools from exec description (#17287)

## Summary
- hide deferred MCP/app nested tool descriptions from the `exec` prompt
in code mode
- add short guidance that omitted nested tools are still available
through `ALL_TOOLS`
- cover the code_mode_only path with an integration test that discovers
and calls a deferred app tool

## Motivation
`code_mode_only` exposes only top-level `exec`/`wait`, but the `exec`
description could still include a large nested-tool reference. This
keeps deferred nested tools callable while avoiding that prompt bloat.

## Tests
- `just fmt`
- `just fix -p codex-code-mode`
- `just fix -p codex-tools`
- `cargo test -p codex-code-mode exec_description_mentions_deferred_nested_tools_when_available`
- `cargo test -p codex-tools create_code_mode_tool_matches_expected_spec`
- `cargo test -p codex-core code_mode_only_guides_all_tools_search_and_calls_deferred_app_tools`
This commit is contained in:
sayan-oai
2026-04-17 00:01:14 +08:00
committed by GitHub
parent 8475d51655
commit 9c6d038622
5 changed files with 176 additions and 13 deletions

View File

@@ -6,6 +6,8 @@ use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_config::types::McpServerConfig;
use codex_config::types::McpServerTransportConfig;
use codex_features::Feature;
use codex_login::CodexAuth;
use codex_models_manager::bundled_models_response;
use codex_protocol::dynamic_tools::DynamicToolCallOutputContentItem;
use codex_protocol::dynamic_tools::DynamicToolResponse;
use codex_protocol::dynamic_tools::DynamicToolSpec;
@@ -14,6 +16,7 @@ use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::Op;
use codex_protocol::protocol::SandboxPolicy;
use codex_protocol::user_input::UserInput;
use core_test_support::apps_test_server::AppsTestServer;
use core_test_support::assert_regex_match;
use core_test_support::responses;
use core_test_support::responses::ResponseMock;
@@ -347,6 +350,131 @@ async fn code_mode_only_restricts_prompt_tools() -> Result<()> {
Ok(())
}
/// Exercises the `code_mode_only` path end to end with a deferred app tool.
///
/// The first mocked model response runs an `exec` script that looks up
/// `mcp__codex_apps__calendar_timezone_option_99` in `ALL_TOOLS` and calls it. The test then
/// asserts that:
/// - the first request advertises only the `exec` and `wait` tools,
/// - the `exec` description carries the `ALL_TOOLS` filtering guidance but does not mention
///   the deferred tool by name,
/// - the tool output sent with the follow-up request reports a successful call.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_only_guides_all_tools_search_and_calls_deferred_app_tools() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
    let apps_server = AppsTestServer::mount_searchable(&server).await?;

    // Script the mocked model submits to `exec`: find the deferred tool via `ALL_TOOLS`,
    // call it through `tools`, and report the outcome as a JSON string.
    let lookup_and_call_script = r#"
const tool = ALL_TOOLS.find(
({ name }) => name === "mcp__codex_apps__calendar_timezone_option_99"
);
if (!tool) {
text(JSON.stringify({ found: false }));
} else {
const result = await tools[tool.name]({ timezone: "UTC" });
text(JSON.stringify({
found: true,
isError: Boolean(result.isError),
text: result.content?.[0]?.text ?? "",
}));
}
"#;

    // First response: the model issues the `exec` custom tool call.
    let first_turn_mock = responses::mount_sse_once(
        &server,
        sse(vec![
            ev_response_created("resp-1"),
            ev_custom_tool_call("call-1", "exec", lookup_and_call_script),
            ev_completed("resp-1"),
        ]),
    )
    .await;

    // Second response: the model finishes the turn after receiving the tool output.
    let second_turn_mock = responses::mount_sse_once(
        &server,
        sse(vec![
            ev_assistant_message("msg-1", "done"),
            ev_completed("resp-2"),
        ]),
    )
    .await;

    let chatgpt_base_url = apps_server.chatgpt_base_url.clone();
    let mut builder = test_codex()
        .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
        .with_config(move |config| {
            // Enabled in the same order as before: apps, tool search, code mode, code-mode-only.
            for feature in [
                Feature::Apps,
                Feature::ToolSearch,
                Feature::CodeMode,
                Feature::CodeModeOnly,
            ] {
                config
                    .features
                    .enable(feature)
                    .expect("test config should allow feature update");
            }
            config.chatgpt_base_url = chatgpt_base_url;
            config.model = Some("gpt-5-codex".to_string());

            // Flip `supports_search_tool` on the bundled catalog entry for the selected model.
            let mut catalog = bundled_models_response()
                .unwrap_or_else(|err| panic!("bundled models.json should parse: {err}"));
            let codex_model = catalog
                .models
                .iter_mut()
                .find(|candidate| candidate.slug == "gpt-5-codex")
                .expect("gpt-5-codex exists in bundled models.json");
            codex_model.supports_search_tool = true;
            config.model_catalog = Some(catalog);
        });
    let test = builder.build(&server).await?;

    test.submit_turn("inspect tools in code mode only").await?;

    // Only the two top-level code-mode tools may be advertised in the first request.
    let first_body = first_turn_mock.single_request().body_json();
    assert_eq!(
        tool_names(&first_body),
        vec!["exec".to_string(), "wait".to_string()]
    );

    // A tool entry is identified by its `name`, falling back to `type` when `name` is absent.
    let exec_description = first_body
        .get("tools")
        .and_then(Value::as_array)
        .and_then(|tools| {
            tools
                .iter()
                .filter(|tool| {
                    let label = tool
                        .get("name")
                        .or_else(|| tool.get("type"))
                        .and_then(Value::as_str);
                    label == Some("exec")
                })
                .find_map(|tool| tool.get("description").and_then(Value::as_str))
        })
        .expect("exec description should be present");
    assert!(exec_description.contains("filter `ALL_TOOLS` by `name` and `description`"));
    assert!(!exec_description.contains("calendar_timezone_option_99"));

    // The follow-up request carries the output of the `exec` call made in the first turn.
    let request = second_turn_mock.single_request();
    let (output, success) = custom_tool_output_body_and_success(&request, "call-1");
    assert_ne!(
        success,
        Some(false),
        "code_mode_only deferred app tool call failed unexpectedly: {output}"
    );

    let reported: Value = serde_json::from_str(&output)?;
    assert_eq!(
        reported,
        serde_json::json!({
            "found": true,
            "isError": false,
            "text": "called calendar_timezone_option_99 for at with ",
        })
    );

    Ok(())
}
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_only_can_call_nested_tools() -> Result<()> {