[codex] Add search term coverage for tool_search (#22398)

## Why

`tool_search` already had solid end-to-end coverage for discovery and
follow-up execution, but it did not prove that distinct pieces of
indexed search text actually work in integration. In particular, we were
not exercising whether unique tool names, descriptions, namespaces,
underscore-expanded dynamic names, and schema-property terms were
sufficient to surface the expected deferred tools.

This change adds focused integration coverage for those term sources so
regressions in search text construction are caught by a real `TestCodex`
flow instead of only by lower-level unit tests.

## What changed

- added a small helper in `core/tests/suite/search_tool.rs` that reports
whether a `tool_search_output` contains an expected namespace child tool
- added an MCP integration test that issues several `tool_search_call`s
and verifies distinct query terms match the expected app tools:
  - exact tool name: `calendar_timezone_option_99`
  - tool description phrase: `uploaded document`
  - top-level schema property: `starts_at`
- added a dynamic-tool integration test that verifies distinct query
terms match the expected deferred dynamic tool:
  - exact name: `quasar_ping_beacon`
  - underscore-expanded name: `quasar ping beacon`
  - description phrase: `saffron metronome`
  - namespace: `orbit_ops`
  - schema property: `chrono_spec`

## Validation

- `cargo test -p codex-core tool_search_matches_`

## Docs

No documentation update needed.
This commit is contained in:
pakrym-oai
2026-05-12 17:24:07 -07:00
committed by GitHub
parent 9e7cdbd0d2
commit f11ad1eacb

View File

@@ -98,6 +98,18 @@ fn tool_search_output_tools(request: &ResponsesRequest, call_id: &str) -> Vec<Va
.unwrap_or_default()
}
fn tool_search_output_has_namespace_child(
request: &ResponsesRequest,
call_id: &str,
namespace: &str,
tool_name: &str,
) -> bool {
let output = json!({
"tools": tool_search_output_tools(request, call_id),
});
namespace_child_tool(&output, namespace, tool_name).is_some()
}
fn configure_search_capable_model(config: &mut Config) {
let mut model_catalog = bundled_models_response()
.unwrap_or_else(|err| panic!("bundled models.json should parse: {err}"));
@@ -1138,3 +1150,195 @@ async fn tool_search_uses_non_app_mcp_server_instructions_as_namespace_descripti
Ok(())
}
/// Checks that distinct query terms (tool name, description phrase, schema
/// property) each surface the expected app tool through `tool_search`.
///
/// The first mocked response issues one `tool_search_call` per case. The
/// second captured request is then inspected for the expected child tool in
/// the output associated with each call id.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_search_matches_mcp_tools_by_distinct_name_description_and_schema_terms() -> Result<()>
{
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;
    let apps_server = AppsTestServer::mount_searchable(&server).await?;

    // Single source of truth for both the mocked search calls and the
    // assertions below, so a case cannot be issued without being verified.
    // Columns: call id, query text, label used in the failure message,
    // expected child tool inside SEARCH_CALENDAR_NAMESPACE.
    let cases = [
        (
            "tool-search-mcp-raw-name",
            "calendar_timezone_option_99",
            "raw MCP tool-name",
            "_timezone_option_99",
        ),
        (
            "tool-search-mcp-description",
            "uploaded document",
            "MCP description",
            "_extract_text",
        ),
        (
            "tool-search-mcp-schema",
            "starts_at",
            "MCP schema",
            SEARCH_CALENDAR_CREATE_TOOL,
        ),
    ];

    // First response: created event, one tool_search_call per case, completed event.
    let mut search_events = vec![ev_response_created("resp-1")];
    search_events.extend(cases.iter().map(|&(call_id, query, _, _)| {
        ev_tool_search_call(
            call_id,
            &json!({
                "query": query,
                "limit": 8,
            }),
        )
    }));
    search_events.push(ev_completed("resp-1"));

    let mock = mount_sse_sequence(
        &server,
        vec![
            sse(search_events),
            sse(vec![
                ev_response_created("resp-2"),
                ev_assistant_message("msg-1", "done"),
                ev_completed("resp-2"),
            ]),
        ],
    )
    .await;

    let mut builder = configured_builder(apps_server.chatgpt_base_url.clone());
    let test = builder.build(&server).await?;
    test.submit_turn_with_approval_and_permission_profile(
        "Search for calendar tooling.",
        AskForApproval::Never,
        PermissionProfile::Disabled,
    )
    .await?;

    let requests = mock.requests();
    assert_eq!(requests.len(), 2);

    // The second request is the one checked for each call's search output.
    let follow_up = &requests[1];
    for (call_id, _, label, expected_tool) in cases {
        assert!(
            tool_search_output_has_namespace_child(
                follow_up,
                call_id,
                SEARCH_CALENDAR_NAMESPACE,
                expected_tool
            ),
            "expected {label} query to surface {expected_tool}: {:?}",
            tool_search_output_tools(follow_up, call_id)
        );
    }

    Ok(())
}
/// Checks that a deferred dynamic tool is surfaced by `tool_search` for each
/// of several distinct query terms: its exact name, the space-separated form
/// of its name, a description phrase, its namespace, and a schema property.
///
/// The first mocked response issues one `tool_search_call` per entry in
/// `query_cases`. The second captured request is then inspected for the
/// `orbit_ops` / `quasar_ping_beacon` tool in the output for every call id.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_search_matches_dynamic_tools_by_name_description_namespace_and_schema_terms()
-> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;

    // (call id, query). This table drives both the mocked search calls and
    // the assertions at the end, so every issued query is also verified.
    let query_cases = [
        ("tool-search-dynamic-name", "quasar_ping_beacon"),
        ("tool-search-dynamic-spaces", "quasar ping beacon"),
        ("tool-search-dynamic-description", "saffron metronome"),
        ("tool-search-dynamic-namespace", "orbit_ops"),
        ("tool-search-dynamic-schema", "chrono_spec"),
    ];

    // First response: created event, one tool_search_call per case, completed event.
    let mut search_events = vec![ev_response_created("resp-1")];
    search_events.extend(query_cases.iter().map(|&(call_id, query)| {
        ev_tool_search_call(
            call_id,
            &json!({
                "query": query,
                "limit": 8,
            }),
        )
    }));
    search_events.push(ev_completed("resp-1"));

    let mock = mount_sse_sequence(
        &server,
        vec![
            sse(search_events),
            sse(vec![
                ev_response_created("resp-2"),
                ev_assistant_message("msg-1", "done"),
                ev_completed("resp-2"),
            ]),
        ],
    )
    .await;

    // Every query term above appears in exactly one field of this spec.
    let dynamic_tool = DynamicToolSpec {
        namespace: Some("orbit_ops".to_string()),
        name: "quasar_ping_beacon".to_string(),
        description: "Trigger the saffron metronome workflow for reminder follow-ups.".to_string(),
        input_schema: json!({
            "type": "object",
            "properties": {
                "chrono_spec": { "type": "string" },
                "targetThreadId": { "type": "string" },
            },
            "required": ["chrono_spec"],
            "additionalProperties": false,
        }),
        defer_loading: true,
    };

    let mut builder = test_codex().with_config(configure_search_capable_model);
    let mut test = builder.build(&server).await?;

    // Start a second thread that carries the dynamic tool, then point the
    // test harness at that thread instead of the one created by `build`.
    let new_thread = test
        .thread_manager
        .start_thread_with_tools(
            test.config.clone(),
            vec![dynamic_tool],
            /*persist_extended_history*/ false,
        )
        .await?;
    test.codex = new_thread.thread;
    test.session_configured = new_thread.session_configured;

    test.codex
        .submit(Op::UserInput {
            environments: None,
            items: vec![UserInput::Text {
                text: "Search for the dynamic tool".to_string(),
                text_elements: Vec::new(),
            }],
            final_output_json_schema: None,
            responsesapi_client_metadata: None,
        })
        .await?;
    wait_for_event(&test.codex, |event| {
        matches!(event, EventMsg::TurnComplete(_))
    })
    .await;

    let requests = mock.requests();
    assert_eq!(requests.len(), 2);

    // The second request is the one checked for each call's search output.
    let follow_up = &requests[1];
    for (call_id, _) in query_cases {
        assert!(
            tool_search_output_has_namespace_child(
                follow_up,
                call_id,
                "orbit_ops",
                "quasar_ping_beacon"
            ),
            "expected query {call_id} to surface the quasar_ping_beacon tool: {:?}",
            tool_search_output_tools(follow_up, call_id)
        );
    }

    Ok(())
}