feat: best-effort compact large tool schemas (#23904)

## Why

The `dev/cc/ref-def` branch preserves richer JSON Schema detail for
connector tools, including `$defs` and nested shapes. That improves
fidelity, but it pushes the largest connector schemas well past the
intended tool-schema budget. This PR adds a best-effort compaction pass
for unusually large tool input schemas so the p99 and max tails stay
small while ordinary schemas are left alone.

## What Changed

- Added best-effort large-schema compaction in
`codex-rs/tools/src/json_schema.rs` after schema sanitization and
definition pruning.
- Compaction runs as a waterfall only while the compact JSON budget
proxy is exceeded:
  1. Strip schema `description` metadata.
  2. Drop root `$defs` / `definitions`.
  3. Collapse deep nested complex schema objects to `{}`.
- Kept top-level argument names and immediate schema shape where
possible.

## Corpus Results

Scope: 2,025 schemas under `golden_schemas`, all parsed successfully.
Token count is `o200k_base` over compact JSON from
`parse_tool_input_schema`.

| Percentile | Before `origin/main` `4dbca61e20` | After branch
`dev/cc/ref-def` `f9bf071758` | After this PR |
|---|---:|---:|---:|
| p0 | 9 | 9 | 9 |
| p10 | 59 | 63 | 63 |
| p25 | 81 | 86 | 86 |
| p50 | 114 | 127 | 125 |
| p75 | 174 | 205 | 202 |
| p90 | 295 | 335 | 322 |
| p95 | 391 | 526 | 422 |
| p99 | 794 | 1,303 | 689 |
| max | 2,836 | 3,337 | 887 |

After this PR, `0 / 2,025` schemas are over 1k tokens.

### Compaction Savings

These are cumulative waterfall stages over the same corpus. Later passes
only run for schemas that are still over the compact JSON budget proxy.

| Stage | Total tokens | Step savings | Schemas changed by step |
|---|---:|---:|---:|
| No compaction | 391,862 | - | - |
| Strip schema `description` metadata | 350,961 | 40,901 | 66 |
| Drop root `$defs` / `definitions` | 340,683 | 10,278 | 13 |
| Collapse deep complex schemas to `{}` | 335,875 | 4,808 | 6 |
This commit is contained in:
Celia Chen
2026-05-21 18:26:17 -07:00
committed by GitHub
parent 7e802b22f1
commit 464ab40dfa
2 changed files with 553 additions and 0 deletions

View File

@@ -160,6 +160,7 @@ pub fn parse_tool_input_schema(input_schema: &JsonValue) -> Result<JsonSchema, s
let mut input_schema = input_schema.clone();
sanitize_json_schema(&mut input_schema);
prune_unreachable_definitions(&mut input_schema);
compact_large_tool_schema(&mut input_schema);
let schema: JsonSchema = serde_json::from_value(input_schema)?;
if matches!(
schema.schema_type,
@@ -170,6 +171,47 @@ pub fn parse_tool_input_schema(input_schema: &JsonValue) -> Result<JsonSchema, s
Ok(schema)
}
// Use compact normalized JSON bytes as a cheap local proxy for the 1k-token
// schema budget.
const MAX_COMPACT_TOOL_SCHEMA_BYTES: usize = 4_000;
const MAX_COMPACT_TOOL_SCHEMA_DEPTH: usize = 2;
/// Shrink unusually large tool schemas while preserving the top-level argument
/// surface. Compaction is best-effort rather than a hard cap: it runs only
/// after schema sanitization/pruning and applies increasingly lossy passes
/// while the schema remains over budget.
fn compact_large_tool_schema(value: &mut JsonValue) {
for pass in LARGE_SCHEMA_COMPACTION_PASSES {
if compact_schema_fits_budget(value) {
break;
}
pass(value);
}
}
type LargeSchemaCompactionPass = fn(&mut JsonValue);
const LARGE_SCHEMA_COMPACTION_PASSES: &[LargeSchemaCompactionPass] = &[
strip_schema_descriptions,
drop_schema_definitions,
collapse_deep_schema_objects_from_root,
];
fn collapse_deep_schema_objects_from_root(value: &mut JsonValue) {
collapse_deep_schema_objects(value, /*depth*/ 0);
}
fn compact_schema_fits_budget(value: &JsonValue) -> bool {
compact_normalized_schema_len(value) <= MAX_COMPACT_TOOL_SCHEMA_BYTES
}
fn compact_normalized_schema_len(value: &JsonValue) -> usize {
serde_json::from_value::<JsonSchema>(value.clone())
.and_then(|schema| serde_json::to_vec(&schema))
.map(|json| json.len())
.unwrap_or(0)
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DefinitionTraversal {
Include,
@@ -214,6 +256,130 @@ fn for_each_schema_child(
}
}
fn strip_schema_descriptions(value: &mut JsonValue) {
match value {
JsonValue::Array(values) => {
for value in values {
strip_schema_descriptions(value);
}
}
JsonValue::Object(map) => {
map.remove("description");
for_each_schema_child_mut(map, DefinitionTraversal::Include, &mut |value| {
strip_schema_descriptions(value);
});
}
_ => {}
}
}
fn for_each_schema_child_mut(
map: &mut serde_json::Map<String, JsonValue>,
definition_traversal: DefinitionTraversal,
visitor: &mut impl FnMut(&mut JsonValue),
) {
if let Some(properties) = map.get_mut("properties")
&& let Some(properties_map) = properties.as_object_mut()
{
for value in properties_map.values_mut() {
visitor(value);
}
}
for key in SCHEMA_CHILD_KEYS {
if let Some(value) = map.get_mut(key) {
visitor(value);
}
}
if let Some(additional_properties) = map.get_mut("additionalProperties")
&& !matches!(additional_properties, JsonValue::Bool(_))
{
visitor(additional_properties);
}
if definition_traversal == DefinitionTraversal::Include {
for key in DEFINITION_TABLE_KEYS {
if let Some(definitions) = map.get_mut(key)
&& let Some(definitions_map) = definitions.as_object_mut()
{
for value in definitions_map.values_mut() {
visitor(value);
}
}
}
}
}
/// Replace local definition refs with empty schemas before dropping root
/// definition tables, so downstream behavior does not depend on how a schema
/// parser handles refs to missing definitions.
fn drop_schema_definitions(value: &mut JsonValue) {
rewrite_definition_refs_to_empty_schemas(value);
let JsonValue::Object(map) = value else {
return;
};
for key in DEFINITION_TABLE_KEYS {
map.remove(key);
}
}
fn rewrite_definition_refs_to_empty_schemas(value: &mut JsonValue) {
match value {
JsonValue::Array(values) => {
for value in values {
rewrite_definition_refs_to_empty_schemas(value);
}
}
JsonValue::Object(map) => {
if map
.get("$ref")
.and_then(JsonValue::as_str)
.and_then(parse_local_definition_ref)
.is_some()
{
*value = json!({});
return;
}
for_each_schema_child_mut(map, DefinitionTraversal::Skip, &mut |value| {
rewrite_definition_refs_to_empty_schemas(value);
});
}
_ => {}
}
}
fn collapse_deep_schema_objects(value: &mut JsonValue, depth: usize) {
match value {
JsonValue::Array(values) => {
for value in values {
collapse_deep_schema_objects(value, depth);
}
}
JsonValue::Object(map) => {
if depth >= MAX_COMPACT_TOOL_SCHEMA_DEPTH && is_complex_schema_object(map) {
*value = json!({});
return;
}
for_each_schema_child_mut(map, DefinitionTraversal::Skip, &mut |value| {
collapse_deep_schema_objects(value, depth + 1);
});
}
_ => {}
}
}
fn is_complex_schema_object(map: &serde_json::Map<String, JsonValue>) -> bool {
SCHEMA_CHILD_KEYS.iter().any(|key| map.contains_key(*key))
|| map.contains_key("properties")
|| map.contains_key("additionalProperties")
|| map.contains_key("$ref")
}
/// Sanitize a JSON Schema (as serde_json::Value) so it can fit our limited
/// schema representation. This function:
/// - Ensures every typed schema object has a `"type"` when required.

View File

@@ -779,6 +779,393 @@ fn parse_tool_input_schema_preserves_explicit_enum_type_union() {
);
}
fn many_string_properties(count: usize) -> serde_json::Map<String, serde_json::Value> {
(0..count)
.map(|index| {
(
format!("field_{index:03}"),
serde_json::json!({ "type": "string" }),
)
})
.collect()
}
#[test]
fn parse_large_tool_input_schema_stops_after_descriptions_when_under_budget() {
let schema = parse_tool_input_schema(&serde_json::json!({
"type": "object",
"description": "x".repeat(4_500),
"properties": {
"metadata": {
"$ref": "#/$defs/metadata"
}
},
"$defs": {
"metadata": {
"type": "string",
"description": "Metadata value"
}
}
}))
.expect("parse schema");
assert_eq!(
serde_json::to_value(schema).expect("serialize schema"),
serde_json::json!({
"type": "object",
"properties": {
"metadata": {
"$ref": "#/$defs/metadata"
}
},
"$defs": {
"metadata": {
"type": "string"
}
}
})
);
}
#[test]
fn parse_large_tool_input_schema_ignores_dropped_metadata_for_budget() {
let schema = parse_tool_input_schema(&serde_json::json!({
"type": "object",
"properties": {
"event": {
"type": "object",
"title": "Calendar event",
"properties": {
"recurrence": {
"type": "object",
"examples": [
{
"payload": "x".repeat(4_500)
}
],
"properties": {
"pattern": {
"type": "string",
"title": "Recurrence pattern"
}
}
}
}
}
}
}))
.expect("parse schema");
assert_eq!(
serde_json::to_value(schema).expect("serialize schema"),
serde_json::json!({
"type": "object",
"properties": {
"event": {
"type": "object",
"properties": {
"recurrence": {
"type": "object",
"properties": {
"pattern": {
"type": "string"
}
}
}
}
}
}
})
);
}
#[test]
fn parse_large_tool_input_schema_stops_after_dropping_root_definitions_when_under_budget() {
let schema = parse_tool_input_schema(&serde_json::json!({
"type": "object",
"description": "x".repeat(4_500),
"properties": {
"event": {
"type": "object",
"description": "Calendar event",
"properties": {
"recurrence": {
"type": "object",
"description": "Recurrence settings",
"properties": {
"pattern": {
"type": "string",
"description": "Recurrence pattern"
}
}
}
}
},
"metadata": {
"$ref": "#/$defs/metadata"
}
},
"$defs": {
"metadata": {
"type": "object",
"description": "metadata object",
"properties": many_string_properties(/*count*/ 300)
}
}
}))
.expect("parse schema");
assert_eq!(
serde_json::to_value(schema).expect("serialize schema"),
serde_json::json!({
"type": "object",
"properties": {
"event": {
"type": "object",
"properties": {
"recurrence": {
"type": "object",
"properties": {
"pattern": {
"type": "string"
}
}
}
}
},
"metadata": {}
}
})
);
}
#[test]
fn parse_large_tool_input_schema_strips_descriptions_without_removing_description_property() {
let schema = parse_tool_input_schema(&serde_json::json!({
"type": "object",
"description": "x".repeat(4_500),
"properties": {
"description": {
"type": "string",
"description": "User-facing description value"
},
"metadata": {
"type": "object",
"description": "Metadata object",
"properties": {
"label": {
"type": "string",
"description": "Metadata label"
}
}
},
"tags": {
"type": "array",
"description": "Tag list",
"items": {
"type": "string",
"description": "Tag value"
}
},
"extras": {
"type": "object",
"additionalProperties": {
"type": "string",
"description": "Extra value"
}
},
"choice": {
"description": "Choice value",
"anyOf": [
{
"type": "string",
"description": "String choice"
},
{
"type": "number",
"description": "Number choice"
}
]
}
}
}))
.expect("parse schema");
assert_eq!(
serde_json::to_value(schema).expect("serialize schema"),
serde_json::json!({
"type": "object",
"properties": {
"choice": {
"anyOf": [
{
"type": "string"
},
{
"type": "number"
}
]
},
"description": {
"type": "string"
},
"extras": {
"type": "object",
"properties": {},
"additionalProperties": {
"type": "string"
}
},
"metadata": {
"type": "object",
"properties": {
"label": {
"type": "string"
}
}
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
}
}
})
);
}
#[test]
fn parse_large_tool_input_schema_preserves_object_enum_literal_descriptions() {
let schema = parse_tool_input_schema(&serde_json::json!({
"type": "object",
"description": "x".repeat(4_500),
"properties": {
"choice": {
"enum": [
{
"description": "first literal",
"id": 1
},
{
"description": "second literal",
"id": 2
}
]
}
}
}))
.expect("parse schema");
assert_eq!(
serde_json::to_value(schema).expect("serialize schema"),
serde_json::json!({
"type": "object",
"properties": {
"choice": {
"type": "string",
"enum": [
{
"description": "first literal",
"id": 1
},
{
"description": "second literal",
"id": 2
}
]
}
}
})
);
}
#[test]
fn collapse_deep_schema_objects_traverses_schema_children() {
let mut schema = serde_json::json!({
"type": "object",
"properties": {
"object_parent": {
"type": "object",
"properties": {
"complex": {
"type": "object",
"properties": {
"leaf": { "type": "string" }
}
},
"scalar": {
"type": "string"
}
}
},
"array_parent": {
"type": "array",
"items": {
"type": "object",
"properties": {
"leaf": { "type": "string" }
}
}
},
"map_parent": {
"type": "object",
"additionalProperties": {
"type": "object",
"properties": {
"leaf": { "type": "string" }
}
}
},
"union_parent": {
"anyOf": [
{
"type": "object",
"properties": {
"leaf": { "type": "string" }
}
},
{ "type": "string" }
]
}
}
});
super::collapse_deep_schema_objects(&mut schema, /*depth*/ 0);
assert_eq!(
schema,
serde_json::json!({
"type": "object",
"properties": {
"object_parent": {
"type": "object",
"properties": {
"complex": {},
"scalar": {
"type": "string"
}
}
},
"array_parent": {
"type": "array",
"items": {}
},
"map_parent": {
"type": "object",
"additionalProperties": {}
},
"union_parent": {
"anyOf": [
{},
{ "type": "string" }
]
}
}
})
);
}
#[test]
fn parse_tool_input_schema_preserves_string_enum_constraints() {
// Example schema shape: