mirror of
https://github.com/openai/codex.git
synced 2026-05-23 20:44:50 +00:00
feat: best-effort compact large tool schemas (#23904)
## Why
The `dev/cc/ref-def` branch preserves richer JSON Schema detail for
connector tools, including `$defs` and nested shapes. That improves
fidelity, but it pushes the largest connector schemas well past the
intended tool-schema budget. This PR adds a best-effort compaction pass
for unusually large tool input schemas so the p99 and max tails stay
small while ordinary schemas are left alone.
## What Changed
- Added best-effort large-schema compaction in
`codex-rs/tools/src/json_schema.rs` after schema sanitization and
definition pruning.
- Compaction runs as a waterfall only while the compact JSON budget
proxy is exceeded:
1. Strip schema `description` metadata.
2. Drop root `$defs` / `definitions`.
3. Collapse deep nested complex schema objects to `{}`.
- Kept top-level argument names and immediate schema shape where
possible.
## Corpus Results
Scope: 2,025 schemas under `golden_schemas`, all parsed successfully.
Token count is `o200k_base` over compact JSON from
`parse_tool_input_schema`.
| Percentile | Before `origin/main` `4dbca61e20` | After branch
`dev/cc/ref-def` `f9bf071758` | After this PR |
|---|---:|---:|---:|
| p0 | 9 | 9 | 9 |
| p10 | 59 | 63 | 63 |
| p25 | 81 | 86 | 86 |
| p50 | 114 | 127 | 125 |
| p75 | 174 | 205 | 202 |
| p90 | 295 | 335 | 322 |
| p95 | 391 | 526 | 422 |
| p99 | 794 | 1,303 | 689 |
| max | 2,836 | 3,337 | 887 |
After this PR, `0 / 2,025` schemas are over 1k tokens.
### Compaction Savings
These are cumulative waterfall stages over the same corpus. Later passes
only run for schemas that are still over the compact JSON budget proxy.
| Stage | Total tokens | Step savings | Schemas changed by step |
|---|---:|---:|---:|
| No compaction | 391,862 | - | - |
| Strip schema `description` metadata | 350,961 | 40,901 | 66 |
| Drop root `$defs` / `definitions` | 340,683 | 10,278 | 13 |
| Collapse deep complex schemas to `{}` | 335,875 | 4,808 | 6 |
This commit is contained in:
@@ -160,6 +160,7 @@ pub fn parse_tool_input_schema(input_schema: &JsonValue) -> Result<JsonSchema, s
|
||||
let mut input_schema = input_schema.clone();
|
||||
sanitize_json_schema(&mut input_schema);
|
||||
prune_unreachable_definitions(&mut input_schema);
|
||||
compact_large_tool_schema(&mut input_schema);
|
||||
let schema: JsonSchema = serde_json::from_value(input_schema)?;
|
||||
if matches!(
|
||||
schema.schema_type,
|
||||
@@ -170,6 +171,47 @@ pub fn parse_tool_input_schema(input_schema: &JsonValue) -> Result<JsonSchema, s
|
||||
Ok(schema)
|
||||
}
|
||||
|
||||
// Use compact normalized JSON bytes as a cheap local proxy for the 1k-token
|
||||
// schema budget.
|
||||
const MAX_COMPACT_TOOL_SCHEMA_BYTES: usize = 4_000;
|
||||
const MAX_COMPACT_TOOL_SCHEMA_DEPTH: usize = 2;
|
||||
|
||||
/// Shrink unusually large tool schemas while preserving the top-level argument
|
||||
/// surface. Compaction is best-effort rather than a hard cap: it runs only
|
||||
/// after schema sanitization/pruning and applies increasingly lossy passes
|
||||
/// while the schema remains over budget.
|
||||
fn compact_large_tool_schema(value: &mut JsonValue) {
|
||||
for pass in LARGE_SCHEMA_COMPACTION_PASSES {
|
||||
if compact_schema_fits_budget(value) {
|
||||
break;
|
||||
}
|
||||
pass(value);
|
||||
}
|
||||
}
|
||||
|
||||
type LargeSchemaCompactionPass = fn(&mut JsonValue);
|
||||
|
||||
const LARGE_SCHEMA_COMPACTION_PASSES: &[LargeSchemaCompactionPass] = &[
|
||||
strip_schema_descriptions,
|
||||
drop_schema_definitions,
|
||||
collapse_deep_schema_objects_from_root,
|
||||
];
|
||||
|
||||
fn collapse_deep_schema_objects_from_root(value: &mut JsonValue) {
|
||||
collapse_deep_schema_objects(value, /*depth*/ 0);
|
||||
}
|
||||
|
||||
fn compact_schema_fits_budget(value: &JsonValue) -> bool {
|
||||
compact_normalized_schema_len(value) <= MAX_COMPACT_TOOL_SCHEMA_BYTES
|
||||
}
|
||||
|
||||
fn compact_normalized_schema_len(value: &JsonValue) -> usize {
|
||||
serde_json::from_value::<JsonSchema>(value.clone())
|
||||
.and_then(|schema| serde_json::to_vec(&schema))
|
||||
.map(|json| json.len())
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
enum DefinitionTraversal {
|
||||
Include,
|
||||
@@ -214,6 +256,130 @@ fn for_each_schema_child(
|
||||
}
|
||||
}
|
||||
|
||||
fn strip_schema_descriptions(value: &mut JsonValue) {
|
||||
match value {
|
||||
JsonValue::Array(values) => {
|
||||
for value in values {
|
||||
strip_schema_descriptions(value);
|
||||
}
|
||||
}
|
||||
JsonValue::Object(map) => {
|
||||
map.remove("description");
|
||||
for_each_schema_child_mut(map, DefinitionTraversal::Include, &mut |value| {
|
||||
strip_schema_descriptions(value);
|
||||
});
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn for_each_schema_child_mut(
|
||||
map: &mut serde_json::Map<String, JsonValue>,
|
||||
definition_traversal: DefinitionTraversal,
|
||||
visitor: &mut impl FnMut(&mut JsonValue),
|
||||
) {
|
||||
if let Some(properties) = map.get_mut("properties")
|
||||
&& let Some(properties_map) = properties.as_object_mut()
|
||||
{
|
||||
for value in properties_map.values_mut() {
|
||||
visitor(value);
|
||||
}
|
||||
}
|
||||
|
||||
for key in SCHEMA_CHILD_KEYS {
|
||||
if let Some(value) = map.get_mut(key) {
|
||||
visitor(value);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(additional_properties) = map.get_mut("additionalProperties")
|
||||
&& !matches!(additional_properties, JsonValue::Bool(_))
|
||||
{
|
||||
visitor(additional_properties);
|
||||
}
|
||||
|
||||
if definition_traversal == DefinitionTraversal::Include {
|
||||
for key in DEFINITION_TABLE_KEYS {
|
||||
if let Some(definitions) = map.get_mut(key)
|
||||
&& let Some(definitions_map) = definitions.as_object_mut()
|
||||
{
|
||||
for value in definitions_map.values_mut() {
|
||||
visitor(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Replace local definition refs with empty schemas before dropping root
|
||||
/// definition tables, so downstream behavior does not depend on how a schema
|
||||
/// parser handles refs to missing definitions.
|
||||
fn drop_schema_definitions(value: &mut JsonValue) {
|
||||
rewrite_definition_refs_to_empty_schemas(value);
|
||||
|
||||
let JsonValue::Object(map) = value else {
|
||||
return;
|
||||
};
|
||||
|
||||
for key in DEFINITION_TABLE_KEYS {
|
||||
map.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
fn rewrite_definition_refs_to_empty_schemas(value: &mut JsonValue) {
|
||||
match value {
|
||||
JsonValue::Array(values) => {
|
||||
for value in values {
|
||||
rewrite_definition_refs_to_empty_schemas(value);
|
||||
}
|
||||
}
|
||||
JsonValue::Object(map) => {
|
||||
if map
|
||||
.get("$ref")
|
||||
.and_then(JsonValue::as_str)
|
||||
.and_then(parse_local_definition_ref)
|
||||
.is_some()
|
||||
{
|
||||
*value = json!({});
|
||||
return;
|
||||
}
|
||||
|
||||
for_each_schema_child_mut(map, DefinitionTraversal::Skip, &mut |value| {
|
||||
rewrite_definition_refs_to_empty_schemas(value);
|
||||
});
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn collapse_deep_schema_objects(value: &mut JsonValue, depth: usize) {
|
||||
match value {
|
||||
JsonValue::Array(values) => {
|
||||
for value in values {
|
||||
collapse_deep_schema_objects(value, depth);
|
||||
}
|
||||
}
|
||||
JsonValue::Object(map) => {
|
||||
if depth >= MAX_COMPACT_TOOL_SCHEMA_DEPTH && is_complex_schema_object(map) {
|
||||
*value = json!({});
|
||||
return;
|
||||
}
|
||||
|
||||
for_each_schema_child_mut(map, DefinitionTraversal::Skip, &mut |value| {
|
||||
collapse_deep_schema_objects(value, depth + 1);
|
||||
});
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_complex_schema_object(map: &serde_json::Map<String, JsonValue>) -> bool {
|
||||
SCHEMA_CHILD_KEYS.iter().any(|key| map.contains_key(*key))
|
||||
|| map.contains_key("properties")
|
||||
|| map.contains_key("additionalProperties")
|
||||
|| map.contains_key("$ref")
|
||||
}
|
||||
|
||||
/// Sanitize a JSON Schema (as serde_json::Value) so it can fit our limited
|
||||
/// schema representation. This function:
|
||||
/// - Ensures every typed schema object has a `"type"` when required.
|
||||
|
||||
@@ -779,6 +779,393 @@ fn parse_tool_input_schema_preserves_explicit_enum_type_union() {
|
||||
);
|
||||
}
|
||||
|
||||
fn many_string_properties(count: usize) -> serde_json::Map<String, serde_json::Value> {
|
||||
(0..count)
|
||||
.map(|index| {
|
||||
(
|
||||
format!("field_{index:03}"),
|
||||
serde_json::json!({ "type": "string" }),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_large_tool_input_schema_stops_after_descriptions_when_under_budget() {
|
||||
let schema = parse_tool_input_schema(&serde_json::json!({
|
||||
"type": "object",
|
||||
"description": "x".repeat(4_500),
|
||||
"properties": {
|
||||
"metadata": {
|
||||
"$ref": "#/$defs/metadata"
|
||||
}
|
||||
},
|
||||
"$defs": {
|
||||
"metadata": {
|
||||
"type": "string",
|
||||
"description": "Metadata value"
|
||||
}
|
||||
}
|
||||
}))
|
||||
.expect("parse schema");
|
||||
|
||||
assert_eq!(
|
||||
serde_json::to_value(schema).expect("serialize schema"),
|
||||
serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"metadata": {
|
||||
"$ref": "#/$defs/metadata"
|
||||
}
|
||||
},
|
||||
"$defs": {
|
||||
"metadata": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_large_tool_input_schema_ignores_dropped_metadata_for_budget() {
|
||||
let schema = parse_tool_input_schema(&serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"event": {
|
||||
"type": "object",
|
||||
"title": "Calendar event",
|
||||
"properties": {
|
||||
"recurrence": {
|
||||
"type": "object",
|
||||
"examples": [
|
||||
{
|
||||
"payload": "x".repeat(4_500)
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"title": "Recurrence pattern"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}))
|
||||
.expect("parse schema");
|
||||
|
||||
assert_eq!(
|
||||
serde_json::to_value(schema).expect("serialize schema"),
|
||||
serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"event": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"recurrence": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_large_tool_input_schema_stops_after_dropping_root_definitions_when_under_budget() {
|
||||
let schema = parse_tool_input_schema(&serde_json::json!({
|
||||
"type": "object",
|
||||
"description": "x".repeat(4_500),
|
||||
"properties": {
|
||||
"event": {
|
||||
"type": "object",
|
||||
"description": "Calendar event",
|
||||
"properties": {
|
||||
"recurrence": {
|
||||
"type": "object",
|
||||
"description": "Recurrence settings",
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"description": "Recurrence pattern"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"$ref": "#/$defs/metadata"
|
||||
}
|
||||
},
|
||||
"$defs": {
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"description": "metadata object",
|
||||
"properties": many_string_properties(/*count*/ 300)
|
||||
}
|
||||
}
|
||||
}))
|
||||
.expect("parse schema");
|
||||
|
||||
assert_eq!(
|
||||
serde_json::to_value(schema).expect("serialize schema"),
|
||||
serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"event": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"recurrence": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"pattern": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"metadata": {}
|
||||
}
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_large_tool_input_schema_strips_descriptions_without_removing_description_property() {
|
||||
let schema = parse_tool_input_schema(&serde_json::json!({
|
||||
"type": "object",
|
||||
"description": "x".repeat(4_500),
|
||||
"properties": {
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "User-facing description value"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"description": "Metadata object",
|
||||
"properties": {
|
||||
"label": {
|
||||
"type": "string",
|
||||
"description": "Metadata label"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"description": "Tag list",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"description": "Tag value"
|
||||
}
|
||||
},
|
||||
"extras": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "string",
|
||||
"description": "Extra value"
|
||||
}
|
||||
},
|
||||
"choice": {
|
||||
"description": "Choice value",
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string",
|
||||
"description": "String choice"
|
||||
},
|
||||
{
|
||||
"type": "number",
|
||||
"description": "Number choice"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}))
|
||||
.expect("parse schema");
|
||||
|
||||
assert_eq!(
|
||||
serde_json::to_value(schema).expect("serialize schema"),
|
||||
serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"choice": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"extras": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"label": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_large_tool_input_schema_preserves_object_enum_literal_descriptions() {
|
||||
let schema = parse_tool_input_schema(&serde_json::json!({
|
||||
"type": "object",
|
||||
"description": "x".repeat(4_500),
|
||||
"properties": {
|
||||
"choice": {
|
||||
"enum": [
|
||||
{
|
||||
"description": "first literal",
|
||||
"id": 1
|
||||
},
|
||||
{
|
||||
"description": "second literal",
|
||||
"id": 2
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}))
|
||||
.expect("parse schema");
|
||||
|
||||
assert_eq!(
|
||||
serde_json::to_value(schema).expect("serialize schema"),
|
||||
serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"choice": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
{
|
||||
"description": "first literal",
|
||||
"id": 1
|
||||
},
|
||||
{
|
||||
"description": "second literal",
|
||||
"id": 2
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn collapse_deep_schema_objects_traverses_schema_children() {
|
||||
let mut schema = serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"object_parent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"complex": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"leaf": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"scalar": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"array_parent": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"leaf": { "type": "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"map_parent": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"leaf": { "type": "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"union_parent": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"leaf": { "type": "string" }
|
||||
}
|
||||
},
|
||||
{ "type": "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
super::collapse_deep_schema_objects(&mut schema, /*depth*/ 0);
|
||||
|
||||
assert_eq!(
|
||||
schema,
|
||||
serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"object_parent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"complex": {},
|
||||
"scalar": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"array_parent": {
|
||||
"type": "array",
|
||||
"items": {}
|
||||
},
|
||||
"map_parent": {
|
||||
"type": "object",
|
||||
"additionalProperties": {}
|
||||
},
|
||||
"union_parent": {
|
||||
"anyOf": [
|
||||
{},
|
||||
{ "type": "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_tool_input_schema_preserves_string_enum_constraints() {
|
||||
// Example schema shape:
|
||||
|
||||
Reference in New Issue
Block a user