chore: apply truncation policy to unified_exec (#19247)

We were not respecting the turn's `truncation_policy` to clamp output tokens
for `unified_exec` and `write_stdin`.

This meant truncation was only being applied by `ContextManager` before
the output was stored in memory (so it _was_ being truncated from
model-visible context), but the full output was persisted to the rollout on
disk.

Now we respect the turn's `truncation_policy`, and `ContextManager`-level
truncation remains as a backup.

### Tests
added tests, tested locally.
This commit is contained in:
sayan-oai
2026-04-24 00:17:39 -07:00
committed by GitHub
parent ac8c9fc49c
commit e083b6c757
2 changed files with 189 additions and 4 deletions

View File

@@ -7,6 +7,7 @@ use anyhow::Context;
use anyhow::Result;
use codex_exec_server::CreateDirectoryOptions;
use codex_features::Feature;
use codex_protocol::models::ResponseItem;
use codex_protocol::protocol::AskForApproval;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::ExecCommandSource;
@@ -157,6 +158,26 @@ fn collect_tool_outputs(bodies: &[Value]) -> Result<HashMap<String, ParsedUnifie
Ok(outputs)
}
/// Waits for the raw tool output belonging to `call_id` and parses it.
///
/// Events from `test.codex` are inspected via `wait_for_event_match`; the first
/// `EventMsg::RawResponseItem` carrying a `ResponseItem::FunctionCallOutput`
/// whose call id equals `call_id` *and* whose output exposes text content is
/// selected. That text is then handed to `parse_unified_exec_output`.
///
/// # Errors
///
/// Returns the parser's error, wrapped with context naming `call_id`, when the
/// captured text cannot be parsed as unified exec output.
async fn wait_for_raw_unified_exec_output(
    test: &TestCodex,
    call_id: &str,
) -> Result<ParsedUnifiedExecOutput> {
    let raw_text = wait_for_event_match(&test.codex, |event| {
        // Anything that is not a raw response item is irrelevant here.
        let EventMsg::RawResponseItem(raw) = event else {
            return None;
        };
        // Only function-call outputs can carry the tool result text.
        let ResponseItem::FunctionCallOutput {
            call_id: candidate_id,
            output,
        } = &raw.item
        else {
            return None;
        };
        if candidate_id == call_id {
            // A matching item without text content yields `None`, so the
            // wait keeps going rather than returning an empty string.
            output.text_content().map(str::to_string)
        } else {
            None
        }
    })
    .await;

    let parsed = parse_unified_exec_output(&raw_text);
    parsed.with_context(|| format!("failed to parse raw unified exec output for {call_id}"))
}
async fn submit_unified_exec_turn(
test: &TestCodex,
prompt: &str,
@@ -1230,6 +1251,157 @@ async fn exec_command_reports_chunk_and_exit_metadata() -> Result<()> {
Ok(())
}
/// `exec_command` output must be truncated according to the configured tool
/// output token limit even when the model asks for a much larger
/// `max_output_tokens`.
///
/// The config caps tool output at 50 tokens while the mocked model call
/// requests 70_000. The assertions are made on the *raw* response item (see
/// `wait_for_raw_unified_exec_output`) rather than on any later-processed copy.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn exec_command_clamps_model_requested_max_output_tokens_to_policy() -> Result<()> {
    skip_if_no_network!(Ok(()));
    skip_if_sandbox!(Ok(()));
    skip_if_windows!(Ok(()));

    let server = start_mock_server().await;

    let mut builder = test_codex().with_model("gpt-5.4").with_config(|cfg| {
        cfg.use_experimental_unified_exec_tool = true;
        // Deliberately tiny limit so the 999-line output has to be truncated.
        cfg.tool_output_token_limit = Some(50);
        cfg.features
            .enable(Feature::UnifiedExec)
            .expect("test config should allow feature update");
    });
    let test = builder.build_remote_aware(&server).await?;

    let exec_call_id = "uexec-clamped-max-output";
    // Prints 999 numbered lines, each padded with twenty `x` characters.
    let shell_script = "line_number=1; while [ \"$line_number\" -le 999 ]; do printf 'EXEC-LINE-%04d xxxxxxxxxxxxxxxxxxxx\\n' \"$line_number\"; line_number=$((line_number + 1)); done";
    let exec_args = serde_json::json!({
        "cmd": shell_script,
        "yield_time_ms": 3_000,
        "max_output_tokens": 70_000,
    });
    let exec_args_json = serde_json::to_string(&exec_args)?;

    // First response issues the tool call; second response ends the turn.
    let tool_call_response = sse(vec![
        ev_response_created("resp-1"),
        ev_function_call(exec_call_id, "exec_command", &exec_args_json),
        ev_completed("resp-1"),
    ]);
    let final_response = sse(vec![
        ev_response_created("resp-2"),
        ev_assistant_message("msg-1", "done"),
        ev_completed("resp-2"),
    ]);
    mount_sse_sequence(&server, vec![tool_call_response, final_response]).await;

    submit_unified_exec_turn(
        &test,
        "run clamped max output test",
        SandboxPolicy::DangerFullAccess,
    )
    .await?;

    let parsed = wait_for_raw_unified_exec_output(&test, exec_call_id).await?;
    assert_eq!(parsed.original_token_count, Some(8_991));

    // Normalize CRLF so the pattern below only has to deal with `\n`.
    let normalized = parsed.output.replace("\r\n", "\n");
    // The marker reports 8941 truncated tokens, i.e. 8991 minus the 50-token limit.
    assert_regex_match(
        r"^Total output lines: 999\n\nEXEC-LINE-0001 x{20}\nEXEC-LINE-0002 x{20}\nEXEC-LINE-0003 x{13}…8941 tokens truncated…E-0997 x{20}\nEXEC-LINE-0998 x{20}\nEXEC-LINE-0999 x{20}\n$",
        &normalized,
    );

    // Let the turn finish before the test returns.
    wait_for_event(&test.codex, |event| matches!(event, EventMsg::TurnComplete(_))).await;

    Ok(())
}
/// `write_stdin` output must be truncated according to the configured tool
/// output token limit even when the model asks for a much larger
/// `max_output_tokens`.
///
/// A first `exec_command` call starts a process that prints `READY` and then
/// blocks on `read`. A follow-up `write_stdin` call sends `go\n`, which lets
/// the script emit 999 lines. The config caps tool output at 50 tokens while
/// the `write_stdin` call requests 70_000.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn write_stdin_clamps_model_requested_max_output_tokens_to_policy() -> Result<()> {
    skip_if_no_network!(Ok(()));
    skip_if_sandbox!(Ok(()));
    skip_if_windows!(Ok(()));

    let server = start_mock_server().await;

    let mut builder = test_codex().with_model("gpt-5.4").with_config(|cfg| {
        cfg.use_experimental_unified_exec_tool = true;
        // Deliberately tiny limit so the large output has to be truncated.
        cfg.tool_output_token_limit = Some(50);
        cfg.features
            .enable(Feature::UnifiedExec)
            .expect("test config should allow feature update");
    });
    let test = builder.build_remote_aware(&server).await?;

    let start_call_id = "uexec-stdin-clamp-start";
    // Announces readiness, waits for one line of input, then prints 999 lines.
    let start_script = "printf 'READY\\n'; read trigger; line_number=1; while [ \"$line_number\" -le 999 ]; do printf 'STDIN-LINE-%04d yyyyyyyyyyyyyyyyyyyy\\n' \"$line_number\"; line_number=$((line_number + 1)); done";
    let start_args = serde_json::json!({
        "cmd": start_script,
        "yield_time_ms": 500,
        "tty": true,
    });
    let start_args_json = serde_json::to_string(&start_args)?;

    let stdin_call_id = "uexec-stdin-clamped-max-output";
    // NOTE(review): 1000 is presumably the id assigned to the process started
    // above; it is hard-coded because the responses are mounted up front.
    let stdin_args = serde_json::json!({
        "chars": "go\n",
        "session_id": 1000,
        "yield_time_ms": 3_000,
        "max_output_tokens": 70_000,
    });
    let stdin_args_json = serde_json::to_string(&stdin_args)?;

    // Three model responses: start the process, feed stdin, end the turn.
    let start_response = sse(vec![
        ev_response_created("resp-1"),
        ev_function_call(start_call_id, "exec_command", &start_args_json),
        ev_completed("resp-1"),
    ]);
    let stdin_response = sse(vec![
        ev_response_created("resp-2"),
        ev_function_call(stdin_call_id, "write_stdin", &stdin_args_json),
        ev_completed("resp-2"),
    ]);
    let final_response = sse(vec![
        ev_response_created("resp-3"),
        ev_assistant_message("msg-1", "done"),
        ev_completed("resp-3"),
    ]);
    mount_sse_sequence(
        &server,
        vec![start_response, stdin_response, final_response],
    )
    .await;

    submit_unified_exec_turn(
        &test,
        "run clamped write_stdin output test",
        SandboxPolicy::DangerFullAccess,
    )
    .await?;

    let started = wait_for_raw_unified_exec_output(&test, start_call_id).await?;
    assert!(
        started.process_id.is_some(),
        "start command should leave a running process for write_stdin"
    );

    let written = wait_for_raw_unified_exec_output(&test, stdin_call_id).await?;
    assert_eq!(written.original_token_count, Some(9_492));

    // Normalize CRLF so the pattern below only has to deal with `\n`.
    let normalized = written.output.replace("\r\n", "\n");
    // The leading `go` line is presumably the tty echo of the input, which
    // would account for 1000 total lines. The marker reports 9442 truncated
    // tokens, i.e. 9492 minus the 50-token limit.
    assert_regex_match(
        r"^Total output lines: 1000\n\ngo\nSTDIN-LINE-0001 y{20}\nSTDIN-LINE-0002 y{20}\nSTDIN-LINE-0003 yyyy…9442 tokens truncated…7 y{20}\nSTDIN-LINE-0998 y{20}\nSTDIN-LINE-0999 y{20}\n$",
        &normalized,
    );

    // Let the turn finish before the test returns.
    wait_for_event(&test.codex, |event| matches!(event, EventMsg::TurnComplete(_))).await;

    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn unified_exec_defaults_to_pipe() -> Result<()> {
skip_if_no_network!(Ok(()));