mirror of
https://github.com/openai/codex.git
synced 2026-05-04 03:16:31 +00:00
chore: apply truncation policy to unified_exec (#19247)
We were not respecting the turn's `truncation_policy` when clamping output tokens for `unified_exec` and `write_stdin`. As a result, truncation was only being applied by `ContextManager` before the output was stored in memory (so it _was_ being truncated from model-visible context), while the full output was still persisted to the rollout on disk. We now respect that `truncation_policy`, and `ContextManager`-level truncation remains as a backup. ### Tests Added tests; tested locally.
This commit is contained in:
@@ -7,6 +7,7 @@ use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use codex_exec_server::CreateDirectoryOptions;
|
||||
use codex_features::Feature;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::protocol::AskForApproval;
|
||||
use codex_protocol::protocol::EventMsg;
|
||||
use codex_protocol::protocol::ExecCommandSource;
|
||||
@@ -157,6 +158,26 @@ fn collect_tool_outputs(bodies: &[Value]) -> Result<HashMap<String, ParsedUnifie
|
||||
Ok(outputs)
|
||||
}
|
||||
|
||||
async fn wait_for_raw_unified_exec_output(
|
||||
test: &TestCodex,
|
||||
call_id: &str,
|
||||
) -> Result<ParsedUnifiedExecOutput> {
|
||||
let content = wait_for_event_match(&test.codex, |event| match event {
|
||||
EventMsg::RawResponseItem(raw) => match &raw.item {
|
||||
ResponseItem::FunctionCallOutput {
|
||||
call_id: output_call_id,
|
||||
output,
|
||||
} if output_call_id == call_id => output.text_content().map(str::to_string),
|
||||
_ => None,
|
||||
},
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
parse_unified_exec_output(&content)
|
||||
.with_context(|| format!("failed to parse raw unified exec output for {call_id}"))
|
||||
}
|
||||
|
||||
async fn submit_unified_exec_turn(
|
||||
test: &TestCodex,
|
||||
prompt: &str,
|
||||
@@ -1230,6 +1251,157 @@ async fn exec_command_reports_chunk_and_exit_metadata() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn exec_command_clamps_model_requested_max_output_tokens_to_policy() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
skip_if_sandbox!(Ok(()));
|
||||
skip_if_windows!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
||||
let mut builder = test_codex().with_model("gpt-5.4").with_config(|config| {
|
||||
config.use_experimental_unified_exec_tool = true;
|
||||
config.tool_output_token_limit = Some(50);
|
||||
config
|
||||
.features
|
||||
.enable(Feature::UnifiedExec)
|
||||
.expect("test config should allow feature update");
|
||||
});
|
||||
let test = builder.build_remote_aware(&server).await?;
|
||||
|
||||
let call_id = "uexec-clamped-max-output";
|
||||
let args = serde_json::json!({
|
||||
"cmd": "line_number=1; while [ \"$line_number\" -le 999 ]; do printf 'EXEC-LINE-%04d xxxxxxxxxxxxxxxxxxxx\\n' \"$line_number\"; line_number=$((line_number + 1)); done",
|
||||
"yield_time_ms": 3_000,
|
||||
"max_output_tokens": 70_000,
|
||||
});
|
||||
|
||||
let responses = vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "exec_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse(vec![
|
||||
ev_response_created("resp-2"),
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
];
|
||||
mount_sse_sequence(&server, responses).await;
|
||||
|
||||
submit_unified_exec_turn(
|
||||
&test,
|
||||
"run clamped max output test",
|
||||
SandboxPolicy::DangerFullAccess,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let output = wait_for_raw_unified_exec_output(&test, call_id).await?;
|
||||
assert_eq!(output.original_token_count, Some(8_991));
|
||||
let output_text = output.output.replace("\r\n", "\n");
|
||||
assert_regex_match(
|
||||
r"^Total output lines: 999\n\nEXEC-LINE-0001 x{20}\nEXEC-LINE-0002 x{20}\nEXEC-LINE-0003 x{13}…8941 tokens truncated…E-0997 x{20}\nEXEC-LINE-0998 x{20}\nEXEC-LINE-0999 x{20}\n$",
|
||||
&output_text,
|
||||
);
|
||||
|
||||
wait_for_event(&test.codex, |event| {
|
||||
matches!(event, EventMsg::TurnComplete(_))
|
||||
})
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn write_stdin_clamps_model_requested_max_output_tokens_to_policy() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
skip_if_sandbox!(Ok(()));
|
||||
skip_if_windows!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
||||
let mut builder = test_codex().with_model("gpt-5.4").with_config(|config| {
|
||||
config.use_experimental_unified_exec_tool = true;
|
||||
config.tool_output_token_limit = Some(50);
|
||||
config
|
||||
.features
|
||||
.enable(Feature::UnifiedExec)
|
||||
.expect("test config should allow feature update");
|
||||
});
|
||||
let test = builder.build_remote_aware(&server).await?;
|
||||
|
||||
let start_call_id = "uexec-stdin-clamp-start";
|
||||
let start_args = serde_json::json!({
|
||||
"cmd": "printf 'READY\\n'; read trigger; line_number=1; while [ \"$line_number\" -le 999 ]; do printf 'STDIN-LINE-%04d yyyyyyyyyyyyyyyyyyyy\\n' \"$line_number\"; line_number=$((line_number + 1)); done",
|
||||
"yield_time_ms": 500,
|
||||
"tty": true,
|
||||
});
|
||||
|
||||
let stdin_call_id = "uexec-stdin-clamped-max-output";
|
||||
let stdin_args = serde_json::json!({
|
||||
"chars": "go\n",
|
||||
"session_id": 1000,
|
||||
"yield_time_ms": 3_000,
|
||||
"max_output_tokens": 70_000,
|
||||
});
|
||||
|
||||
let responses = vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(
|
||||
start_call_id,
|
||||
"exec_command",
|
||||
&serde_json::to_string(&start_args)?,
|
||||
),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse(vec![
|
||||
ev_response_created("resp-2"),
|
||||
ev_function_call(
|
||||
stdin_call_id,
|
||||
"write_stdin",
|
||||
&serde_json::to_string(&stdin_args)?,
|
||||
),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
sse(vec![
|
||||
ev_response_created("resp-3"),
|
||||
ev_assistant_message("msg-1", "done"),
|
||||
ev_completed("resp-3"),
|
||||
]),
|
||||
];
|
||||
mount_sse_sequence(&server, responses).await;
|
||||
|
||||
submit_unified_exec_turn(
|
||||
&test,
|
||||
"run clamped write_stdin output test",
|
||||
SandboxPolicy::DangerFullAccess,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let start_output = wait_for_raw_unified_exec_output(&test, start_call_id).await?;
|
||||
assert!(
|
||||
start_output.process_id.is_some(),
|
||||
"start command should leave a running process for write_stdin"
|
||||
);
|
||||
|
||||
let stdin_output = wait_for_raw_unified_exec_output(&test, stdin_call_id).await?;
|
||||
assert_eq!(stdin_output.original_token_count, Some(9_492));
|
||||
let stdin_output_text = stdin_output.output.replace("\r\n", "\n");
|
||||
assert_regex_match(
|
||||
r"^Total output lines: 1000\n\ngo\nSTDIN-LINE-0001 y{20}\nSTDIN-LINE-0002 y{20}\nSTDIN-LINE-0003 yyyy…9442 tokens truncated…7 y{20}\nSTDIN-LINE-0998 y{20}\nSTDIN-LINE-0999 y{20}\n$",
|
||||
&stdin_output_text,
|
||||
);
|
||||
|
||||
wait_for_event(&test.codex, |event| {
|
||||
matches!(event, EventMsg::TurnComplete(_))
|
||||
})
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn unified_exec_defaults_to_pipe() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
Reference in New Issue
Block a user