chore: apply truncation policy to unified_exec (#19247)

we were not respecting turn's `truncation_policy` to clamp output tokens for `unified_exec` and `write_stdin`. this meant truncation was only being applied by `ContextManager` before the output was stored in-memory (so it _was_ being truncated from model-visible context), but the full output was persisted to rollout on disk. now we respect that `truncation_policy` and `ContextManager`-level truncation remains a backup. ### Tests added tests, tested locally.
2026-05-04 03:16:31 +00:00 · 2026-04-24 00:17:39 -07:00
parent ac8c9fc49c
commit e083b6c757
2 changed files with 189 additions and 4 deletions
--- a/codex-rs/core/tests/suite/unified_exec.rs
+++ b/codex-rs/core/tests/suite/unified_exec.rs
@@ -7,6 +7,7 @@ use anyhow::Context;
 use anyhow::Result;
 use codex_exec_server::CreateDirectoryOptions;
 use codex_features::Feature;
+use codex_protocol::models::ResponseItem;
 use codex_protocol::protocol::AskForApproval;
 use codex_protocol::protocol::EventMsg;
 use codex_protocol::protocol::ExecCommandSource;
@@ -157,6 +158,26 @@ fn collect_tool_outputs(bodies: &[Value]) -> Result<HashMap<String, ParsedUnifie
    Ok(outputs)
 }

+async fn wait_for_raw_unified_exec_output(
+    test: &TestCodex,
+    call_id: &str,
+) -> Result<ParsedUnifiedExecOutput> {
+    let content = wait_for_event_match(&test.codex, |event| match event {
+        EventMsg::RawResponseItem(raw) => match &raw.item {
+            ResponseItem::FunctionCallOutput {
+                call_id: output_call_id,
+                output,
+            } if output_call_id == call_id => output.text_content().map(str::to_string),
+            _ => None,
+        },
+        _ => None,
+    })
+    .await;
+
+    parse_unified_exec_output(&content)
+        .with_context(|| format!("failed to parse raw unified exec output for {call_id}"))
+}
+
 async fn submit_unified_exec_turn(
    test: &TestCodex,
    prompt: &str,
@@ -1230,6 +1251,157 @@ async fn exec_command_reports_chunk_and_exit_metadata() -> Result<()> {
    Ok(())
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn exec_command_clamps_model_requested_max_output_tokens_to_policy() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+    skip_if_sandbox!(Ok(()));
+    skip_if_windows!(Ok(()));
+
+    let server = start_mock_server().await;
+
+    let mut builder = test_codex().with_model("gpt-5.4").with_config(|config| {
+        config.use_experimental_unified_exec_tool = true;
+        config.tool_output_token_limit = Some(50);
+        config
+            .features
+            .enable(Feature::UnifiedExec)
+            .expect("test config should allow feature update");
+    });
+    let test = builder.build_remote_aware(&server).await?;
+
+    let call_id = "uexec-clamped-max-output";
+    let args = serde_json::json!({
+        "cmd": "line_number=1; while [ \"$line_number\" -le 999 ]; do printf 'EXEC-LINE-%04d xxxxxxxxxxxxxxxxxxxx\\n' \"$line_number\"; line_number=$((line_number + 1)); done",
+        "yield_time_ms": 3_000,
+        "max_output_tokens": 70_000,
+    });
+
+    let responses = vec![
+        sse(vec![
+            ev_response_created("resp-1"),
+            ev_function_call(call_id, "exec_command", &serde_json::to_string(&args)?),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_response_created("resp-2"),
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_unified_exec_turn(
+        &test,
+        "run clamped max output test",
+        SandboxPolicy::DangerFullAccess,
+    )
+    .await?;
+
+    let output = wait_for_raw_unified_exec_output(&test, call_id).await?;
+    assert_eq!(output.original_token_count, Some(8_991));
+    let output_text = output.output.replace("\r\n", "\n");
+    assert_regex_match(
+        r"^Total output lines: 999\n\nEXEC-LINE-0001 x{20}\nEXEC-LINE-0002 x{20}\nEXEC-LINE-0003 x{13}…8941 tokens truncated…E-0997 x{20}\nEXEC-LINE-0998 x{20}\nEXEC-LINE-0999 x{20}\n$",
+        &output_text,
+    );
+
+    wait_for_event(&test.codex, |event| {
+        matches!(event, EventMsg::TurnComplete(_))
+    })
+    .await;
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn write_stdin_clamps_model_requested_max_output_tokens_to_policy() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+    skip_if_sandbox!(Ok(()));
+    skip_if_windows!(Ok(()));
+
+    let server = start_mock_server().await;
+
+    let mut builder = test_codex().with_model("gpt-5.4").with_config(|config| {
+        config.use_experimental_unified_exec_tool = true;
+        config.tool_output_token_limit = Some(50);
+        config
+            .features
+            .enable(Feature::UnifiedExec)
+            .expect("test config should allow feature update");
+    });
+    let test = builder.build_remote_aware(&server).await?;
+
+    let start_call_id = "uexec-stdin-clamp-start";
+    let start_args = serde_json::json!({
+        "cmd": "printf 'READY\\n'; read trigger; line_number=1; while [ \"$line_number\" -le 999 ]; do printf 'STDIN-LINE-%04d yyyyyyyyyyyyyyyyyyyy\\n' \"$line_number\"; line_number=$((line_number + 1)); done",
+        "yield_time_ms": 500,
+        "tty": true,
+    });
+
+    let stdin_call_id = "uexec-stdin-clamped-max-output";
+    let stdin_args = serde_json::json!({
+        "chars": "go\n",
+        "session_id": 1000,
+        "yield_time_ms": 3_000,
+        "max_output_tokens": 70_000,
+    });
+
+    let responses = vec![
+        sse(vec![
+            ev_response_created("resp-1"),
+            ev_function_call(
+                start_call_id,
+                "exec_command",
+                &serde_json::to_string(&start_args)?,
+            ),
+            ev_completed("resp-1"),
+        ]),
+        sse(vec![
+            ev_response_created("resp-2"),
+            ev_function_call(
+                stdin_call_id,
+                "write_stdin",
+                &serde_json::to_string(&stdin_args)?,
+            ),
+            ev_completed("resp-2"),
+        ]),
+        sse(vec![
+            ev_response_created("resp-3"),
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-3"),
+        ]),
+    ];
+    mount_sse_sequence(&server, responses).await;
+
+    submit_unified_exec_turn(
+        &test,
+        "run clamped write_stdin output test",
+        SandboxPolicy::DangerFullAccess,
+    )
+    .await?;
+
+    let start_output = wait_for_raw_unified_exec_output(&test, start_call_id).await?;
+    assert!(
+        start_output.process_id.is_some(),
+        "start command should leave a running process for write_stdin"
+    );
+
+    let stdin_output = wait_for_raw_unified_exec_output(&test, stdin_call_id).await?;
+    assert_eq!(stdin_output.original_token_count, Some(9_492));
+    let stdin_output_text = stdin_output.output.replace("\r\n", "\n");
+    assert_regex_match(
+        r"^Total output lines: 1000\n\ngo\nSTDIN-LINE-0001 y{20}\nSTDIN-LINE-0002 y{20}\nSTDIN-LINE-0003 yyyy…9442 tokens truncated…7 y{20}\nSTDIN-LINE-0998 y{20}\nSTDIN-LINE-0999 y{20}\n$",
+        &stdin_output_text,
+    );
+
+    wait_for_event(&test.codex, |event| {
+        matches!(event, EventMsg::TurnComplete(_))
+    })
+    .await;
+
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn unified_exec_defaults_to_pipe() -> Result<()> {
    skip_if_no_network!(Ok(()));