Allow targeted rust CI repros without sharding

Pass repro filterset through env
Add targeted rust CI repro workflow
2026-05-21 19:45:26 +00:00 · 2026-05-19 21:37:06 -07:00 · 2026-05-19 21:37:06 -07:00 · 2026-05-19 21:37:06 -07:00 · 2026-05-20 04:02:14 +00:00 · 2026-05-19 20:48:37 -07:00
15 changed files with 436 additions and 82 deletions
--- a/.github/workflows/rust-ci-full-nextest-platform.yml
+++ b/.github/workflows/rust-ci-full-nextest-platform.yml
@@ -47,6 +47,18 @@ on:
        required: false
        default: false
        type: boolean
+      nextest_filterset:
+        required: false
+        default: "all()"
+        type: string
+      selected_shard:
+        required: false
+        default: 0
+        type: number
+      repeat_count:
+        required: false
+        default: 1
+        type: number

 # Caller workflow-level env does not flow through workflow_call, so keep the
 # Cargo git transport hardening on the archive and shard jobs directly here.
@@ -76,6 +88,19 @@ jobs:
        with:
          persist-credentials: false

+      - name: Validate targeted test inputs
+        shell: bash
+        run: |
+          set -euo pipefail
+          # Match integers textually: (( 1.5 < 0 )) is an arithmetic error that
+          # `if` treats as false, so fractional inputs would pass validation.
+          if [[ ! "${{ inputs.selected_shard }}" =~ ^[0-4]$ ]]; then
+            echo "selected_shard must be between 0 and 4" >&2; exit 1
+          fi
+          if [[ ! "${{ inputs.repeat_count }}" =~ ^[1-9][0-9]*$ ]]; then
+            echo "repeat_count must be at least 1" >&2; exit 1
+          fi
+
      - name: Configure Dev Drive (Windows)
        if: ${{ runner.os == 'Windows' }}
        shell: pwsh
@@ -287,7 +312,7 @@ jobs:
          } >> "$GITHUB_STEP_SUMMARY"

  shard:
-    name: Tests shard ${{ matrix.shard }}/4
+    name: Tests ${{ matrix.shard == 0 && 'all selected tests' || format('shard {0}/4', matrix.shard) }}
    needs: archive
    runs-on: ${{ inputs.runner_group != '' && fromJSON(format('{{"group":"{0}","labels":"{1}"}}', inputs.runner_group, inputs.runner_labels)) || inputs.runner }}
    timeout-minutes: 60
@@ -300,7 +325,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        shard: [1, 2, 3, 4]
+        shard: ${{ inputs.selected_shard == 0 && inputs.nextest_filterset == 'all()' && fromJSON('[1, 2, 3, 4]') || fromJSON(format('[{0}]', inputs.selected_shard)) }}  # full run: 4 shards; targeted run: 1 job
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
@@ -391,8 +416,11 @@ jobs:
            --no-fail-fast
            --archive-file "${archive_file}"
            --workspace-remap "${workspace_root}"
-            --partition "hash:${{ matrix.shard }}/4"
+            --filterset "${NEXTEST_FILTERSET}"
          )
+          if [[ "${{ matrix.shard }}" != "0" ]]; then
+            nextest_args+=(--partition "hash:${{ matrix.shard }}/4")
+          fi
          if [[ "${{ inputs.test_threads }}" != "0" ]]; then
            nextest_args+=(--test-threads "${{ inputs.test_threads }}")
          fi
@@ -417,8 +445,12 @@ jobs:
            )
          fi

-          "${test_command[@]}"
+          for attempt in $(seq 1 "${{ inputs.repeat_count }}"); do
+            echo "nextest attempt ${attempt}/${{ inputs.repeat_count }}"
+            "${test_command[@]}"
+          done
        env:
+          NEXTEST_FILTERSET: ${{ inputs.nextest_filterset }}
          RUST_BACKTRACE: 1
          RUST_MIN_STACK: "8388608" # 8 MiB
          NEXTEST_STATUS_LEVEL: leak
--- a/.github/workflows/rust-ci-full.yml
+++ b/.github/workflows/rust-ci-full.yml
@@ -5,6 +5,34 @@ on:
      - main
      - "**full-ci**"
  workflow_dispatch:
+    inputs:
+      repro_platform:
+        description: Platform lane to run. Use all for the normal full workflow.
+        required: true
+        default: all
+        type: choice
+        options:
+          - all
+          - macos-aarch64
+          - linux-x64-remote
+          - linux-arm64
+          - windows-x64
+          - windows-arm64
+      nextest_filterset:
+        description: cargo-nextest filterset selecting the tests to run.
+        required: true
+        default: all()
+        type: string
+      shard:
+        description: Full-CI shard to reproduce. Use 0 to run selected tests without sharding.
+        required: true
+        default: 0
+        type: number
+      repeat_count:
+        description: Number of times to rerun the selected shard/filterset in one job.
+        required: true
+        default: 1
+        type: number

 # CI builds in debug (dev) for faster signal.
 env:
@@ -16,6 +44,7 @@ env:
 jobs:
  # --- CI that doesn't need specific targets ---------------------------------
  general:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' }}
    name: Format / etc
    runs-on: ubuntu-24.04
    defaults:
@@ -32,6 +61,7 @@ jobs:
        run: cargo fmt -- --config imports_granularity=Item --check

  cargo_shear:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' }}
    name: cargo shear
    runs-on: ubuntu-24.04
    defaults:
@@ -49,6 +79,7 @@ jobs:
        run: cargo shear --deny-warnings

  argument_comment_lint_package:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' }}
    name: Argument comment lint package
    runs-on: ubuntu-24.04
    env:
@@ -90,6 +121,7 @@ jobs:
          RUST_MIN_STACK: "8388608" # 8 MiB

  argument_comment_lint_prebuilt:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' }}
    name: Argument comment lint - ${{ matrix.name }}
    runs-on: ${{ matrix.runs_on || matrix.runner }}
    timeout-minutes: 30
@@ -149,6 +181,7 @@ jobs:

  # --- CI to validate on different os/targets --------------------------------
  lint_build:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' }}
    name: Lint/Build — ${{ matrix.runner }} - ${{ matrix.target }}${{ matrix.profile == 'release' && ' (release)' || '' }}
    runs-on: ${{ matrix.runs_on || matrix.runner }}
    timeout-minutes: 30
@@ -522,6 +555,7 @@ jobs:
          key: apt-${{ matrix.runner }}-${{ matrix.target }}-v1

  tests_macos_aarch64:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' || github.event.inputs.repro_platform == 'macos-aarch64' }}
    name: Tests — macos-15-xlarge - aarch64-apple-darwin
    uses: ./.github/workflows/rust-ci-full-nextest-platform.yml
    with:
@@ -530,9 +564,13 @@ jobs:
      profile: ci-test
      artifact_id: macos-aarch64
      use_sccache: true
+      nextest_filterset: ${{ github.event.inputs.nextest_filterset || 'all()' }}
+      selected_shard: ${{ fromJSON(github.event.inputs.shard || '0') }}
+      repeat_count: ${{ fromJSON(github.event.inputs.repeat_count || '1') }}
    secrets: inherit

  tests_linux_x64_remote:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' || github.event.inputs.repro_platform == 'linux-x64-remote' }}
    name: Tests — ubuntu-24.04 - x86_64-unknown-linux-gnu (remote)
    uses: ./.github/workflows/rust-ci-full-nextest-platform.yml
    with:
@@ -544,9 +582,13 @@ jobs:
      artifact_id: linux-x64-remote
      remote_env: true
      use_sccache: true
+      nextest_filterset: ${{ github.event.inputs.nextest_filterset || 'all()' }}
+      selected_shard: ${{ fromJSON(github.event.inputs.shard || '0') }}
+      repeat_count: ${{ fromJSON(github.event.inputs.repeat_count || '1') }}
    secrets: inherit

  tests_linux_arm64:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' || github.event.inputs.repro_platform == 'linux-arm64' }}
    name: Tests — ubuntu-24.04-arm - aarch64-unknown-linux-gnu
    uses: ./.github/workflows/rust-ci-full-nextest-platform.yml
    with:
@@ -557,9 +599,13 @@ jobs:
      profile: ci-test
      artifact_id: linux-arm64
      use_sccache: true
+      nextest_filterset: ${{ github.event.inputs.nextest_filterset || 'all()' }}
+      selected_shard: ${{ fromJSON(github.event.inputs.shard || '0') }}
+      repeat_count: ${{ fromJSON(github.event.inputs.repeat_count || '1') }}
    secrets: inherit

  tests_windows_x64:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' || github.event.inputs.repro_platform == 'windows-x64' }}
    name: Tests — windows-x64 - x86_64-pc-windows-msvc
    uses: ./.github/workflows/rust-ci-full-nextest-platform.yml
    with:
@@ -570,9 +616,13 @@ jobs:
      profile: ci-test
      artifact_id: windows-x64
      test_threads: 8
+      nextest_filterset: ${{ github.event.inputs.nextest_filterset || 'all()' }}
+      selected_shard: ${{ fromJSON(github.event.inputs.shard || '0') }}
+      repeat_count: ${{ fromJSON(github.event.inputs.repeat_count || '1') }}
    secrets: inherit

  tests_windows_arm64:
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.repro_platform == 'all' || github.event.inputs.repro_platform == 'windows-arm64' }}
    name: Tests — windows-arm64 - aarch64-pc-windows-msvc
    uses: ./.github/workflows/rust-ci-full-nextest-platform.yml
    with:
@@ -587,6 +637,9 @@ jobs:
      artifact_id: windows-arm64
      test_threads: 8
      use_sccache: true
+      nextest_filterset: ${{ github.event.inputs.nextest_filterset || 'all()' }}
+      selected_shard: ${{ fromJSON(github.event.inputs.shard || '0') }}
+      repeat_count: ${{ fromJSON(github.event.inputs.repeat_count || '1') }}
    secrets: inherit

  # --- Gatherer job for the full post-merge workflow --------------------------
@@ -621,16 +674,29 @@ jobs:
          echo "test arm64 : ${{ needs.tests_linux_arm64.result }}"
          echo "test winx64: ${{ needs.tests_windows_x64.result }}"
          echo "test winarm: ${{ needs.tests_windows_arm64.result }}"
-          [[ '${{ needs.argument_comment_lint_package.result }}' == 'success' ]] || { echo 'argument_comment_lint_package failed'; exit 1; }
-          [[ '${{ needs.argument_comment_lint_prebuilt.result }}' == 'success' ]] || { echo 'argument_comment_lint_prebuilt failed'; exit 1; }
-          [[ '${{ needs.general.result }}' == 'success' ]] || { echo 'general failed'; exit 1; }
-          [[ '${{ needs.cargo_shear.result }}' == 'success' ]] || { echo 'cargo_shear failed'; exit 1; }
-          [[ '${{ needs.lint_build.result }}' == 'success' ]] || { echo 'lint_build failed'; exit 1; }
-          [[ '${{ needs.tests_macos_aarch64.result }}' == 'success' ]] || { echo 'tests_macos_aarch64 failed'; exit 1; }
-          [[ '${{ needs.tests_linux_x64_remote.result }}' == 'success' ]] || { echo 'tests_linux_x64_remote failed'; exit 1; }
-          [[ '${{ needs.tests_linux_arm64.result }}' == 'success' ]] || { echo 'tests_linux_arm64 failed'; exit 1; }
-          [[ '${{ needs.tests_windows_x64.result }}' == 'success' ]] || { echo 'tests_windows_x64 failed'; exit 1; }
-          [[ '${{ needs.tests_windows_arm64.result }}' == 'success' ]] || { echo 'tests_windows_arm64 failed'; exit 1; }
+          if [[ '${{ github.event_name }}' == 'workflow_dispatch' && '${{ github.event.inputs.repro_platform }}' != 'all' ]]; then
+            selected_result=''
+            case '${{ github.event.inputs.repro_platform }}' in
+              macos-aarch64) selected_result='${{ needs.tests_macos_aarch64.result }}' ;;
+              linux-x64-remote) selected_result='${{ needs.tests_linux_x64_remote.result }}' ;;
+              linux-arm64) selected_result='${{ needs.tests_linux_arm64.result }}' ;;
+              windows-x64) selected_result='${{ needs.tests_windows_x64.result }}' ;;
+              windows-arm64) selected_result='${{ needs.tests_windows_arm64.result }}' ;;
+              *) echo 'unknown repro_platform'; exit 1 ;;
+            esac
+            [[ "${selected_result}" == 'success' ]] || { echo "selected repro platform failed: ${selected_result}"; exit 1; }
+          else
+            [[ '${{ needs.argument_comment_lint_package.result }}' == 'success' ]] || { echo 'argument_comment_lint_package failed'; exit 1; }
+            [[ '${{ needs.argument_comment_lint_prebuilt.result }}' == 'success' ]] || { echo 'argument_comment_lint_prebuilt failed'; exit 1; }
+            [[ '${{ needs.general.result }}' == 'success' ]] || { echo 'general failed'; exit 1; }
+            [[ '${{ needs.cargo_shear.result }}' == 'success' ]] || { echo 'cargo_shear failed'; exit 1; }
+            [[ '${{ needs.lint_build.result }}' == 'success' ]] || { echo 'lint_build failed'; exit 1; }
+            [[ '${{ needs.tests_macos_aarch64.result }}' == 'success' ]] || { echo 'tests_macos_aarch64 failed'; exit 1; }
+            [[ '${{ needs.tests_linux_x64_remote.result }}' == 'success' ]] || { echo 'tests_linux_x64_remote failed'; exit 1; }
+            [[ '${{ needs.tests_linux_arm64.result }}' == 'success' ]] || { echo 'tests_linux_arm64 failed'; exit 1; }
+            [[ '${{ needs.tests_windows_x64.result }}' == 'success' ]] || { echo 'tests_windows_x64 failed'; exit 1; }
+            [[ '${{ needs.tests_windows_arm64.result }}' == 'success' ]] || { echo 'tests_windows_arm64 failed'; exit 1; }
+          fi

      - name: sccache summary note
        if: always()
--- a/codex-rs/core/src/tools/context.rs
+++ b/codex-rs/core/src/tools/context.rs
@@ -311,6 +311,7 @@ pub struct ExecCommandToolOutput {
    pub wall_time: Duration,
    /// Raw bytes returned for this unified exec call before any truncation.
    pub raw_output: Vec<u8>,
+    pub truncation_policy: TruncationPolicy,
    pub max_output_tokens: Option<usize>,
    pub process_id: Option<i32>,
    pub exit_code: Option<i32>,
@@ -357,7 +358,9 @@ impl ToolOutput for ExecCommandToolOutput {
            return None;
        }

-        Some(JsonValue::String(self.truncated_output()))
+        Some(JsonValue::String(
+            self.truncated_output(self.model_output_max_tokens()),
+        ))
    }

    fn code_mode_result(&self, _payload: &ToolPayload) -> JsonValue {
@@ -381,7 +384,10 @@ impl ToolOutput for ExecCommandToolOutput {
            exit_code: self.exit_code,
            session_id: self.process_id,
            original_token_count: self.original_token_count,
-            output: self.truncated_output(),
+            output: match self.max_output_tokens {
+                Some(max_tokens) => self.truncated_output(max_tokens),
+                None => String::from_utf8_lossy(&self.raw_output).to_string(),
+            },
        };

        serde_json::to_value(result).unwrap_or_else(|err| {
@@ -391,9 +397,12 @@ impl ToolOutput for ExecCommandToolOutput {
 }

 impl ExecCommandToolOutput {
-    pub(crate) fn truncated_output(&self) -> String {
+    fn model_output_max_tokens(&self) -> usize {
+        resolve_max_tokens(self.max_output_tokens).min(self.truncation_policy.token_budget())
+    }
+
+    pub(crate) fn truncated_output(&self, max_tokens: usize) -> String {
        let text = String::from_utf8_lossy(&self.raw_output).to_string();
-        let max_tokens = resolve_max_tokens(self.max_output_tokens);
        formatted_truncate_text(&text, TruncationPolicy::Tokens(max_tokens))
    }

@@ -420,7 +429,7 @@ impl ExecCommandToolOutput {
        }

        sections.push("Output:".to_string());
-        sections.push(self.truncated_output());
+        sections.push(self.truncated_output(self.model_output_max_tokens()));

        sections.join("\n")
    }
--- a/codex-rs/core/src/tools/context_tests.rs
+++ b/codex-rs/core/src/tools/context_tests.rs
@@ -429,6 +429,7 @@ fn exec_command_tool_output_formats_truncated_response() {
        chunk_id: "abc123".to_string(),
        wall_time: std::time::Duration::from_millis(1250),
        raw_output: b"token one token two token three token four token five".to_vec(),
+        truncation_policy: TruncationPolicy::Tokens(10_000),
        max_output_tokens: Some(4),
        process_id: None,
        exit_code: Some(0),
--- a/codex-rs/core/src/tools/handlers/unified_exec.rs
+++ b/codex-rs/core/src/tools/handlers/unified_exec.rs
@@ -7,10 +7,8 @@ use crate::tools::context::ToolOutput;
 use crate::tools::context::ToolPayload;
 use crate::tools::hook_names::HookToolName;
 use crate::tools::registry::PostToolUsePayload;
-use crate::unified_exec::resolve_max_tokens;
 use codex_protocol::models::AdditionalPermissionProfile;
 use codex_tools::UnifiedExecShellMode;
-use codex_utils_output_truncation::TruncationPolicy;
 use serde::Deserialize;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -72,13 +70,6 @@ fn default_tty() -> bool {
    false
 }

-fn effective_max_output_tokens(
-    max_output_tokens: Option<usize>,
-    truncation_policy: TruncationPolicy,
-) -> usize {
-    resolve_max_tokens(max_output_tokens).min(truncation_policy.token_budget())
-}
-
 #[derive(Debug)]
 pub(crate) struct ResolvedCommand {
    pub(crate) command: Vec<String>,
--- a/codex-rs/core/src/tools/handlers/unified_exec/exec_command.rs
+++ b/codex-rs/core/src/tools/handlers/unified_exec/exec_command.rs
@@ -36,7 +36,6 @@ use super::super::shell_spec::CommandToolOptions;
 use super::super::shell_spec::create_exec_command_tool_with_environment_id;
 use super::ExecCommandArgs;
 use super::ExecCommandEnvironmentArgs;
-use super::effective_max_output_tokens;
 use super::get_command;
 use super::post_unified_exec_tool_use_payload;

@@ -162,8 +161,6 @@ impl ToolExecutor<ToolInvocation> for ExecCommandHandler {
            prefix_rule,
            ..
        } = args;
-        let max_output_tokens =
-            effective_max_output_tokens(max_output_tokens, turn.truncation_policy);

        let exec_permission_approvals_enabled =
            session.features().enabled(Feature::ExecPermissionApprovals);
@@ -241,7 +238,8 @@ impl ToolExecutor<ToolInvocation> for ExecCommandHandler {
                chunk_id: String::new(),
                wall_time: std::time::Duration::ZERO,
                raw_output: output.into_text().into_bytes(),
-                max_output_tokens: Some(max_output_tokens),
+                truncation_policy: turn.truncation_policy,
+                max_output_tokens,
                process_id: None,
                exit_code: None,
                original_token_count: None,
@@ -258,7 +256,7 @@ impl ToolExecutor<ToolInvocation> for ExecCommandHandler {
                    hook_command: hook_command.clone(),
                    process_id,
                    yield_time_ms,
-                    max_output_tokens: Some(max_output_tokens),
+                    max_output_tokens,
                    cwd,
                    sandbox_cwd: turn_environment.cwd.clone(),
                    environment,
@@ -284,7 +282,8 @@ impl ToolExecutor<ToolInvocation> for ExecCommandHandler {
                    chunk_id: generate_chunk_id(),
                    wall_time: output.duration,
                    raw_output: output_text.into_bytes(),
-                    max_output_tokens: Some(max_output_tokens),
+                    truncation_policy: turn.truncation_policy,
+                    max_output_tokens,
                    // Sandbox denial is terminal, so there is no live
                    // process for write_stdin to resume.
                    process_id: None,
--- a/codex-rs/core/src/tools/handlers/unified_exec/write_stdin.rs
+++ b/codex-rs/core/src/tools/handlers/unified_exec/write_stdin.rs
@@ -14,7 +14,6 @@ use codex_tools::ToolSpec;
 use serde::Deserialize;

 use super::super::shell_spec::create_write_stdin_tool;
-use super::effective_max_output_tokens;
 use super::post_unified_exec_tool_use_payload;

 #[derive(Debug, Deserialize)]
@@ -62,8 +61,6 @@ impl ToolExecutor<ToolInvocation> for WriteStdinHandler {
        };

        let args: WriteStdinArgs = parse_arguments(&arguments)?;
-        let max_output_tokens =
-            effective_max_output_tokens(args.max_output_tokens, turn.truncation_policy);
        let response = session
            .services
            .unified_exec_manager
@@ -71,21 +68,29 @@ impl ToolExecutor<ToolInvocation> for WriteStdinHandler {
                process_id: args.session_id,
                input: &args.chars,
                yield_time_ms: args.yield_time_ms,
-                max_output_tokens: Some(max_output_tokens),
+                max_output_tokens: args.max_output_tokens,
+                truncation_policy: turn.truncation_policy,
            })
            .await
            .map_err(|err| {
                FunctionCallError::RespondToModel(format!("write_stdin failed: {err}"))
            })?;

-        let interaction = TerminalInteractionEvent {
-            call_id: response.event_call_id.clone(),
-            process_id: args.session_id.to_string(),
-            stdin: args.chars.clone(),
-        };
-        session
-            .send_event(turn.as_ref(), EventMsg::TerminalInteraction(interaction))
-            .await;
+        // Empty stdin is a background poll, so emit it only while there is
+        // still a live process for the UI to wait on. Non-empty stdin is a real
+        // terminal interaction and should remain visible even if it completes
+        // the process before the response returns.
+        if !args.chars.is_empty() || response.process_id.is_some() {
+            let process_id = response.process_id.unwrap_or(args.session_id);
+            let interaction = TerminalInteractionEvent {
+                call_id: response.event_call_id.clone(),
+                process_id: process_id.to_string(),
+                stdin: args.chars.clone(),
+            };
+            session
+                .send_event(turn.as_ref(), EventMsg::TerminalInteraction(interaction))
+                .await;
+        }

        Ok(boxed_tool_output(response))
    }
--- a/codex-rs/core/src/tools/handlers/unified_exec_tests.rs
+++ b/codex-rs/core/src/tools/handlers/unified_exec_tests.rs
@@ -4,6 +4,7 @@ use crate::shell::default_user_shell;
 use codex_tools::UnifiedExecShellMode;
 use codex_tools::ZshForkConfig;
 use codex_utils_absolute_path::AbsolutePathBuf;
+use codex_utils_output_truncation::TruncationPolicy;
 use pretty_assertions::assert_eq;
 use std::sync::Arc;

@@ -17,6 +18,8 @@ use crate::tools::registry::CoreToolRuntime;
 use crate::turn_diff_tracker::TurnDiffTracker;
 use tokio::sync::Mutex;

+const TEST_TRUNCATION_POLICY: TruncationPolicy = TruncationPolicy::Tokens(10_000);
+
 async fn invocation_for_payload(
    tool_name: &str,
    call_id: &str,
@@ -258,6 +261,7 @@ async fn exec_command_post_tool_use_payload_uses_output_for_noninteractive_one_s
        chunk_id: "chunk-1".to_string(),
        wall_time: std::time::Duration::from_millis(498),
        raw_output: b"three".to_vec(),
+        truncation_policy: TEST_TRUNCATION_POLICY,
        max_output_tokens: None,
        process_id: None,
        exit_code: Some(0),
@@ -287,6 +291,7 @@ async fn exec_command_post_tool_use_payload_uses_output_for_interactive_completi
        chunk_id: "chunk-1".to_string(),
        wall_time: std::time::Duration::from_millis(498),
        raw_output: b"three".to_vec(),
+        truncation_policy: TEST_TRUNCATION_POLICY,
        max_output_tokens: None,
        process_id: None,
        exit_code: Some(0),
@@ -317,6 +322,7 @@ async fn exec_command_post_tool_use_payload_skips_running_sessions() {
        chunk_id: "chunk-1".to_string(),
        wall_time: std::time::Duration::from_millis(498),
        raw_output: b"three".to_vec(),
+        truncation_policy: TEST_TRUNCATION_POLICY,
        max_output_tokens: None,
        process_id: Some(45),
        exit_code: None,
@@ -342,6 +348,7 @@ async fn write_stdin_post_tool_use_payload_uses_original_exec_call_id_and_comman
        chunk_id: "chunk-2".to_string(),
        wall_time: std::time::Duration::from_millis(498),
        raw_output: b"finished\n".to_vec(),
+        truncation_policy: TEST_TRUNCATION_POLICY,
        max_output_tokens: None,
        process_id: None,
        exit_code: Some(0),
@@ -372,6 +379,7 @@ async fn write_stdin_post_tool_use_payload_keeps_parallel_session_metadata_separ
        chunk_id: "chunk-a".to_string(),
        wall_time: std::time::Duration::from_millis(498),
        raw_output: b"alpha\n".to_vec(),
+        truncation_policy: TEST_TRUNCATION_POLICY,
        max_output_tokens: None,
        process_id: None,
        exit_code: Some(0),
@@ -383,6 +391,7 @@ async fn write_stdin_post_tool_use_payload_keeps_parallel_session_metadata_separ
        chunk_id: "chunk-b".to_string(),
        wall_time: std::time::Duration::from_millis(498),
        raw_output: b"beta\n".to_vec(),
+        truncation_policy: TEST_TRUNCATION_POLICY,
        max_output_tokens: None,
        process_id: None,
        exit_code: Some(0),
--- a/codex-rs/core/src/unified_exec/mod.rs
+++ b/codex-rs/core/src/unified_exec/mod.rs
@@ -31,6 +31,7 @@ use codex_exec_server::Environment;
 use codex_network_proxy::NetworkProxy;
 use codex_protocol::models::AdditionalPermissionProfile;
 use codex_utils_absolute_path::AbsolutePathBuf;
+use codex_utils_output_truncation::TruncationPolicy;
 use rand::Rng;
 use rand::rng;
 use tokio::sync::Mutex;
@@ -111,6 +112,7 @@ pub(crate) struct WriteStdinRequest<'a> {
    pub input: &'a str,
    pub yield_time_ms: u64,
    pub max_output_tokens: Option<usize>,
+    pub truncation_policy: TruncationPolicy,
 }

 #[derive(Default)]
--- a/codex-rs/core/src/unified_exec/mod_tests.rs
+++ b/codex-rs/core/src/unified_exec/mod_tests.rs
@@ -10,6 +10,7 @@ use crate::tools::context::ExecCommandToolOutput;
 use crate::unified_exec::WriteStdinRequest;
 use crate::unified_exec::process::OutputHandles;
 use codex_sandboxing::SandboxType;
+use codex_utils_output_truncation::TruncationPolicy;
 use codex_utils_output_truncation::approx_token_count;
 use core_test_support::get_remote_test_env;
 use core_test_support::skip_if_sandbox;
@@ -162,6 +163,7 @@ async fn exec_command_with_tty(
        chunk_id: generate_chunk_id(),
        wall_time,
        raw_output: collected,
+        truncation_policy: turn.truncation_policy,
        max_output_tokens: None,
        process_id: response_process_id,
        exit_code,
@@ -195,6 +197,7 @@ async fn write_stdin(
            input,
            yield_time_ms,
            max_output_tokens: None,
+            truncation_policy: TruncationPolicy::Tokens(10_000),
        })
        .await
 }
@@ -260,7 +263,9 @@ async fn unified_exec_persists_across_requests() -> anyhow::Result<()> {
    )
    .await?;
    assert!(
-        out_2.truncated_output().contains("codex"),
+        out_2
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains("codex"),
        "expected environment variable output"
    );

@@ -301,7 +306,9 @@ async fn multi_unified_exec_sessions() -> anyhow::Result<()> {
        "short command should not report a process id if it exits quickly"
    );
    assert!(
-        !out_2.truncated_output().contains("codex"),
+        !out_2
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains("codex"),
        "short command should run in a fresh shell"
    );

@@ -313,7 +320,9 @@ async fn multi_unified_exec_sessions() -> anyhow::Result<()> {
    )
    .await?;
    assert!(
-        out_3.truncated_output().contains("codex"),
+        out_3
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains("codex"),
        "session should preserve state"
    );

@@ -350,7 +359,9 @@ async fn unified_exec_timeouts() -> anyhow::Result<()> {
    )
    .await?;
    assert!(
-        !out_2.truncated_output().contains(TEST_VAR_VALUE),
+        !out_2
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains(TEST_VAR_VALUE),
        "timeout too short should yield incomplete output"
    );

@@ -359,7 +370,9 @@ async fn unified_exec_timeouts() -> anyhow::Result<()> {
    let out_3 = write_stdin(&session, process_id, "", /*yield_time_ms*/ 100).await?;

    assert!(
-        out_3.truncated_output().contains(TEST_VAR_VALUE),
+        out_3
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains(TEST_VAR_VALUE),
        "subsequent poll should retrieve output"
    );

@@ -394,7 +407,9 @@ async fn unified_exec_pause_blocks_yield_timeout() -> anyhow::Result<()> {
        "pause should block the unified exec yield timeout"
    );
    assert!(
-        response.truncated_output().contains("unified-exec-done"),
+        response
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains("unified-exec-done"),
        "exec_command should wait for output after the pause lifts"
    );
    assert!(
@@ -420,7 +435,11 @@ async fn requests_with_large_timeout_are_capped() -> anyhow::Result<()> {
    .await?;

    assert!(result.process_id.is_some());
-    assert!(result.truncated_output().contains("codex"));
+    assert!(
+        result
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains("codex")
+    );

    Ok(())
 }
@@ -442,7 +461,11 @@ async fn completed_commands_do_not_persist_sessions() -> anyhow::Result<()> {
        result.process_id.is_some(),
        "completed command should report a process id"
    );
-    assert!(result.truncated_output().contains("codex"));
+    assert!(
+        result
+            .truncated_output(DEFAULT_MAX_OUTPUT_TOKENS)
+            .contains("codex")
+    );

    assert!(
        session
--- a/codex-rs/core/src/unified_exec/process_manager.rs
+++ b/codex-rs/core/src/unified_exec/process_manager.rs
@@ -581,6 +581,7 @@ impl UnifiedExecProcessManager {
            chunk_id,
            wall_time,
            raw_output: collected,
+            truncation_policy: context.turn.truncation_policy,
            max_output_tokens: request.max_output_tokens,
            process_id: response_process_id,
            exit_code,
@@ -691,8 +692,8 @@ impl UnifiedExecProcessManager {

        // After polling, refresh_process_state tells us whether the PTY is
        // still alive or has exited and been removed from the store; we thread
-        // that through so the handler can tag TerminalInteraction with an
-        // appropriate process_id and exit_code.
+        // that through so the handler can tag or suppress TerminalInteraction
+        // with an appropriate process_id and exit_code.
        let status = if let Some(status) = status_after_write {
            status
        } else {
@@ -725,6 +726,7 @@ impl UnifiedExecProcessManager {
            chunk_id,
            wall_time,
            raw_output: collected,
+            truncation_policy: request.truncation_policy,
            max_output_tokens: request.max_output_tokens,
            process_id,
            exit_code,
--- a/codex-rs/core/tests/suite/code_mode.rs
+++ b/codex-rs/core/tests/suite/code_mode.rs
@@ -5,6 +5,7 @@ use base64::Engine;
 use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
 use codex_config::types::McpServerConfig;
 use codex_config::types::McpServerTransportConfig;
+use codex_core::config::Config;
 use codex_features::Feature;
 use codex_login::CodexAuth;
 use codex_models_manager::bundled_models_response;
@@ -144,11 +145,21 @@ async fn run_code_mode_turn(
    server: &MockServer,
    prompt: &str,
    code: &str,
+) -> Result<(TestCodex, ResponseMock)> {
+    run_code_mode_turn_with_config(server, prompt, code, |_| {}).await
+}
+
+async fn run_code_mode_turn_with_config(
+    server: &MockServer,
+    prompt: &str,
+    code: &str,
+    configure: impl FnOnce(&mut Config) + Send + 'static,
 ) -> Result<(TestCodex, ResponseMock)> {
    let mut builder = test_codex()
        .with_model("test-gpt-5.1-codex")
        .with_config(move |config| {
            let _ = config.features.enable(Feature::CodeMode);
+            configure(config);
        });
    let test = builder.build(server).await?;

@@ -292,8 +303,7 @@ text(JSON.stringify(await tools.exec_command({ cmd: "printf code_mode_exec_marke
    )
    .await?;

-    let req = second_mock.single_request();
-    let items = custom_tool_output_items(&req, "call-1");
+    let items = custom_tool_output_items(&second_mock.single_request(), "call-1");
    assert_eq!(items.len(), 2);
    assert_regex_match(
        concat!(
@@ -645,40 +655,217 @@ text(JSON.stringify(results));

+/// A nested `exec_command` call that passes `max_output_tokens: 5` (with no
+/// `@exec` pragma on the script) has its 40-character output reduced to the
+/// first and last ten characters around a "…5 tokens truncated…" marker.
 #[cfg_attr(windows, ignore = "no exec_command on Windows")]
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn code_mode_can_truncate_final_result_with_configured_budget() -> Result<()> {
+async fn code_mode_exec_command_explicit_max_output_tokens_truncates() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
    let (_test, second_mock) = run_code_mode_turn(
        &server,
-        "use exec to truncate the final result",
-        r#"// @exec: {"max_output_tokens": 6}
-text(JSON.stringify(await tools.exec_command({
-  cmd: "printf 'token one token two token three token four token five token six token seven'",
-  max_output_tokens: 100
-})));
+        "use exec_command from code mode",
+        r#"
+const result = await tools.exec_command({
+  cmd: "printf '0123456789012345678901234567890123456789'",
+  max_output_tokens: 5
+});
+text(result.output);
 "#,
    )
    .await?;

-    let req = second_mock.single_request();
-    let items = custom_tool_output_items(&req, "call-1");
-    assert_eq!(items.len(), 2);
-    assert_regex_match(
-        concat!(
-            r"(?s)\A",
-            r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
+    // The exact string is pinned (rather than a regex) so a change in how the
+    // head/tail split or the truncated-token count is computed fails the test.
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
        ),
-        text_item(&items, /*index*/ 0),
+        "Total output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
+    );
+
+    Ok(())
+}
+
+/// When both the script's `@exec` pragma and the nested `exec_command` call
+/// request `max_output_tokens: 20000`, a 50,000-character command output must
+/// come back in the tool output item exactly as written, with no truncation
+/// marker. Per the test name, 20000 is above the default limit; the default
+/// value itself is not visible here.
+#[cfg_attr(windows, ignore = "no exec_command on Windows")]
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_exec_explicit_max_above_default_preserves_output() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use exec_command from code mode",
+        r#"// @exec: {"max_output_tokens": 20000}
+const result = await tools.exec_command({
+  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
+  max_output_tokens: 20000
+});
+text(result.output);
+"#,
+    )
+    .await?;
+
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
+        ),
+        "x".repeat(50_000)
+    );
+
+    Ok(())
+}
+
+/// A high explicit nested limit still truncates output that exceeds it: the
+/// nested call asks for `max_output_tokens: 20000` (the `@exec` pragma allows
+/// 25000) while the command writes 90,000 characters. The expected item keeps
+/// 40,000 characters from each end and reports 2500 truncated tokens.
+#[cfg_attr(windows, ignore = "no exec_command on Windows")]
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_exec_explicit_max_above_default_truncates_larger_output() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use exec_command from code mode",
+        r#"// @exec: {"max_output_tokens": 25000}
+const result = await tools.exec_command({
+  cmd: "python3 -c \"import sys; sys.stdout.write('A' * 90000)\"",
+  max_output_tokens: 20000
+});
+text(result.output);
+"#,
+    )
+    .await?;
+
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
+        ),
+        format!(
+            "Total output lines: 1\n\n{}…2500 tokens truncated…{}",
+            "A".repeat(40_000),
+            "A".repeat(40_000)
+        )
+    );
+
+    Ok(())
+}
+
+/// Same scenario as the explicit 20000-token nested limit with 50,000
+/// characters of output, but with `config.tool_output_token_limit` set to 50.
+/// The full output must still be returned, i.e. the small configured limit
+/// must not override the explicit nested budget.
+#[cfg_attr(windows, ignore = "no exec_command on Windows")]
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_exec_explicit_max_above_truncation_policy_preserves_output() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn_with_config(
+        &server,
+        "use exec_command from code mode",
+        r#"// @exec: {"max_output_tokens": 20000}
+const result = await tools.exec_command({
+  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\"",
+  max_output_tokens: 20000
+});
+text(result.output);
+"#,
+        // NOTE(review): presumably this limit is what drives the truncation
+        // policy named in the test; confirm against the config plumbing.
+        |config| {
+            config.tool_output_token_limit = Some(50);
+        },
+    )
+    .await?;
+
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
+        ),
+        "x".repeat(50_000)
+    );
+
+    Ok(())
+}
+
+/// When the nested `exec_command` call omits `max_output_tokens` (only the
+/// `@exec` pragma carries a 20000-token budget), the script must receive the
+/// command's 50,000-character output unmodified.
+#[cfg_attr(windows, ignore = "no exec_command on Windows")]
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_exec_without_max_preserves_output_beyond_default() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use exec_command from code mode",
+        r#"// @exec: {"max_output_tokens": 20000}
+const result = await tools.exec_command({
+  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
+});
+text(result.output);
+"#,
+    )
+    .await?;
+
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
+        ),
+        "x".repeat(50_000)
+    );
+
+    Ok(())
+}
+
+/// Omitted nested `max_output_tokens` combined with a tiny configured limit:
+/// `config.tool_output_token_limit` is set to 50, the nested call passes no
+/// budget, and the 50,000-character output must still arrive untruncated.
+#[cfg_attr(windows, ignore = "no exec_command on Windows")]
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_exec_without_max_preserves_output_beyond_truncation_policy() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn_with_config(
+        &server,
+        "use exec_command from code mode",
+        r#"// @exec: {"max_output_tokens": 20000}
+const result = await tools.exec_command({
+  cmd: "python3 -c \"import sys; sys.stdout.write('x' * 50000)\""
+});
+text(result.output);
+"#,
+        // NOTE(review): presumably this limit is what drives the truncation
+        // policy named in the test; confirm against the config plumbing.
+        |config| {
+            config.tool_output_token_limit = Some(50);
+        },
+    )
+    .await?;
+
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
+        ),
+        "x".repeat(50_000)
+    );
+
+    Ok(())
+}
+
+/// The budget comes only from the script's `@exec` pragma
+/// (`max_output_tokens: 5`); the nested `exec_command` call passes none. The
+/// 40-character output is expected to be cut down to its first and last ten
+/// characters around a "…5 tokens truncated…" marker.
+#[cfg_attr(windows, ignore = "no exec_command on Windows")]
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_exec_explicit_max_output_tokens_truncates() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use exec_command from code mode",
+        r#"// @exec: {"max_output_tokens": 5}
+const result = await tools.exec_command({
+  cmd: "printf '0123456789012345678901234567890123456789'"
+});
+text(result.output);
+"#,
+    )
+    .await?;
+
+    assert_eq!(
+        text_item(
+            &custom_tool_output_items(&second_mock.single_request(), "call-1"),
+            /*index*/ 1
+        ),
+        "Total output lines: 1\n\n0123456789…5 tokens truncated…0123456789"
    );
-    let expected_pattern = r#"(?sx)
-\A
-Total\ output\ lines:\ 1\n
-\n
-.*…\d+\ tokens\ truncated….*
-\z
-"#;
-    assert_regex_match(expected_pattern, text_item(&items, /*index*/ 1));

    Ok(())
 }
--- a/codex-rs/core/tests/suite/unified_exec.rs
+++ b/codex-rs/core/tests/suite/unified_exec.rs
@@ -1322,6 +1322,7 @@ async fn unified_exec_emits_one_begin_and_one_end_event() -> Result<()> {

    let mut begin_events = Vec::new();
    let mut end_events = Vec::new();
+    let mut terminal_interactions = Vec::new();
    let mut task_completed = false;
    loop {
        let event_msg = wait_for_event(&test.codex, |_| true).await;
@@ -1332,6 +1333,9 @@ async fn unified_exec_emits_one_begin_and_one_end_event() -> Result<()> {
            EventMsg::ExecCommandEnd(event) if event.call_id == open_call_id => {
                end_events.push(event);
            }
+            EventMsg::TerminalInteraction(event) if event.call_id == open_call_id => {
+                terminal_interactions.push(event);
+            }
            EventMsg::TurnComplete(_) => {
                task_completed = true;
            }
@@ -1353,6 +1357,10 @@ async fn unified_exec_emits_one_begin_and_one_end_event() -> Result<()> {
        1,
        "expected end event for the write_stdin call"
    );
+    assert!(
+        terminal_interactions.is_empty(),
+        "completed empty polls should not emit terminal interactions: {terminal_interactions:?}"
+    );

    let open_event = &begin_events[0];

--- a/codex-rs/tui/src/chatwidget/command_lifecycle.rs
+++ b/codex-rs/tui/src/chatwidget/command_lifecycle.rs
@@ -76,12 +76,16 @@ impl ChatWidget {
        if !self.bottom_pane.is_task_running() {
            return;
        }
-        self.flush_answer_stream_with_separator();
        let command_display = self
            .unified_exec_processes
            .iter()
            .find(|process| process.key == process_id)
            .map(|process| process.command_display.clone());
+        if stdin.is_empty() && command_display.is_none() {
+            return;
+        }
+
+        self.flush_answer_stream_with_separator();
        if stdin.is_empty() {
            // Empty stdin means we are polling for background output.
            // Surface this in the status indicator (single "waiting" surface) instead of
--- a/codex-rs/tui/src/chatwidget/tests/exec_flow.rs
+++ b/codex-rs/tui/src/chatwidget/tests/exec_flow.rs
@@ -719,6 +719,22 @@ async fn unified_exec_wait_status_header_updates_on_late_command_display() {
    assert_eq!(status.details(), Some("sleep 5"));
 }

+/// An empty-stdin terminal interaction for a process this test never
+/// registers with the widget must leave the status indicator alone: the
+/// header stays "Working", the status widget remains visible, and no
+/// unified-exec wait streak is started.
+#[tokio::test]
+async fn unified_exec_empty_poll_for_finished_process_does_not_show_waiting_status() {
+    let (mut chat, _rx, _op_rx) = make_chatwidget_manual(/*model_override*/ None).await;
+    chat.on_task_started();
+
+    // NOTE(review): the helper's arguments appear to be call id, process id,
+    // and stdin; the empty string is what makes this a poll. Confirm against
+    // the helper definition.
+    terminal_interaction(&mut chat, "call-finished", "proc-finished", "");
+
+    assert_eq!(chat.status_state.current_status.header, "Working");
+    let status = chat
+        .bottom_pane
+        .status_widget()
+        .expect("task status indicator should remain visible");
+    assert_eq!(status.header(), "Working");
+    assert!(chat.unified_exec_wait_streak.is_none());
+}
+
 #[tokio::test]
 async fn unified_exec_waiting_multiple_empty_snapshots() {
    let (mut chat, mut rx, _op_rx) = make_chatwidget_manual(/*model_override*/ None).await;
Author	SHA1	Message	Date
starr-openai	c101891009	Allow targeted rust CI repros without sharding	2026-05-19 21:37:06 -07:00
starr-openai	19e75b3299	Pass repro filterset through env	2026-05-19 21:37:06 -07:00
starr-openai	842483a85e	Add targeted rust CI repro workflow	2026-05-19 21:37:06 -07:00
Ahmed Ibrahim	5a4202ad90	[codex] Preserve raw code-mode exec output by default (#23564 ) ## Why Code mode can use nested unified exec calls as data sources. When those calls omit `max_output_tokens`, code mode should receive raw command output so the script can parse or summarize it itself. When code mode does provide `max_output_tokens`, that explicit nested budget should be respected, including values above the default unified exec limit, rather than being capped before code mode sees the result. ## What - Preserve direct unified exec truncation behavior, while letting code-mode exec/write_stdin keep `max_output_tokens` as `None` unless explicitly supplied. - Make code-mode tool results use raw output when no explicit limit is present, and use the explicit nested limit directly when one is specified. - Refactor unified exec output formatting so `truncated_output` takes the caller-selected token budget. - Add e2e integration coverage for explicit nested exec limits, omitted nested exec limits, outer exec limit propagation, omitted-limit outputs that exceed both the default and a small truncation policy, explicit nested limits above those caps, and high explicit limits that still compact larger command output. - Reuse the code-mode turn setup helper while directly asserting the exact exec output item in each test. ## Testing - `just fmt` - `git diff --check` - Not run locally per repo guidance; CI should validate the e2e integration tests.	2026-05-20 04:02:14 +00:00
Eric Traut	e43a2e297f	Fix stale background terminal poll events (#23231 ) ## Why Issue #23214 reports `/ps` showing no background terminals while the status line still says it is waiting for a background terminal. The race is in core: `write_stdin` can poll a process that exits before the response returns. The process manager correctly returns `process_id: None`, but the handler still emitted a `TerminalInteraction` event using the requested session id, causing clients to believe a dead process was still being polled. Fixes #23214. ## What changed - Suppress `TerminalInteraction` events for empty `write_stdin` polls once `response.process_id` is `None`. - Continue emitting interactions for non-empty stdin, even if that input causes the process to exit before the response returns. - Extend the unified exec integration test to assert completed empty polls do not emit terminal interactions. ## Verification - `cargo test -p codex-core --test all unified_exec_emits_one_begin_and_one_end_event` - `cargo test -p codex-core --test all unified_exec_emits_terminal_interaction_for_write_stdin` `cargo test -p codex-core` currently aborts in unrelated `agent::control::tests::resume_agent_from_rollout_uses_edge_data_when_descendant_metadata_source_is_stale` with a reproducible stack overflow.	2026-05-19 20:48:37 -07:00