fix(exec-server): retain output until streams close (#18946)

## Why

A Mac Bazel run hit a flake in
`server::handler::tests::output_and_exit_are_retained_after_notification_receiver_closes`
where the read path observed process exit but lost the expected buffered
stdout (`first\nsecond\n`). See the [GitHub Actions
job](https://github.com/openai/codex/actions/runs/24758468552/job/72436716505)
and [BuildBuddy
invocation](https://app.buildbuddy.io/invocation/37475a12-4ef2-45fb-ab8a-e49a2aba1d59).

The underlying race is that process exit is not the same thing as
stdout/stderr closure. If a child or grandchild inherits the pipe write
end, or a process duplicates it with `dup2`, the watched process can
exit while the stream is still open and more output can still arrive.
The exec-server was starting exited-process retention cleanup from the
exit event, so the process entry could be removed before the output
streams had actually closed.

While stress-testing the exec-server unit suite,
`server::handler::tests::long_poll_read_fails_after_session_resume`
exposed a separate test race: it started a short-lived command that
could exit and wake the pending long-poll read before the session-resume
assertion observed the resumed-session error. That test is intended to
cover resume eviction, not process-exit delivery, so this change keeps
the process alive and quiet while the second connection resumes the
session.

## What changed

- Keep exec-server process entries retained until stdout/stderr streams
close, then start the post-exit retention timer from the closed event.
- Wake long-poll readers when the closed event is emitted.
- Add focused `local_process` unit coverage that proves late output is
still retained after the short test retention interval has elapsed, and
that closed process entries are eventually evicted.
- Add a local and remote regression test where a parent exits while a
child keeps inherited stdout open. The child waits on an explicit
release file, so the test deterministically observes exit first,
releases the child, then requires a nonzero-wait read from the exit
sequence to receive the late output.
- In `codex-rs/exec-server/src/server/handler/tests.rs`, make
`long_poll_read_fails_after_session_resume` run a long-lived silent
command instead of a short command that prints and exits. This isolates
the test to session-resume behavior and prevents a normal process exit
from satisfying the pending long-poll read first.

## Testing

- `cargo test -p codex-exec-server
exec_process_retains_output_after_exit_until_streams_close`
- `cargo test -p codex-exec-server local_process::tests`
- `cargo test -p codex-exec-server`
- `just fix -p codex-exec-server`
- `bazel test //codex-rs/exec-server:exec-server-unit-tests
//codex-rs/exec-server:exec-server-exec_process-test
//codex-rs/exec-server:exec-server-file_system-test
//codex-rs/exec-server:exec-server-http_client-test
//codex-rs/exec-server:exec-server-initialize-test
//codex-rs/exec-server:exec-server-process-test
//codex-rs/exec-server:exec-server-websocket-test`
- `bazel test --runs_per_test=25
//codex-rs/exec-server:exec-server-unit-tests`

## Documentation

No docs update needed; this is an internal exec-server correctness fix.
This commit is contained in:
Michael Bolin
2026-04-23 12:49:58 -07:00
committed by GitHub
parent 9c0eced391
commit 491a3058f6
4 changed files with 426 additions and 22 deletions

View File

@@ -17,11 +17,14 @@ use codex_exec_server::ReadResponse;
use codex_exec_server::StartedExecProcess;
use codex_exec_server::WriteStatus;
use pretty_assertions::assert_eq;
use tempfile::TempDir;
use test_case::test_case;
use tokio::sync::watch;
use tokio::time::Duration;
use tokio::time::timeout;
use common::DELAYED_OUTPUT_AFTER_EXIT_PARENT_ARG;
use common::current_test_binary_helper_paths;
use common::exec_server::ExecServerHarness;
use common::exec_server::exec_server;
@@ -320,6 +323,81 @@ async fn assert_exec_process_replays_events_after_close(use_remote: bool) -> Res
Ok(())
}
async fn assert_exec_process_retains_output_after_exit_until_streams_close(
use_remote: bool,
) -> Result<()> {
let context = create_process_context(use_remote).await?;
let (helper_binary, _) = current_test_binary_helper_paths()?;
let release_dir = TempDir::new()?;
let release_path = release_dir.path().join("release-delayed-output");
let process_id = "proc-output-after-exit".to_string();
let session = context
.backend
.start(ExecParams {
process_id: process_id.clone().into(),
argv: vec![
helper_binary.to_string_lossy().into_owned(),
DELAYED_OUTPUT_AFTER_EXIT_PARENT_ARG.to_string(),
release_path.to_string_lossy().into_owned(),
],
cwd: std::env::current_dir()?,
env_policy: /*env_policy*/ None,
env: Default::default(),
tty: false,
pipe_stdin: false,
arg0: None,
})
.await?;
assert_eq!(session.process.process_id().as_str(), process_id);
let StartedExecProcess { process } = session;
let exit_response = timeout(
Duration::from_secs(2),
process.read(
/*after_seq*/ None,
/*max_bytes*/ None,
/*wait_ms*/ Some(2_000),
),
)
.await??;
assert!(
exit_response.chunks.is_empty(),
"parent should exit before child writes delayed output"
);
assert_eq!(exit_response.exit_code, Some(0));
assert!(!exit_response.closed);
let exit_seq = exit_response
.next_seq
.checked_sub(1)
.context("exit response should advance next_seq")?;
std::fs::write(&release_path, b"go")?;
let late_response = timeout(
Duration::from_secs(2),
process.read(
/*after_seq*/ Some(exit_seq),
/*max_bytes*/ None,
/*wait_ms*/ Some(2_000),
),
)
.await??;
let mut late_output = String::new();
for chunk in late_response.chunks {
assert_eq!(chunk.stream, ExecOutputStream::Stdout);
late_output.push_str(&String::from_utf8_lossy(&chunk.chunk.into_inner()));
}
assert_eq!(late_output, "late output after exit\n");
let wake_rx = process.subscribe_wake();
let actual = collect_process_output_from_reads(process, wake_rx).await?;
assert_eq!(
actual,
("late output after exit\n".to_string(), Some(0), true)
);
Ok(())
}
async fn assert_exec_process_write_then_read(use_remote: bool) -> Result<()> {
let context = create_process_context(use_remote).await?;
let process_id = "proc-stdin".to_string();
@@ -586,6 +664,17 @@ async fn exec_process_replays_events_after_close(use_remote: bool) -> Result<()>
assert_exec_process_replays_events_after_close(use_remote).await
}
#[test_case(false ; "local")]
#[test_case(true ; "remote")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
// Serialize tests that launch a real exec-server process through the full CLI.
#[serial_test::serial(remote_exec_server)]
async fn exec_process_retains_output_after_exit_until_streams_close(
use_remote: bool,
) -> Result<()> {
assert_exec_process_retains_output_after_exit_until_streams_close(use_remote).await
}
#[test_case(false ; "local")]
#[test_case(true ; "remote")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]