mirror of
https://github.com/openai/codex.git
synced 2026-05-02 10:26:45 +00:00
fix(exec-server): retain output until streams close (#18946)
## Why A Mac Bazel run hit a flake in `server::handler::tests::output_and_exit_are_retained_after_notification_receiver_closes` where the read path observed process exit but lost the expected buffered stdout (`first\nsecond\n`). See the [GitHub Actions job](https://github.com/openai/codex/actions/runs/24758468552/job/72436716505) and [BuildBuddy invocation](https://app.buildbuddy.io/invocation/37475a12-4ef2-45fb-ab8a-e49a2aba1d59). The underlying race is that process exit is not the same thing as stdout/stderr closure. If a child or grandchild inherits the pipe write end, or a process duplicates it with `dup2`, the watched process can exit while the stream is still open and more output can still arrive. The exec-server was starting exited-process retention cleanup from the exit event, so the process entry could be removed before the output streams had actually closed. While stress-testing the exec-server unit suite, `server::handler::tests::long_poll_read_fails_after_session_resume` exposed a separate test race: it started a short-lived command that could exit and wake the pending long-poll read before the session-resume assertion observed the resumed-session error. That test is intended to cover resume eviction, not process-exit delivery, so this change keeps the process alive and quiet while the second connection resumes the session. ## What changed - Keep exec-server process entries retained until stdout/stderr streams close, then start the post-exit retention timer from the closed event. - Wake long-poll readers when the closed event is emitted. - Add focused `local_process` unit coverage that proves late output is still retained after the short test retention interval has elapsed, and that closed process entries are eventually evicted. - Add a local and remote regression test where a parent exits while a child keeps inherited stdout open. The child waits on an explicit release file, so the test deterministically observes exit first, releases the child, then requires a nonzero-wait read from the exit sequence to receive the late output. - In `codex-rs/exec-server/src/server/handler/tests.rs`, make `long_poll_read_fails_after_session_resume` run a long-lived silent command instead of a short command that prints and exits. This isolates the test to session-resume behavior and prevents a normal process exit from satisfying the pending long-poll read first. ## Testing - `cargo test -p codex-exec-server exec_process_retains_output_after_exit_until_streams_close` - `cargo test -p codex-exec-server local_process::tests` - `cargo test -p codex-exec-server` - `just fix -p codex-exec-server` - `bazel test //codex-rs/exec-server:exec-server-unit-tests //codex-rs/exec-server:exec-server-exec_process-test //codex-rs/exec-server:exec-server-file_system-test //codex-rs/exec-server:exec-server-http_client-test //codex-rs/exec-server:exec-server-initialize-test //codex-rs/exec-server:exec-server-process-test //codex-rs/exec-server:exec-server-websocket-test` - `bazel test --runs_per_test=25 //codex-rs/exec-server:exec-server-unit-tests` ## Documentation No docs update needed; this is an internal exec-server correctness fix.
This commit is contained in:
@@ -17,11 +17,14 @@ use codex_exec_server::ReadResponse;
|
||||
use codex_exec_server::StartedExecProcess;
|
||||
use codex_exec_server::WriteStatus;
|
||||
use pretty_assertions::assert_eq;
|
||||
use tempfile::TempDir;
|
||||
use test_case::test_case;
|
||||
use tokio::sync::watch;
|
||||
use tokio::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
|
||||
use common::DELAYED_OUTPUT_AFTER_EXIT_PARENT_ARG;
|
||||
use common::current_test_binary_helper_paths;
|
||||
use common::exec_server::ExecServerHarness;
|
||||
use common::exec_server::exec_server;
|
||||
|
||||
@@ -320,6 +323,81 @@ async fn assert_exec_process_replays_events_after_close(use_remote: bool) -> Res
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn assert_exec_process_retains_output_after_exit_until_streams_close(
|
||||
use_remote: bool,
|
||||
) -> Result<()> {
|
||||
let context = create_process_context(use_remote).await?;
|
||||
let (helper_binary, _) = current_test_binary_helper_paths()?;
|
||||
let release_dir = TempDir::new()?;
|
||||
let release_path = release_dir.path().join("release-delayed-output");
|
||||
let process_id = "proc-output-after-exit".to_string();
|
||||
let session = context
|
||||
.backend
|
||||
.start(ExecParams {
|
||||
process_id: process_id.clone().into(),
|
||||
argv: vec![
|
||||
helper_binary.to_string_lossy().into_owned(),
|
||||
DELAYED_OUTPUT_AFTER_EXIT_PARENT_ARG.to_string(),
|
||||
release_path.to_string_lossy().into_owned(),
|
||||
],
|
||||
cwd: std::env::current_dir()?,
|
||||
env_policy: /*env_policy*/ None,
|
||||
env: Default::default(),
|
||||
tty: false,
|
||||
pipe_stdin: false,
|
||||
arg0: None,
|
||||
})
|
||||
.await?;
|
||||
assert_eq!(session.process.process_id().as_str(), process_id);
|
||||
|
||||
let StartedExecProcess { process } = session;
|
||||
|
||||
let exit_response = timeout(
|
||||
Duration::from_secs(2),
|
||||
process.read(
|
||||
/*after_seq*/ None,
|
||||
/*max_bytes*/ None,
|
||||
/*wait_ms*/ Some(2_000),
|
||||
),
|
||||
)
|
||||
.await??;
|
||||
assert!(
|
||||
exit_response.chunks.is_empty(),
|
||||
"parent should exit before child writes delayed output"
|
||||
);
|
||||
assert_eq!(exit_response.exit_code, Some(0));
|
||||
assert!(!exit_response.closed);
|
||||
let exit_seq = exit_response
|
||||
.next_seq
|
||||
.checked_sub(1)
|
||||
.context("exit response should advance next_seq")?;
|
||||
std::fs::write(&release_path, b"go")?;
|
||||
|
||||
let late_response = timeout(
|
||||
Duration::from_secs(2),
|
||||
process.read(
|
||||
/*after_seq*/ Some(exit_seq),
|
||||
/*max_bytes*/ None,
|
||||
/*wait_ms*/ Some(2_000),
|
||||
),
|
||||
)
|
||||
.await??;
|
||||
let mut late_output = String::new();
|
||||
for chunk in late_response.chunks {
|
||||
assert_eq!(chunk.stream, ExecOutputStream::Stdout);
|
||||
late_output.push_str(&String::from_utf8_lossy(&chunk.chunk.into_inner()));
|
||||
}
|
||||
assert_eq!(late_output, "late output after exit\n");
|
||||
|
||||
let wake_rx = process.subscribe_wake();
|
||||
let actual = collect_process_output_from_reads(process, wake_rx).await?;
|
||||
assert_eq!(
|
||||
actual,
|
||||
("late output after exit\n".to_string(), Some(0), true)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn assert_exec_process_write_then_read(use_remote: bool) -> Result<()> {
|
||||
let context = create_process_context(use_remote).await?;
|
||||
let process_id = "proc-stdin".to_string();
|
||||
@@ -586,6 +664,17 @@ async fn exec_process_replays_events_after_close(use_remote: bool) -> Result<()>
|
||||
assert_exec_process_replays_events_after_close(use_remote).await
|
||||
}
|
||||
|
||||
#[test_case(false ; "local")]
|
||||
#[test_case(true ; "remote")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
// Serialize tests that launch a real exec-server process through the full CLI.
|
||||
#[serial_test::serial(remote_exec_server)]
|
||||
async fn exec_process_retains_output_after_exit_until_streams_close(
|
||||
use_remote: bool,
|
||||
) -> Result<()> {
|
||||
assert_exec_process_retains_output_after_exit_until_streams_close(use_remote).await
|
||||
}
|
||||
|
||||
#[test_case(false ; "local")]
|
||||
#[test_case(true ; "remote")]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
|
||||
Reference in New Issue
Block a user