# PR #1852: exec: timeout on grandchildren - URL: https://github.com/openai/codex/pull/1852 - Author: md-oai - Created: 2025-08-05 18:04:58 UTC - Updated: 2025-08-05 21:18:43 UTC - Changes: +43/-2, Files changed: 1, Commits: 3 ## Description We were enforcing the 10 s wall-clock limit only on the child process. If that child (bash) spawns grandchildren and we kill it on timeout, those grandchildren still have the original stdout/err pipe open, so the background tasks that are draining the pipes block forever. ## Full Diff ```diff diff --git a/codex-rs/core/src/exec.rs b/codex-rs/core/src/exec.rs index dce02cc5e2..a8bbd0f52c 100644 --- a/codex-rs/core/src/exec.rs +++ b/codex-rs/core/src/exec.rs @@ -344,8 +344,49 @@ pub(crate) async fn consume_truncated_output( } }; - let stdout = stdout_handle.await??; - let stderr = stderr_handle.await??; + // Wait for the stdout/stderr collection tasks but guard against them + // hanging forever. In the normal case both pipes are closed once the child + // terminates so the tasks exit quickly. However, if the child process + // spawned grandchildren that inherited its stdout/stderr file descriptors + // those pipes may stay open after we `kill` the direct child on timeout. + // That would cause the `read_capped` tasks to block on `read()` + // indefinitely, effectively hanging the whole agent. + + const IO_DRAIN_TIMEOUT_MS: u64 = 2_000; // 2 s should be plenty for local pipes + + // We need mutable bindings so we can `abort()` them on timeout. + use tokio::task::JoinHandle; + + async fn await_with_timeout( + handle: &mut JoinHandle<std::io::Result<Vec<u8>>>, + timeout: Duration, + ) -> std::io::Result<Vec<u8>> { + match tokio::time::timeout(timeout, &mut *handle).await { + Ok(join_res) => match join_res { + Ok(io_res) => io_res, + Err(join_err) => Err(std::io::Error::other(join_err)), + }, + Err(_elapsed) => { + // Timeout: abort the task to avoid hanging on open pipes. 
+ handle.abort(); + Ok(Vec::new()) + } + } + } + + let mut stdout_handle = stdout_handle; + let mut stderr_handle = stderr_handle; + + let stdout = await_with_timeout( + &mut stdout_handle, + Duration::from_millis(IO_DRAIN_TIMEOUT_MS), + ) + .await?; + let stderr = await_with_timeout( + &mut stderr_handle, + Duration::from_millis(IO_DRAIN_TIMEOUT_MS), + ) + .await?; Ok(RawExecToolCallOutput { exit_status, ``` ## Review Comments ### codex-rs/core/src/exec.rs - Created: 2025-08-05 18:42:23 UTC | Link: https://github.com/openai/codex/pull/1852#discussion_r2255069686 ```diff @@ -344,8 +344,50 @@ pub(crate) async fn consume_truncated_output( } }; - let stdout = stdout_handle.await??; - let stderr = stderr_handle.await??; + // Wait for the stdout/stderr collection tasks but guard against them + // hanging forever. In the normal case both pipes are closed once the child + // terminates so the tasks exit quickly. However, if the child process + // spawned grandchildren that inherited its stdout/stderr file descriptors + // those pipes may stay open after we `kill` the direct child on timeout. + // That would cause the `read_capped` tasks to block on `read()` + // indefinitely, effectively hanging the whole agent. + + const IO_DRAIN_TIMEOUT_MS: u64 = 2_000; // 2 s should be plenty for local pipes + + // We need mutable bindings so we can `abort()` them on timeout. + use tokio::task::JoinHandle; + + async fn await_with_timeout( + handle: &mut JoinHandle<std::io::Result<Vec<u8>>>, + timeout: Duration, + ) -> std::io::Result<Vec<u8>> { + tokio::select! { + join_res = &mut *handle => { + match join_res { + Ok(io_res) => io_res, + Err(join_err) => Err(std::io::Error::other(join_err)), + } + }, + _ = tokio::time::sleep(timeout) => { ``` > Can/should we use `use tokio::time::timeout` instead?