fix: attempt to reduce high cpu usage when using collab (#9776)

Reproduce with a prompt like this, with collab enabled:

```
Examine the code at <some subdirectory with a deeply nested project>. Find the most urgent issue to resolve and describe it to me.
```

The existing behavior causes the top-level agent to busy-wait on subagents.
2026-02-01 22:47:52 +00:00 · 2026-01-26 10:07:25 -08:00
parent fdc69df454
commit 375a5ef051
3 changed files with 40 additions and 3 deletions
--- a/codex-rs/core/src/tools/handlers/collab.rs
+++ b/codex-rs/core/src/tools/handlers/collab.rs
@@ -28,6 +28,8 @@ use serde::Serialize;

 pub struct CollabHandler;

+/// Minimum wait timeout to prevent tight polling loops from burning CPU.
+pub(crate) const MIN_WAIT_TIMEOUT_MS: i64 = 10_000;
 pub(crate) const DEFAULT_WAIT_TIMEOUT_MS: i64 = 30_000;
 pub(crate) const MAX_WAIT_TIMEOUT_MS: i64 = 300_000;

@@ -323,6 +325,8 @@ mod wait {
            .collect::<Result<Vec<_>, _>>()?;

        // Validate timeout.
+        // Very short timeouts encourage busy-polling loops in the orchestrator prompt and can
+        // cause high CPU usage even with a single active worker, so clamp to a minimum.
        let timeout_ms = args.timeout_ms.unwrap_or(DEFAULT_WAIT_TIMEOUT_MS);
        let timeout_ms = match timeout_ms {
            ms if ms <= 0 => {
@@ -330,7 +334,7 @@ mod wait {
                    "timeout_ms must be greater than zero".to_owned(),
                ));
            }
-            ms => ms.min(MAX_WAIT_TIMEOUT_MS),
+            ms => ms.clamp(MIN_WAIT_TIMEOUT_MS, MAX_WAIT_TIMEOUT_MS),
        };

        session
@@ -1012,7 +1016,7 @@ mod tests {
            "wait",
            function_payload(json!({
                "ids": [agent_id.to_string()],
-                "timeout_ms": 10
+                "timeout_ms": MIN_WAIT_TIMEOUT_MS
            })),
        );
        let output = CollabHandler
@@ -1043,6 +1047,37 @@ mod tests {
            .expect("shutdown should submit");
    }

+    #[tokio::test]
+    async fn wait_clamps_short_timeouts_to_minimum() {
+        let (mut session, turn) = make_session_and_context().await;
+        let manager = thread_manager();
+        session.services.agent_control = manager.agent_control();
+        let config = turn.client.config().as_ref().clone();
+        let thread = manager.start_thread(config).await.expect("start thread");
+        let agent_id = thread.thread_id;
+        let invocation = invocation(
+            Arc::new(session),
+            Arc::new(turn),
+            "wait",
+            function_payload(json!({
+                "ids": [agent_id.to_string()],
+                "timeout_ms": 10
+            })),
+        );
+
+        let early = timeout(Duration::from_millis(50), CollabHandler.handle(invocation)).await;
+        assert!(
+            early.is_err(),
+            "wait should not return before the minimum timeout clamp"
+        );
+
+        let _ = thread
+            .thread
+            .submit(Op::Shutdown {})
+            .await
+            .expect("shutdown should submit");
+    }
+
    #[tokio::test]
    async fn wait_returns_final_status_without_timeout() {
        let (mut session, turn) = make_session_and_context().await;
--- a/codex-rs/core/src/tools/spec.rs
+++ b/codex-rs/core/src/tools/spec.rs
@@ -8,6 +8,7 @@ use crate::tools::handlers::apply_patch::create_apply_patch_freeform_tool;
 use crate::tools::handlers::apply_patch::create_apply_patch_json_tool;
 use crate::tools::handlers::collab::DEFAULT_WAIT_TIMEOUT_MS;
 use crate::tools::handlers::collab::MAX_WAIT_TIMEOUT_MS;
+use crate::tools::handlers::collab::MIN_WAIT_TIMEOUT_MS;
 use crate::tools::registry::ToolRegistryBuilder;
 use codex_protocol::config_types::WebSearchMode;
 use codex_protocol::dynamic_tools::DynamicToolSpec;
@@ -517,7 +518,7 @@ fn create_wait_tool() -> ToolSpec {
        "timeout_ms".to_string(),
        JsonSchema::Number {
            description: Some(format!(
-                "Optional timeout in milliseconds. Defaults to {DEFAULT_WAIT_TIMEOUT_MS} and max {MAX_WAIT_TIMEOUT_MS}."
+                "Optional timeout in milliseconds. Defaults to {DEFAULT_WAIT_TIMEOUT_MS}, min {MIN_WAIT_TIMEOUT_MS}, and max {MAX_WAIT_TIMEOUT_MS}. Avoid tight polling loops; prefer longer waits (seconds to minutes)."
            )),
        },
    );
--- a/codex-rs/core/templates/agents/orchestrator.md
+++ b/codex-rs/core/templates/agents/orchestrator.md
@@ -52,6 +52,7 @@ You are Codex Orchestrator, based on GPT-5. You are running as an orchestration
 * Workers must not revert, overwrite, or conflict with others’ work.
 * By default, workers must not spawn sub-agents unless explicitly allowed.
 * When multiple workers are active, you may pass multiple IDs to `wait` to react to the first completion and keep the workflow event-driven and use a long timeout (e.g. 5 minutes).
+* Do not busy-poll `wait` with very short timeouts. Prefer waits measured in seconds (or minutes) so the system is idle while workers run.

 ## Collab tools