fix: race in js repl (#11922)

js_repl_reset previously raced with in-flight/new js_repl executions because reset() could clear exec_tool_calls without synchronizing with execute(). In that window, a running exec could lose its per-exec tool-call context, and subsequent kernel RunTool messages would fail with js_repl exec context not found. The fix serializes reset and execute on the same exec_lock, so reset cannot run concurrently with exec setup/teardown. We also keep the timeout path safe by performing reset steps inline while execute() already holds the lock, avoiding re-entrant lock acquisition. A regression test now verifies that reset waits for the exec lock and does not clear tool-call state early.
2026-04-24 22:54:54 +00:00 · 2026-02-17 11:06:14 +00:00
parent b994b52994
commit 77f74a5c17
1 changed files with 46 additions and 1 deletions
--- a/codex-rs/core/src/tools/js_repl/mod.rs
+++ b/codex-rs/core/src/tools/js_repl/mod.rs
@@ -399,6 +399,9 @@ impl JsReplManager {
    }

    pub async fn reset(&self) -> Result<(), FunctionCallError> {
+        let _permit = self.exec_lock.clone().acquire_owned().await.map_err(|_| {
+            FunctionCallError::RespondToModel("js_repl execution unavailable".to_string())
+        })?;
        self.reset_kernel().await;
        Self::clear_all_exec_tool_calls_map(&self.exec_tool_calls).await;
        Ok(())
@@ -527,7 +530,9 @@ impl JsReplManager {
                return Err(FunctionCallError::RespondToModel(message));
            }
            Err(_) => {
-                self.reset().await?;
+                self.reset_kernel().await;
+                self.wait_for_exec_tool_calls(&req_id).await;
+                self.exec_tool_calls.lock().await.clear();
                return Err(FunctionCallError::RespondToModel(
                    "js_repl execution timed out; kernel reset, rerun your request".to_string(),
                ));
@@ -1456,6 +1461,46 @@ mod tests {
        }
    }

+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn reset_waits_for_exec_lock_before_clearing_exec_tool_calls() {
+        let manager = JsReplManager::new(None, PathBuf::from("."))
+            .await
+            .expect("manager should initialize");
+        let permit = manager
+            .exec_lock
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("lock should be acquirable");
+        let exec_id = Uuid::new_v4().to_string();
+        manager.register_exec_tool_calls(&exec_id).await;
+
+        let reset_manager = Arc::clone(&manager);
+        let mut reset_task = tokio::spawn(async move { reset_manager.reset().await });
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        assert!(
+            !reset_task.is_finished(),
+            "reset should wait until execute lock is released"
+        );
+        assert!(
+            manager.exec_tool_calls.lock().await.contains_key(&exec_id),
+            "reset must not clear tool-call contexts while execute lock is held"
+        );
+
+        drop(permit);
+
+        tokio::time::timeout(Duration::from_secs(1), &mut reset_task)
+            .await
+            .expect("reset should complete after execute lock release")
+            .expect("reset task should not panic")
+            .expect("reset should succeed");
+        assert!(
+            !manager.exec_tool_calls.lock().await.contains_key(&exec_id),
+            "reset should clear tool-call contexts after lock acquisition"
+        );
+    }
+
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn reset_clears_inflight_exec_tool_calls_without_waiting() {
        let manager = JsReplManager::new(None, std::env::temp_dir())