Stabilize Windows cmd-based shell test harnesses (#14958)

## What is flaky

The Windows shell-driven integration tests in `codex-rs/core` were intermittently unstable, especially:

- `apply_patch_cli_can_use_shell_command_output_as_patch_input`
- `websocket_test_codex_shell_chain`
- `websocket_v2_test_codex_shell_chain`

## Why it was flaky

These tests were exercising real shell-tool flows through whichever shell Codex selected on Windows, and the `apply_patch` test also nested a PowerShell read inside `cmd /c`. There were multiple independent sources of nondeterminism in that setup:

- The test harness depended on the model-selected Windows shell instead of pinning the shell it actually meant to exercise.
- `cmd.exe /c powershell.exe -Command "..."` is quoting-sensitive; on CI that could leave the read command wrapped as a literal string instead of executing it.
- Even after getting the quoting right, PowerShell could emit CLIXML progress records like module-initialization output onto stdout.
- The `apply_patch` test was building a patch directly from shell stdout, so any quoting artifact or progress noise corrupted the patch input.

So the failures were driven by shell startup and output-shape variance, not by the `apply_patch` or websocket logic themselves.

## How this PR fixes it

- Add a test-only `user_shell_override` path so Windows integration tests can pin `cmd.exe` explicitly.
- Use that override in the websocket shell-chain tests and in the `apply_patch` harness.
- Change the nested Windows file read in `apply_patch_cli_can_use_shell_command_output_as_patch_input` to a UTF-8 PowerShell `-EncodedCommand` script.
- Run that nested PowerShell process with `-NonInteractive`, set `$ProgressPreference = 'SilentlyContinue'`, and read the file with `[System.IO.File]::ReadAllText(...)`.

## Why this fix fixes the flakiness

The outer harness now runs under a deterministic shell, and the inner PowerShell read no longer depends on fragile `cmd` quoting or on progress output staying quiet by accident.
The shell tool returns only the file contents, so patch construction and websocket assertions depend on stable test inputs instead of on runner-specific shell behavior.

---------

Co-authored-by: Ahmed Ibrahim <219906144+aibrahim-oai@users.noreply.github.com>
Co-authored-by: Codex <noreply@openai.com>
2026-04-26 15:45:02 +00:00 · 2026-03-17 13:21:46 -07:00
parent 683c37ce75
commit b02388672f
9 changed files with 160 additions and 13 deletions
--- a/codex-rs/core/tests/common/test_codex.rs
+++ b/codex-rs/core/tests/common/test_codex.rs
@@ -13,6 +13,8 @@ use codex_core::built_in_model_providers;
 use codex_core::config::Config;
 use codex_core::features::Feature;
 use codex_core::models_manager::collaboration_mode_presets::CollaborationModesConfig;
+use codex_core::shell::Shell;
+use codex_core::shell::get_shell_by_model_provided_path;
 use codex_protocol::config_types::ServiceTier;
 use codex_protocol::openai_models::ModelsResponse;
 use codex_protocol::protocol::AskForApproval;
@@ -64,6 +66,7 @@ pub struct TestCodexBuilder {
    auth: CodexAuth,
    pre_build_hooks: Vec<Box<PreBuildHook>>,
    home: Option<Arc<TempDir>>,
+    user_shell_override: Option<Shell>,
 }

 impl TestCodexBuilder {
@@ -100,6 +103,19 @@ impl TestCodexBuilder {
        self
    }

+    pub fn with_user_shell(mut self, user_shell: Shell) -> Self {
+        self.user_shell_override = Some(user_shell);
+        self
+    }
+
+    pub fn with_windows_cmd_shell(self) -> Self {
+        if cfg!(windows) {
+            self.with_user_shell(get_shell_by_model_provided_path(&PathBuf::from("cmd.exe")))
+        } else {
+            self
+        }
+    }
+
    pub async fn build(&mut self, server: &wiremock::MockServer) -> anyhow::Result<TestCodex> {
        let home = match self.home.clone() {
            Some(home) => home,
@@ -199,9 +215,23 @@ impl TestCodexBuilder {
            )
        };
        let thread_manager = Arc::new(thread_manager);
+        let user_shell_override = self.user_shell_override.clone();

-        let new_conversation = match resume_from {
-            Some(path) => {
+        let new_conversation = match (resume_from, user_shell_override) {
+            (Some(path), Some(user_shell_override)) => {
+                let auth_manager = codex_core::test_support::auth_manager_from_auth(auth);
+                Box::pin(
+                    codex_core::test_support::resume_thread_from_rollout_with_user_shell_override(
+                        thread_manager.as_ref(),
+                        config.clone(),
+                        path,
+                        auth_manager,
+                        user_shell_override,
+                    ),
+                )
+                .await?
+            }
+            (Some(path), None) => {
                let auth_manager = codex_core::test_support::auth_manager_from_auth(auth);
                Box::pin(thread_manager.resume_thread_from_rollout(
                    config.clone(),
@@ -211,7 +241,17 @@ impl TestCodexBuilder {
                ))
                .await?
            }
-            None => Box::pin(thread_manager.start_thread(config.clone())).await?,
+            (None, Some(user_shell_override)) => {
+                Box::pin(
+                    codex_core::test_support::start_thread_with_user_shell_override(
+                        thread_manager.as_ref(),
+                        config.clone(),
+                        user_shell_override,
+                    ),
+                )
+                .await?
+            }
+            (None, None) => Box::pin(thread_manager.start_thread(config.clone())).await?,
        };

        Ok(TestCodex {
@@ -562,6 +602,7 @@ pub fn test_codex() -> TestCodexBuilder {
        auth: CodexAuth::from_api_key("dummy"),
        pre_build_hooks: vec![],
        home: None,
+        user_shell_override: None,
    }
 }