Compare commits

...

2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Albin Cassirer | 9089c1aaec | Make benchmarks focused on the time spent on commands. | 2025-12-04 15:02:02 +07:00 |
| Albin Cassirer | 6a88b0f465 | Something that seems to work at least. | 2025-12-04 14:07:15 +07:00 |
12 changed files with 939 additions and 14 deletions

View File

@@ -0,0 +1,80 @@
use std::collections::HashMap;
use std::time::Duration;
use std::time::Instant;
use codex_core::exec::DEFAULT_EXEC_COMMAND_TIMEOUT_MS;
use codex_core::exec::ExecExpiration;
use codex_core::exec::ExecParams;
use codex_core::exec::process_exec_tool_call;
use codex_core::protocol::SandboxPolicy;
use codex_core::shell::default_user_shell;
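// Parse `--iterations N` from the CLI args; defaults to 5 and clamps to at least 1.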
fn parse_iterations() -> usize {
let mut args = std::env::args().skip(1);
let mut iterations = 5usize;
while let Some(arg) = args.next() {
if arg == "--iterations" {
if let Some(value) = args.next() {
if let Ok(parsed) = value.parse::<usize>() {
iterations = parsed.max(1);
}
}
}
}
iterations
}
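// Time a single execution of `command` through the exec tool, with the args derived
// for either a login or non-login shell invocation.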
async fn measure(command: &str, use_login_shell: bool) -> anyhow::Result<Duration> {
let cwd = std::env::current_dir()?;
let env: HashMap<String, String> = std::env::vars().collect();
let shell = default_user_shell();
let args = shell.derive_exec_args(command, use_login_shell);
let params = ExecParams {
command: args,
cwd,
expiration: ExecExpiration::from(DEFAULT_EXEC_COMMAND_TIMEOUT_MS),
env,
with_escalated_permissions: None,
justification: None,
arg0: None,
};
let start = Instant::now();
let _ = process_exec_tool_call(
params,
&SandboxPolicy::DangerFullAccess,
&std::env::current_dir()?,
&None,
None,
)
.await?;
Ok(start.elapsed())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let iterations = parse_iterations();
let commands = ["ls", "rg --files", "git status"];
println!("Non-login shell heuristic benchmark (iterations: {iterations})");
for command in commands {
let mut login_times = Vec::with_capacity(iterations);
let mut non_login_times = Vec::with_capacity(iterations);
for _ in 0..iterations {
login_times.push(measure(command, true).await?);
non_login_times.push(measure(command, false).await?);
}
let avg_login_ms =
login_times.iter().map(Duration::as_secs_f64).sum::<f64>() * 1000.0 / iterations as f64;
let avg_non_login_ms = non_login_times
.iter()
.map(Duration::as_secs_f64)
.sum::<f64>()
* 1000.0
/ iterations as f64;
println!(
"{command:12} login: {avg_login_ms:>8.3} ms | non-login: {avg_non_login_ms:>8.3} ms"
);
}
Ok(())
}

View File

@@ -289,6 +289,8 @@ pub(crate) struct TurnContext {
pub(crate) approval_policy: AskForApproval,
pub(crate) sandbox_policy: SandboxPolicy,
pub(crate) shell_environment_policy: ShellEnvironmentPolicy,
pub(crate) non_login_shell_allowlist: Vec<String>,
pub(crate) non_login_shell_heuristic_enabled: bool,
pub(crate) tools_config: ToolsConfig,
pub(crate) final_output_json_schema: Option<Value>,
pub(crate) codex_linux_sandbox_exe: Option<PathBuf>,
@@ -405,6 +407,7 @@ impl Session {
) -> TurnContext {
let config = session_configuration.original_config_do_not_use.clone();
let features = &config.features;
let non_login_shell_heuristic_enabled = features.enabled(Feature::NonLoginShellHeuristic);
let model_family = models_manager.construct_model_family(&session_configuration.model);
let mut per_turn_config = (*config).clone();
per_turn_config.model = session_configuration.model.clone();
@@ -448,6 +451,8 @@ impl Session {
approval_policy: session_configuration.approval_policy,
sandbox_policy: session_configuration.sandbox_policy.clone(),
shell_environment_policy: config.shell_environment_policy.clone(),
non_login_shell_allowlist: config.non_login_shell_allowlist.clone(),
non_login_shell_heuristic_enabled,
tools_config,
final_output_json_schema: None,
codex_linux_sandbox_exe: config.codex_linux_sandbox_exe.clone(),
@@ -1888,6 +1893,8 @@ async fn spawn_review_thread(
approval_policy: parent_turn_context.approval_policy,
sandbox_policy: parent_turn_context.sandbox_policy.clone(),
shell_environment_policy: parent_turn_context.shell_environment_policy.clone(),
non_login_shell_allowlist: parent_turn_context.non_login_shell_allowlist.clone(),
non_login_shell_heuristic_enabled: parent_turn_context.non_login_shell_heuristic_enabled,
cwd: parent_turn_context.cwd.clone(),
final_output_json_schema: None,
codex_linux_sandbox_exe: parent_turn_context.codex_linux_sandbox_exe.clone(),

View File

@@ -1,6 +1,11 @@
use crate::bash::parse_shell_lc_plain_commands;
use crate::command_safety::windows_safe_commands::is_safe_command_windows;
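/// Commands eligible to skip a login shell when the `non_login_shell_heuristic`
/// feature is enabled; user entries from `[shell].non_login_allowlist` are merged
/// on top of this default list.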
pub const DEFAULT_NON_LOGIN_SHELL_ALLOWLIST: &[&str] = &[
"cat", "cd", "echo", "false", "find", "git", "grep", "head", "ls", "nl", "pwd", "rg", "rm",
"rmdir", "sed", "tail", "true", "wc",
];
pub fn is_known_safe_command(command: &[String]) -> bool {
let command: Vec<String> = command
.iter()
@@ -43,6 +48,20 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
return false;
};
if cmd0 == "rm" {
return command
.iter()
.skip(1)
.all(|arg| !arg.starts_with('-') && !arg.starts_with("--"));
}
if cmd0 == "rmdir" {
return command
.iter()
.skip(1)
.all(|arg| !arg.starts_with('-') && !arg.starts_with("--"));
}
match std::path::Path::new(&cmd0)
.file_name()
.and_then(|osstr| osstr.to_str())
@@ -60,8 +79,7 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
"pwd" |
"tail" |
"true" |
"wc" |
"which") => {
"wc") => {
true
},
@@ -113,9 +131,6 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
Some("branch" | "status" | "log" | "diff" | "show")
),
// Rust
Some("cargo") if command.get(1).map(String::as_str) == Some("check") => true,
// Special-case `sed -n {N|M,N}p`
Some("sed")
if {
@@ -267,6 +282,17 @@ mod tests {
}
}
#[test]
fn rm_rules() {
assert!(is_safe_to_call_with_exec(&vec_str(&["rm", "file.txt"])));
assert!(!is_safe_to_call_with_exec(&vec_str(&["rm", "-rf", "/"])));
assert!(is_safe_to_call_with_exec(&vec_str(&["rmdir", "tmp_dir"])));
assert!(!is_safe_to_call_with_exec(&vec_str(&[
"rmdir", "-p", "tmp_dir"
])));
}
#[test]
fn windows_powershell_full_path_is_safe() {
if !cfg!(windows) {
@@ -313,6 +339,16 @@ mod tests {
])));
}
#[test]
fn bash_c_safe_examples() {
assert!(is_known_safe_command(&vec_str(&["bash", "-c", "ls"])));
assert!(is_known_safe_command(&vec_str(&[
"bash",
"-c",
"git status"
])));
}
#[test]
fn bash_lc_safe_examples_with_operators() {
assert!(is_known_safe_command(&vec_str(&[

View File

@@ -48,12 +48,14 @@ use dunce::canonicalize;
use serde::Deserialize;
use similar::DiffableStr;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::io::ErrorKind;
use std::path::Path;
use std::path::PathBuf;
use crate::config::profile::ConfigProfile;
use crate::is_safe_command::DEFAULT_NON_LOGIN_SHELL_ALLOWLIST;
use toml::Value as TomlValue;
use toml_edit::DocumentMut;
@@ -109,6 +111,9 @@ pub struct Config {
pub forced_auto_mode_downgraded_on_windows: bool,
pub shell_environment_policy: ShellEnvironmentPolicy,
/// Commands considered safe to run without a login shell when the heuristic
/// feature flag is enabled.
pub non_login_shell_allowlist: Vec<String>,
/// When `true`, `AgentReasoning` events emitted by the backend will be
/// suppressed from the frontend output. This can reduce visual noise when
@@ -584,6 +589,10 @@ pub struct ConfigToml {
#[serde(default)]
pub shell_environment_policy: ShellEnvironmentPolicyToml,
/// Shell-specific configuration.
#[serde(default)]
pub shell: Option<crate::config::types::ShellConfigToml>,
/// Sandbox mode to use.
pub sandbox_mode: Option<SandboxMode>,
@@ -998,6 +1007,24 @@ impl Config {
crate::safety::set_windows_sandbox_enabled(features.enabled(Feature::WindowsSandbox));
}
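// Merge the built-in allow-list with any user-provided `[shell].non_login_allowlist`
// entries, trimming whitespace and de-duplicating via a BTreeSet.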
let non_login_shell_allowlist: Vec<String> = {
let mut merged = BTreeSet::new();
for cmd in DEFAULT_NON_LOGIN_SHELL_ALLOWLIST {
merged.insert((*cmd).to_string());
}
if let Some(shell_cfg) = &cfg.shell
&& let Some(user_allowlist) = shell_cfg.non_login_allowlist.as_ref()
{
for cmd in user_allowlist {
let trimmed = cmd.trim();
if !trimmed.is_empty() {
merged.insert(trimmed.to_string());
}
}
}
merged.into_iter().collect()
};
let resolved_cwd = {
use std::env;
@@ -1183,6 +1210,7 @@ impl Config {
did_user_set_custom_approval_policy_or_sandbox_mode,
forced_auto_mode_downgraded_on_windows,
shell_environment_policy,
non_login_shell_allowlist,
notify: cfg.notify,
user_instructions,
base_instructions,
@@ -2963,6 +2991,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),
@@ -3037,6 +3069,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),
@@ -3126,6 +3162,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),
@@ -3201,6 +3241,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),

View File

@@ -459,6 +459,15 @@ pub struct ShellEnvironmentPolicyToml {
pub experimental_use_profile: Option<bool>,
}
#[derive(Deserialize, Debug, Clone, Default, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub struct ShellConfigToml {
/// Additional commands that can skip login shells when the
/// `non_login_shell_heuristic` feature flag is enabled.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub non_login_allowlist: Option<Vec<String>>,
}
pub type EnvironmentVariablePattern = WildMatchPattern<'*', '?'>;
/// Deriving the `env` based on this policy works as follows:

View File

@@ -55,6 +55,8 @@ pub enum Feature {
Skills,
/// Send warnings to the model to correct it on the tool usage.
ModelWarnings,
/// Skip login shell for known-safe commands when enabled.
NonLoginShellHeuristic,
}
impl Feature {
@@ -342,4 +344,10 @@ pub const FEATURES: &[FeatureSpec] = &[
stage: Stage::Experimental,
default_enabled: false,
},
FeatureSpec {
id: Feature::NonLoginShellHeuristic,
key: "non_login_shell_heuristic",
stage: Stage::Experimental,
default_enabled: false,
},
];

View File

@@ -1,5 +1,8 @@
use crate::is_safe_command::is_known_safe_command;
use serde::Deserialize;
use serde::Serialize;
use shlex::split as shlex_split;
use std::path::Path;
use std::path::PathBuf;
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
@@ -60,6 +63,51 @@ impl Shell {
}
}
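/// Decide whether a shell command should be run with a login shell (`-l`).
/// The login shell is skipped when the caller did not request one, or when the
/// heuristic is enabled, the shell is bash/zsh/sh, the command's first token is
/// on the allow-list, and the derived command is known-safe; otherwise the
/// requested behavior is preserved.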
pub fn should_use_login_shell(
requested_login_shell: bool,
command: &str,
shell: &Shell,
heuristic_enabled: bool,
allowlist: &[String],
) -> bool {
if !requested_login_shell {
return false;
}
if !heuristic_enabled {
return true;
}
if !matches!(
shell.shell_type,
ShellType::Bash | ShellType::Zsh | ShellType::Sh
) {
return requested_login_shell;
}
let is_allowlisted = shlex_split(command)
.and_then(|tokens| {
tokens.first().map(|token| {
let token_name = Path::new(token)
.file_name()
.and_then(|name| name.to_str())
.unwrap_or(token.as_str());
allowlist
.iter()
.any(|cmd| cmd == token || cmd == token_name)
})
})
.unwrap_or(false);
if !is_allowlisted {
return true;
}
let derived = shell.derive_exec_args(command, false);
if is_known_safe_command(&derived) {
return false;
}
true
}
#[cfg(unix)]
fn get_user_shell_path() -> Option<PathBuf> {
use libc::getpwuid;
@@ -359,7 +407,8 @@ mod tests {
assert!(
shell_path == PathBuf::from("/bin/bash")
|| shell_path == PathBuf::from("/usr/bin/bash")
|| shell_path == PathBuf::from("/usr/local/bin/bash"),
|| shell_path == PathBuf::from("/usr/local/bin/bash")
|| shell_path == PathBuf::from("/opt/homebrew/bin/bash"),
"shell path: {shell_path:?}",
);
}
@@ -451,4 +500,23 @@ mod tests {
assert!(shell_path.ends_with("pwsh.exe") || shell_path.ends_with("powershell.exe"));
}
#[test]
fn non_login_shell_heuristic_respects_allowlist() {
let shell = Shell {
shell_type: ShellType::Bash,
shell_path: PathBuf::from("/bin/bash"),
};
let allowlist = vec!["ls".to_string()];
assert!(!should_use_login_shell(
true, "ls -la", &shell, true, &allowlist
));
assert!(should_use_login_shell(
true,
"awk '{print $1}' file.txt",
&shell,
true,
&allowlist
));
}
}

View File

@@ -24,6 +24,7 @@ use crate::protocol::ExecCommandSource;
use crate::protocol::SandboxPolicy;
use crate::protocol::TaskStartedEvent;
use crate::sandboxing::ExecEnv;
use crate::shell::should_use_login_shell;
use crate::state::TaskKind;
use crate::tools::format_exec_output_str;
use crate::user_shell_command::user_shell_command_record_item;
@@ -66,7 +67,13 @@ impl SessionTask for UserShellCommandTask {
// Execute the user's script under their default shell when known; this
// allows commands that use shell features (pipes, &&, redirects, etc.).
// We do not source rc files or otherwise reformat the script.
let use_login_shell = true;
let use_login_shell = should_use_login_shell(
true,
&self.command,
session.user_shell(),
turn_context.non_login_shell_heuristic_enabled,
&turn_context.non_login_shell_allowlist,
);
let command = session
.user_shell()
.derive_exec_args(&self.command, use_login_shell);

View File

@@ -11,6 +11,7 @@ use crate::function_tool::FunctionCallError;
use crate::is_safe_command::is_known_safe_command;
use crate::protocol::ExecCommandSource;
use crate::sandboxing::SandboxPermissions;
use crate::shell::should_use_login_shell;
use crate::tools::context::ToolInvocation;
use crate::tools::context::ToolOutput;
use crate::tools::context::ToolPayload;
@@ -49,7 +50,13 @@ impl ShellCommandHandler {
turn_context: &TurnContext,
) -> ExecParams {
let shell = session.user_shell();
let use_login_shell = true;
let use_login_shell = should_use_login_shell(
true,
&params.command,
shell,
turn_context.non_login_shell_heuristic_enabled,
&turn_context.non_login_shell_allowlist,
);
let command = shell.derive_exec_args(&params.command, use_login_shell);
ExecParams {

View File

@@ -8,6 +8,7 @@ use crate::protocol::ExecCommandSource;
use crate::protocol::ExecOutputStream;
use crate::shell::default_user_shell;
use crate::shell::get_shell_by_model_provided_path;
use crate::shell::should_use_login_shell;
use crate::tools::context::ToolInvocation;
use crate::tools::context::ToolOutput;
use crate::tools::context::ToolPayload;
@@ -93,7 +94,15 @@ impl ToolHandler for UnifiedExecHandler {
let Ok(params) = serde_json::from_str::<ExecCommandArgs>(arguments) else {
return true;
};
let command = get_command(&params);
let shell = resolve_shell(&params);
let use_login_shell = should_use_login_shell(
params.login,
&params.cmd,
&shell,
invocation.turn.non_login_shell_heuristic_enabled,
&invocation.turn.non_login_shell_allowlist,
);
let command = shell.derive_exec_args(&params.cmd, use_login_shell);
!is_known_safe_command(&command)
}
@@ -130,7 +139,15 @@ impl ToolHandler for UnifiedExecHandler {
})?;
let process_id = manager.allocate_process_id().await;
let command = get_command(&args);
let shell = resolve_shell(&args);
let use_login_shell = should_use_login_shell(
args.login,
&args.cmd,
&shell,
turn.non_login_shell_heuristic_enabled,
&turn.non_login_shell_allowlist,
);
let command = shell.derive_exec_args(&args.cmd, use_login_shell);
let ExecCommandArgs {
workdir,
yield_time_ms,
@@ -253,14 +270,18 @@ impl ToolHandler for UnifiedExecHandler {
}
}
#[cfg(test)]
fn get_command(args: &ExecCommandArgs) -> Vec<String> {
let shell = if let Some(shell_str) = &args.shell {
let shell = resolve_shell(args);
shell.derive_exec_args(&args.cmd, args.login)
}
fn resolve_shell(args: &ExecCommandArgs) -> crate::shell::Shell {
if let Some(shell_str) = &args.shell {
get_shell_by_model_provided_path(&PathBuf::from(shell_str))
} else {
default_user_shell()
};
shell.derive_exec_args(&args.cmd, args.login)
}
}
fn format_response(response: &UnifiedExecResponse) -> String {

View File

@@ -0,0 +1,615 @@
#!/usr/bin/env python3
"""
Run codex exec end-to-end with and without the non_login_shell_heuristic feature.
This script repeatedly runs:
- codex exec with the heuristic disabled
- codex exec with the heuristic enabled
for one or more prompts and reports per-run wall times, per-prompt summaries,
and combined summary statistics.
Runs can be parallelized (concurrency > 1) to speed up sampling, but note that
parallel rollouts may contend for local/remote resources and slightly skew
latency compared to strictly serial runs.
Usage:
python scripts/bench_codex_exec_non_login.py \
--iterations 5 \
--prompt "Read and summarize codex_berry" \
--prompt "Explain the architecture" \
--workdir /path/to/repo \
--model gpt-5.1-codex-max \
--reasoning-effort high \
--concurrency 2 \
--codex-bin /path/to/codex \
--skip-feature-toggle # when benchmarking an older codex binary
Notes:
- Runs will incur network/LLM variance; prefer N >= 5–10.
- Requires the `codex` binary (local path supported) and valid credentials.
- Results are printed in seconds; failures are logged and included in the output.
"""
import argparse
import asyncio
import json
import math
import statistics
import sys
import time
from dataclasses import dataclass
from typing import List
from typing import Optional
@dataclass
class RunResult:
duration: float
exit_code: int
stderr: str
command_time: float
command_count: int
mcp_call_count: int
usage: Optional["Usage"]
@dataclass
class Usage:
input_tokens: int
cached_input_tokens: int
output_tokens: int
async def run_once(
base_cmd: List[str],
prompt: str,
feature_enabled: bool,
toggle_feature: bool,
sem: asyncio.Semaphore,
) -> RunResult:
cmd = list(base_cmd)
if toggle_feature:
cmd.extend(
[
"-c",
f"features.non_login_shell_heuristic={'true' if feature_enabled else 'false'}",
]
)
cmd.append(prompt)
async with sem:
start = time.perf_counter()
proc = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
)
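# Parse the streamed `--json` events: time each command_execution item from
# item.started to item.completed, count MCP tool calls, and capture token usage
# from turn.completed.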
async def read_stdout() -> tuple[float, int, int, Optional[Usage]]:
command_start_times: dict[str, float] = {}
command_time = 0.0
command_count = 0
mcp_call_count = 0
usage: Optional[Usage] = None
buffer = b""
def handle_line(raw_line: bytes) -> None:
nonlocal command_time, command_count, mcp_call_count, usage
ts = time.perf_counter()
line = raw_line.decode(errors="replace").strip()
if not line:
return
try:
event = json.loads(line)
except json.JSONDecodeError:
return
event_type = event.get("type")
if event_type == "turn.completed" and (usage_info := event.get("usage")):
usage = Usage(
input_tokens=usage_info.get("input_tokens", 0),
cached_input_tokens=usage_info.get("cached_input_tokens", 0),
output_tokens=usage_info.get("output_tokens", 0),
)
if not event_type or not event_type.startswith("item."):
return
item = event.get("item") or {}
item_type = item.get("type")
item_id = item.get("id")
if item_type == "command_execution":
if event_type == "item.started" and item_id:
command_start_times[item_id] = ts
command_count += 1
elif event_type == "item.completed" and item_id:
if start_ts := command_start_times.pop(item_id, None):
command_time += ts - start_ts
elif item_type == "mcp_tool_call":
if event_type == "item.started":
mcp_call_count += 1
assert proc.stdout is not None
try:
while True:
chunk = await proc.stdout.read(4096)
if not chunk:
break
buffer += chunk
lines = buffer.split(b"\n")
buffer = lines.pop() if lines else b""
for raw_line in lines:
handle_line(raw_line)
except asyncio.CancelledError:
# Allow cancellation to propagate but keep what we have so far.
raise
finally:
if buffer:
handle_line(buffer)
return command_time, command_count, mcp_call_count, usage
stdout_task = asyncio.create_task(read_stdout())
stderr_task = (
asyncio.create_task(proc.stderr.read()) if proc.stderr is not None else None
)
await proc.wait()
stderr_bytes = await stderr_task if stderr_task is not None else b""
command_time, command_count, mcp_call_count, usage = await stdout_task
duration = time.perf_counter() - start
stderr_text = stderr_bytes.decode(errors="replace").strip() if stderr_bytes else ""
return RunResult(
duration=duration,
exit_code=proc.returncode,
stderr=stderr_text,
command_time=command_time,
command_count=command_count,
mcp_call_count=mcp_call_count,
usage=usage,
)
async def run_case(
label: str,
base_cmd: List[str],
prompt: str,
iterations: int,
feature_enabled: bool,
toggle_feature: bool,
concurrency: int,
) -> tuple[
list[float], list[float], list[float], list[int], list[int], list[Usage], int
]:
sem = asyncio.Semaphore(concurrency)
tasks = [
asyncio.create_task(
run_once(base_cmd, prompt, feature_enabled, toggle_feature, sem)
)
for _ in range(iterations)
]
durations: List[float] = []
command_times: List[float] = []
command_times_per_cmd: List[float] = []
command_counts: List[int] = []
mcp_call_counts: List[int] = []
usages: List[Usage] = []
failures = 0
for idx, task in enumerate(asyncio.as_completed(tasks), start=1):
result = await task
durations.append(result.duration)
command_times.append(result.command_time)
if result.command_count > 0:
command_times_per_cmd.append(result.command_time / result.command_count)
command_counts.append(result.command_count)
mcp_call_counts.append(result.mcp_call_count)
if result.usage:
usages.append(result.usage)
status = "ok" if result.exit_code == 0 else f"fail ({result.exit_code})"
cmd_time_str = f" cmd_time={result.command_time:.3f}s cmds={result.command_count}"
mcp_str = f" mcp_calls={result.mcp_call_count}" if result.mcp_call_count else ""
print(
f"[{label}] run {idx}/{iterations}: {result.duration:.3f}s"
f" [{status}]{cmd_time_str}{mcp_str}"
)
if result.exit_code != 0:
failures += 1
if result.stderr:
print(f" stderr: {result.stderr}", file=sys.stderr)
return (
durations,
command_times,
command_times_per_cmd,
command_counts,
mcp_call_counts,
usages,
failures,
)
def summarize(label: str, durations: List[float]) -> None:
if not durations:
print(f"[{label}] no runs recorded")
return
mean = statistics.mean(durations)
median = statistics.median(durations)
if len(durations) < 2:
p95 = durations[0]
else:
p95 = statistics.quantiles(durations, n=100)[94]
margin = confidence_margin(durations)
print(
f"[{label}] n={len(durations)} "
f"mean={mean:.3f}{margin:.3f}s median={median:.3f}s p95={p95:.3f}s"
)
def summarize_command_time(label: str, command_times: List[float], command_counts: List[int]) -> None:
if not command_times:
print(f"[{label}] command time: no runs recorded")
return
mean = statistics.mean(command_times)
median = statistics.median(command_times)
p95 = command_times[0] if len(command_times) < 2 else statistics.quantiles(command_times, n=100)[94]
avg_cmds = statistics.mean(command_counts) if command_counts else 0
margin = confidence_margin(command_times)
print(
f"[{label}] command time n={len(command_times)} "
f"mean={mean:.3f}{margin:.3f}s median={median:.3f}s p95={p95:.3f}s avg_cmds={avg_cmds:.2f}"
)
def summarize_command_time_per_command(label: str, command_times_per_cmd: List[float]) -> None:
if not command_times_per_cmd:
print(f"[{label}] command time per command: no runs recorded")
return
mean = statistics.mean(command_times_per_cmd)
median = statistics.median(command_times_per_cmd)
p95 = (
command_times_per_cmd[0]
if len(command_times_per_cmd) < 2
else statistics.quantiles(command_times_per_cmd, n=100)[94]
)
margin = confidence_margin(command_times_per_cmd)
print(
f"[{label}] command time per command n={len(command_times_per_cmd)} "
f"mean={mean:.3f}{margin:.3f}s median={median:.3f}s p95={p95:.3f}s"
)
def summarize_usage(label: str, usages: List[Usage]) -> None:
if not usages:
print(f"[{label}] tokens: no runs recorded")
return
avg_input = statistics.mean(u.input_tokens for u in usages)
avg_cached = statistics.mean(u.cached_input_tokens for u in usages)
avg_output = statistics.mean(u.output_tokens for u in usages)
print(
f"[{label}] tokens avg input={avg_input:.1f} cached={avg_cached:.1f} output={avg_output:.1f}"
)
def summarize_mcp_calls(label: str, mcp_counts: List[int]) -> None:
if not mcp_counts:
print(f"[{label}] mcp calls: no runs recorded")
return
avg_mcp = statistics.mean(mcp_counts)
max_mcp = max(mcp_counts)
print(f"[{label}] mcp calls avg={avg_mcp:.2f} max={max_mcp}")
def format_prompt_label(prompt: str, idx: int) -> str:
snippet = prompt.strip()
if len(snippet) > 60:
snippet = snippet[:57] + "..."
return f"prompt {idx + 1}: {snippet}"
def confidence_margin(values: List[float]) -> float:
if len(values) < 2:
return 0.0
stdev = statistics.stdev(values)
return 1.96 * stdev / math.sqrt(len(values))
def stats(values: List[float]) -> dict:
if not values:
return {
"n": 0,
"mean": None,
"median": None,
"p95": None,
"margin": None,
}
mean = statistics.mean(values)
median = statistics.median(values)
if len(values) < 2:
p95 = values[0]
else:
p95 = statistics.quantiles(values, n=100)[94]
margin = confidence_margin(values)
return {"n": len(values), "mean": mean, "median": median, "p95": p95, "margin": margin}
def safe_avg(values: List[float]) -> Optional[float]:
return statistics.mean(values) if values else None
def fmt_stat(s: dict) -> str:
if not s["n"]:
return "-"
return f"{s['mean']:.3f}±{s['margin']:.3f}"
def fmt_avg(value: Optional[float]) -> str:
return f"{value:.2f}" if value is not None else "-"
def fmt_int(value: Optional[int]) -> str:
return str(value) if value is not None else "-"
def render_table(headers: List[str], rows: List[List[str]]) -> None:
widths = [len(h) for h in headers]
for row in rows:
for i, cell in enumerate(row):
widths[i] = max(widths[i], len(cell))
fmt = " ".join(f"{{:<{w}}}" for w in widths)
print(fmt.format(*headers))
print(fmt.format(*["-" * w for w in widths]))
for row in rows:
print(fmt.format(*row))
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Benchmark codex exec with/without non_login_shell_heuristic."
)
parser.add_argument(
"--codex-bin",
default="codex",
help="Path to the codex binary (default: codex on PATH).",
)
parser.add_argument(
"--iterations",
type=int,
default=5,
help="Number of runs per configuration (default: 5)",
)
parser.add_argument(
"--prompt",
dest="prompts",
action="append",
required=True,
help="Prompt to send to codex exec (repeat for multiple prompts).",
)
parser.add_argument(
"--workdir",
default=None,
help="Working directory for codex exec (--cd); defaults to current dir.",
)
parser.add_argument(
"--model",
default=None,
help="Optional model override passed to codex exec (--model).",
)
parser.add_argument(
"--reasoning-effort",
choices=["none", "minimal", "low", "medium", "high", "xhigh"],
default=None,
help="Optional reasoning effort override passed via -c model_reasoning_effort=...",
)
parser.add_argument(
"--extra-config",
action="append",
default=[],
help="Additional -c overrides passed to codex exec (repeatable).",
)
parser.add_argument(
"--concurrency",
type=int,
default=1,
help="Number of concurrent runs to launch per configuration (default: 1).",
)
parser.add_argument(
"--skip-feature-toggle",
action="store_true",
help=(
"Do not inject the non_login_shell_heuristic flag; "
"use when benchmarking an older codex binary."
),
)
parser.add_argument(
"--verbose",
action="store_true",
help="Print codex exec stdout/stderr for debugging.",
)
return parser.parse_args()
def main() -> int:
args = parse_args()
base_cmd: List[str] = [args.codex_bin, "exec"]
if args.workdir:
base_cmd.extend(["--cd", args.workdir])
if args.model:
base_cmd.extend(["--model", args.model])
if args.reasoning_effort:
base_cmd.extend(["-c", f"model_reasoning_effort={args.reasoning_effort}"])
for override in args.extra_config:
base_cmd.extend(["-c", override])
base_cmd.append("--json")
toggle_feature = not args.skip_feature_toggle
prompts = args.prompts
print(
f"Running {args.iterations} iterations per config for {len(prompts)} prompt(s)..."
)
all_off_durations: List[float] = []
all_on_durations: List[float] = []
all_off_command_times: List[float] = []
all_on_command_times: List[float] = []
all_off_command_times_per_cmd: List[float] = []
all_on_command_times_per_cmd: List[float] = []
all_off_command_counts: List[int] = []
all_on_command_counts: List[int] = []
all_off_mcp_calls: List[int] = []
all_on_mcp_calls: List[int] = []
all_off_usages: List[Usage] = []
all_on_usages: List[Usage] = []
total_off_failures = 0
total_on_failures = 0
per_prompt_results: list[dict] = []
for idx, prompt in enumerate(prompts):
label = format_prompt_label(prompt, idx)
print(f"\nPrompt {idx + 1}/{len(prompts)}: {label}")
(
off_durations,
off_cmd_times,
off_cmd_times_per_cmd,
off_cmd_counts,
off_mcp_calls,
off_usages,
off_failures,
) = asyncio.run(
run_case(
f"login-shell | {label}",
base_cmd,
prompt,
args.iterations,
feature_enabled=False,
toggle_feature=toggle_feature,
concurrency=args.concurrency,
)
)
(
on_durations,
on_cmd_times,
on_cmd_times_per_cmd,
on_cmd_counts,
on_mcp_calls,
on_usages,
on_failures,
) = asyncio.run(
run_case(
f"non-login-shell | {label}",
base_cmd,
prompt,
args.iterations,
feature_enabled=True,
toggle_feature=toggle_feature,
concurrency=args.concurrency,
)
)
per_prompt_results.append(
{
"label": label,
"off_durations": off_durations,
"on_durations": on_durations,
"off_cmd_times": off_cmd_times,
"on_cmd_times": on_cmd_times,
"off_cmd_times_per_cmd": off_cmd_times_per_cmd,
"on_cmd_times_per_cmd": on_cmd_times_per_cmd,
"off_cmd_counts": off_cmd_counts,
"on_cmd_counts": on_cmd_counts,
"off_mcp_calls": off_mcp_calls,
"on_mcp_calls": on_mcp_calls,
"off_usages": off_usages,
"on_usages": on_usages,
"off_failures": off_failures,
"on_failures": on_failures,
}
)
all_off_durations.extend(off_durations)
all_on_durations.extend(on_durations)
all_off_command_times.extend(off_cmd_times)
all_on_command_times.extend(on_cmd_times)
all_off_command_times_per_cmd.extend(off_cmd_times_per_cmd)
all_on_command_times_per_cmd.extend(on_cmd_times_per_cmd)
all_off_command_counts.extend(off_cmd_counts)
all_on_command_counts.extend(on_cmd_counts)
all_off_mcp_calls.extend(off_mcp_calls)
all_on_mcp_calls.extend(on_mcp_calls)
all_off_usages.extend(off_usages)
all_on_usages.extend(on_usages)
total_off_failures += off_failures
total_on_failures += on_failures
print("\nPer-prompt summary (means ±95% CI):")
per_prompt_rows: List[List[str]] = []
for s in per_prompt_results:
per_prompt_rows.append(
[
s["label"],
fmt_stat(stats(s["off_durations"])),
fmt_stat(stats(s["on_durations"])),
fmt_stat(stats(s["off_cmd_times"])),
fmt_stat(stats(s["on_cmd_times"])),
fmt_stat(stats(s["off_cmd_times_per_cmd"])),
fmt_stat(stats(s["on_cmd_times_per_cmd"])),
fmt_avg(safe_avg(s["off_cmd_counts"])),
fmt_avg(safe_avg(s["on_cmd_counts"])),
fmt_avg(safe_avg(s["off_mcp_calls"])),
fmt_avg(safe_avg(s["on_mcp_calls"])),
]
)
render_table(
[
"prompt",
"wall login",
"wall non",
"cmd login",
"cmd non",
"cmd/call login",
"cmd/call non",
"avg cmds login",
"avg cmds non",
"avg mcp login",
"avg mcp non",
],
per_prompt_rows,
)
print("\nCombined summary (means ±95% CI):")
combined_rows = [
[
"all prompts",
fmt_stat(stats(all_off_durations)),
fmt_stat(stats(all_on_durations)),
fmt_stat(stats(all_off_command_times)),
fmt_stat(stats(all_on_command_times)),
fmt_stat(stats(all_off_command_times_per_cmd)),
fmt_stat(stats(all_on_command_times_per_cmd)),
fmt_avg(safe_avg(all_off_command_counts)),
fmt_avg(safe_avg(all_on_command_counts)),
fmt_avg(safe_avg(all_off_mcp_calls)),
fmt_avg(safe_avg(all_on_mcp_calls)),
]
]
render_table(
[
"scope",
"wall login",
"wall non",
"cmd login",
"cmd non",
"cmd/call login",
"cmd/call non",
"avg cmds login",
"avg cmds non",
"avg mcp login",
"avg mcp non",
],
combined_rows,
)
print(f"\nlogin-shell failures (all prompts): {total_off_failures}")
print(f"non-login-shell failures (all prompts): {total_on_failures}")
return 0 if (total_off_failures + total_on_failures) == 0 else 1
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -48,6 +48,7 @@ Supported features:
| `web_search_request` | false | Stable | Allow the model to issue web searches |
| `experimental_sandbox_command_assessment` | false | Experimental | Enable model-based sandbox risk assessment |
| `ghost_commit` | false | Experimental | Create a ghost commit each turn |
| `non_login_shell_heuristic` | false | Experimental | Skip login shells for allow-listed safe commands |
| `enable_experimental_windows_sandbox` | false | Experimental | Use the Windows restricted-token sandbox |
Notes:
@@ -416,6 +417,28 @@ set = { PATH = "/usr/bin", MY_FLAG = "1" }
Currently, `CODEX_SANDBOX_NETWORK_DISABLED=1` is also added to the environment, assuming network is disabled. This is not configurable.
### shell.non_login_allowlist
When the `non_login_shell_heuristic` feature flag is enabled, Codex will skip `-l`
for obviously safe commands (e.g., `ls`, `rg --files`) to avoid loading login
profiles on every exec. You can extend the allow-list via `config.toml`:
```toml
[shell]
non_login_allowlist = ["ls", "rg", "git", "cat", "rm", "rmdir"]
```
The defaults cover common low-risk commands (`ls`, `cat`, `rg`, `git status`,
`sed -n`, plain `rm <file>`, `rmdir <dir>`, etc.); entries you add are merged on top.
The heuristic only applies when the feature flag is enabled, and it keeps using
login shells for commands that are not known-safe.
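Roughly, with the default allow-list the decision behaves like this (an illustrative
sketch drawn from the heuristic's tests, not an exhaustive spec):

```text
ls -la                      # allow-listed and known-safe       -> non-login shell
git status                  # allow-listed and known-safe       -> non-login shell
rm -rf build                # `rm` with flags is not known-safe -> login shell
awk '{print $1}' file.txt   # `awk` is not on the allow-list    -> login shell
```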
You can compare login vs non-login startup costs locally via:
```bash
cargo run -p codex-core --example non_login_shell_bench -- --iterations 5
```
## MCP integration
### mcp_servers