Compare commits

...

2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Albin Cassirer | 9089c1aaec | Make benchmarks focused on the time spent on commands. | 2025-12-04 15:02:02 +07:00 |
| Albin Cassirer | 6a88b0f465 | Something that seems to work at least. | 2025-12-04 14:07:15 +07:00 |
12 changed files with 939 additions and 14 deletions

View File

@@ -0,0 +1,80 @@
use std::collections::HashMap;
use std::time::Duration;
use std::time::Instant;
use codex_core::exec::DEFAULT_EXEC_COMMAND_TIMEOUT_MS;
use codex_core::exec::ExecExpiration;
use codex_core::exec::ExecParams;
use codex_core::exec::process_exec_tool_call;
use codex_core::protocol::SandboxPolicy;
use codex_core::shell::default_user_shell;
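// Parse `--iterations N` from the CLI args; defaults to 5 and clamps to at least 1.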
fn parse_iterations() -> usize {
let mut args = std::env::args().skip(1);
let mut iterations = 5usize;
while let Some(arg) = args.next() {
if arg == "--iterations" {
if let Some(value) = args.next() {
if let Ok(parsed) = value.parse::<usize>() {
iterations = parsed.max(1);
}
}
}
}
iterations
}
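// Time a single execution of `command` through the exec tool, with the args derived
// for either a login or non-login shell invocation.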
async fn measure(command: &str, use_login_shell: bool) -> anyhow::Result<Duration> {
let cwd = std::env::current_dir()?;
let env: HashMap<String, String> = std::env::vars().collect();
let shell = default_user_shell();
let args = shell.derive_exec_args(command, use_login_shell);
let params = ExecParams {
command: args,
cwd,
expiration: ExecExpiration::from(DEFAULT_EXEC_COMMAND_TIMEOUT_MS),
env,
with_escalated_permissions: None,
justification: None,
arg0: None,
};
let start = Instant::now();
let _ = process_exec_tool_call(
params,
&SandboxPolicy::DangerFullAccess,
&std::env::current_dir()?,
&None,
None,
)
.await?;
Ok(start.elapsed())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let iterations = parse_iterations();
let commands = ["ls", "rg --files", "git status"];
println!("Non-login shell heuristic benchmark (iterations: {iterations})");
for command in commands {
let mut login_times = Vec::with_capacity(iterations);
let mut non_login_times = Vec::with_capacity(iterations);
for _ in 0..iterations {
login_times.push(measure(command, true).await?);
non_login_times.push(measure(command, false).await?);
}
let avg_login_ms =
login_times.iter().map(Duration::as_secs_f64).sum::<f64>() * 1000.0 / iterations as f64;
let avg_non_login_ms = non_login_times
.iter()
.map(Duration::as_secs_f64)
.sum::<f64>()
* 1000.0
/ iterations as f64;
println!(
"{command:12} login: {avg_login_ms:>8.3} ms | non-login: {avg_non_login_ms:>8.3} ms"
);
}
Ok(())
}

View File

@@ -289,6 +289,8 @@ pub(crate) struct TurnContext {
pub(crate) approval_policy: AskForApproval,
pub(crate) sandbox_policy: SandboxPolicy,
pub(crate) shell_environment_policy: ShellEnvironmentPolicy,
pub(crate) non_login_shell_allowlist: Vec<String>,
pub(crate) non_login_shell_heuristic_enabled: bool,
pub(crate) tools_config: ToolsConfig,
pub(crate) final_output_json_schema: Option<Value>,
pub(crate) codex_linux_sandbox_exe: Option<PathBuf>,
@@ -405,6 +407,7 @@ impl Session {
) -> TurnContext {
let config = session_configuration.original_config_do_not_use.clone();
let features = &config.features;
let non_login_shell_heuristic_enabled = features.enabled(Feature::NonLoginShellHeuristic);
let model_family = models_manager.construct_model_family(&session_configuration.model);
let mut per_turn_config = (*config).clone();
per_turn_config.model = session_configuration.model.clone();
@@ -448,6 +451,8 @@ impl Session {
approval_policy: session_configuration.approval_policy,
sandbox_policy: session_configuration.sandbox_policy.clone(),
shell_environment_policy: config.shell_environment_policy.clone(),
non_login_shell_allowlist: config.non_login_shell_allowlist.clone(),
non_login_shell_heuristic_enabled,
tools_config,
final_output_json_schema: None,
codex_linux_sandbox_exe: config.codex_linux_sandbox_exe.clone(),
@@ -1888,6 +1893,8 @@ async fn spawn_review_thread(
approval_policy: parent_turn_context.approval_policy,
sandbox_policy: parent_turn_context.sandbox_policy.clone(),
shell_environment_policy: parent_turn_context.shell_environment_policy.clone(),
non_login_shell_allowlist: parent_turn_context.non_login_shell_allowlist.clone(),
non_login_shell_heuristic_enabled: parent_turn_context.non_login_shell_heuristic_enabled,
cwd: parent_turn_context.cwd.clone(),
final_output_json_schema: None,
codex_linux_sandbox_exe: parent_turn_context.codex_linux_sandbox_exe.clone(),

View File

@@ -1,6 +1,11 @@
use crate::bash::parse_shell_lc_plain_commands;
use crate::command_safety::windows_safe_commands::is_safe_command_windows;
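/// Commands eligible to skip a login shell when the `non_login_shell_heuristic`
/// feature is enabled; user entries from `[shell].non_login_allowlist` are merged
/// on top of this default list.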
pub const DEFAULT_NON_LOGIN_SHELL_ALLOWLIST: &[&str] = &[
"cat", "cd", "echo", "false", "find", "git", "grep", "head", "ls", "nl", "pwd", "rg", "rm",
"rmdir", "sed", "tail", "true", "wc",
];
pub fn is_known_safe_command(command: &[String]) -> bool {
let command: Vec<String> = command
.iter()
@@ -43,6 +48,20 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
return false;
};
if cmd0 == "rm" {
return command
.iter()
.skip(1)
.all(|arg| !arg.starts_with('-') && !arg.starts_with("--"));
}
if cmd0 == "rmdir" {
return command
.iter()
.skip(1)
.all(|arg| !arg.starts_with('-') && !arg.starts_with("--"));
}
match std::path::Path::new(&cmd0)
.file_name()
.and_then(|osstr| osstr.to_str())
@@ -60,8 +79,7 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
"pwd" |
"tail" |
"true" |
"wc" |
"which") => {
"wc") => {
true
},
@@ -113,9 +131,6 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
Some("branch" | "status" | "log" | "diff" | "show")
),
// Rust
Some("cargo") if command.get(1).map(String::as_str) == Some("check") => true,
// Special-case `sed -n {N|M,N}p`
Some("sed")
if {
@@ -267,6 +282,17 @@ mod tests {
}
}
#[test]
fn rm_rules() {
assert!(is_safe_to_call_with_exec(&vec_str(&["rm", "file.txt"])));
assert!(!is_safe_to_call_with_exec(&vec_str(&["rm", "-rf", "/"])));
assert!(is_safe_to_call_with_exec(&vec_str(&["rmdir", "tmp_dir"])));
assert!(!is_safe_to_call_with_exec(&vec_str(&[
"rmdir", "-p", "tmp_dir"
])));
}
#[test]
fn windows_powershell_full_path_is_safe() {
if !cfg!(windows) {
@@ -313,6 +339,16 @@ mod tests {
])));
}
#[test]
fn bash_c_safe_examples() {
assert!(is_known_safe_command(&vec_str(&["bash", "-c", "ls"])));
assert!(is_known_safe_command(&vec_str(&[
"bash",
"-c",
"git status"
])));
}
#[test]
fn bash_lc_safe_examples_with_operators() {
assert!(is_known_safe_command(&vec_str(&[

View File

@@ -48,12 +48,14 @@ use dunce::canonicalize;
use serde::Deserialize;
use similar::DiffableStr;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::io::ErrorKind;
use std::path::Path;
use std::path::PathBuf;
use crate::config::profile::ConfigProfile;
use crate::is_safe_command::DEFAULT_NON_LOGIN_SHELL_ALLOWLIST;
use toml::Value as TomlValue;
use toml_edit::DocumentMut;
@@ -109,6 +111,9 @@ pub struct Config {
pub forced_auto_mode_downgraded_on_windows: bool,
pub shell_environment_policy: ShellEnvironmentPolicy,
/// Commands considered safe to run without a login shell when the heuristic
/// feature flag is enabled.
pub non_login_shell_allowlist: Vec<String>,
/// When `true`, `AgentReasoning` events emitted by the backend will be
/// suppressed from the frontend output. This can reduce visual noise when
@@ -584,6 +589,10 @@ pub struct ConfigToml {
#[serde(default)]
pub shell_environment_policy: ShellEnvironmentPolicyToml,
/// Shell-specific configuration.
#[serde(default)]
pub shell: Option<crate::config::types::ShellConfigToml>,
/// Sandbox mode to use.
pub sandbox_mode: Option<SandboxMode>,
@@ -998,6 +1007,24 @@ impl Config {
crate::safety::set_windows_sandbox_enabled(features.enabled(Feature::WindowsSandbox));
}
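// Merge the built-in allow-list with any user-provided `[shell].non_login_allowlist`
// entries, trimming whitespace and de-duplicating via a BTreeSet.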
let non_login_shell_allowlist: Vec<String> = {
let mut merged = BTreeSet::new();
for cmd in DEFAULT_NON_LOGIN_SHELL_ALLOWLIST {
merged.insert((*cmd).to_string());
}
if let Some(shell_cfg) = &cfg.shell
&& let Some(user_allowlist) = shell_cfg.non_login_allowlist.as_ref()
{
for cmd in user_allowlist {
let trimmed = cmd.trim();
if !trimmed.is_empty() {
merged.insert(trimmed.to_string());
}
}
}
merged.into_iter().collect()
};
let resolved_cwd = {
use std::env;
@@ -1183,6 +1210,7 @@ impl Config {
did_user_set_custom_approval_policy_or_sandbox_mode,
forced_auto_mode_downgraded_on_windows,
shell_environment_policy,
non_login_shell_allowlist,
notify: cfg.notify,
user_instructions,
base_instructions,
@@ -2963,6 +2991,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),
@@ -3037,6 +3069,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),
@@ -3126,6 +3162,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),
@@ -3201,6 +3241,10 @@ model_verbosity = "high"
did_user_set_custom_approval_policy_or_sandbox_mode: true,
forced_auto_mode_downgraded_on_windows: false,
shell_environment_policy: ShellEnvironmentPolicy::default(),
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
.iter()
.map(std::string::ToString::to_string)
.collect(),
user_instructions: None,
notify: None,
cwd: fixture.cwd(),

View File

@@ -459,6 +459,15 @@ pub struct ShellEnvironmentPolicyToml {
pub experimental_use_profile: Option<bool>,
}
#[derive(Deserialize, Debug, Clone, Default, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub struct ShellConfigToml {
/// Additional commands that can skip login shells when the
/// `non_login_shell_heuristic` feature flag is enabled.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub non_login_allowlist: Option<Vec<String>>,
}
pub type EnvironmentVariablePattern = WildMatchPattern<'*', '?'>;
/// Deriving the `env` based on this policy works as follows:

View File

@@ -55,6 +55,8 @@ pub enum Feature {
Skills,
/// Send warnings to the model to correct it on the tool usage.
ModelWarnings,
/// Skip login shell for known-safe commands when enabled.
NonLoginShellHeuristic,
}
impl Feature {
@@ -342,4 +344,10 @@ pub const FEATURES: &[FeatureSpec] = &[
stage: Stage::Experimental,
default_enabled: false,
},
FeatureSpec {
id: Feature::NonLoginShellHeuristic,
key: "non_login_shell_heuristic",
stage: Stage::Experimental,
default_enabled: false,
},
];

View File

@@ -1,5 +1,8 @@
use crate::is_safe_command::is_known_safe_command;
use serde::Deserialize;
use serde::Serialize;
use shlex::split as shlex_split;
use std::path::Path;
use std::path::PathBuf;
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
@@ -60,6 +63,51 @@ impl Shell {
}
}
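/// Decide whether a shell command should be run with a login shell (`-l`).
/// The login shell is skipped when the caller did not request one, or when the
/// heuristic is enabled, the shell is bash/zsh/sh, the command's first token is
/// on the allow-list, and the derived command is known-safe; otherwise the
/// requested behavior is preserved.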
pub fn should_use_login_shell(
requested_login_shell: bool,
command: &str,
shell: &Shell,
heuristic_enabled: bool,
allowlist: &[String],
) -> bool {
if !requested_login_shell {
return false;
}
if !heuristic_enabled {
return true;
}
if !matches!(
shell.shell_type,
ShellType::Bash | ShellType::Zsh | ShellType::Sh
) {
return requested_login_shell;
}
let is_allowlisted = shlex_split(command)
.and_then(|tokens| {
tokens.first().map(|token| {
let token_name = Path::new(token)
.file_name()
.and_then(|name| name.to_str())
.unwrap_or(token.as_str());
allowlist
.iter()
.any(|cmd| cmd == token || cmd == token_name)
})
})
.unwrap_or(false);
if !is_allowlisted {
return true;
}
let derived = shell.derive_exec_args(command, false);
if is_known_safe_command(&derived) {
return false;
}
true
}
#[cfg(unix)]
fn get_user_shell_path() -> Option<PathBuf> {
use libc::getpwuid;
@@ -359,7 +407,8 @@ mod tests {
assert!(
shell_path == PathBuf::from("/bin/bash")
|| shell_path == PathBuf::from("/usr/bin/bash")
|| shell_path == PathBuf::from("/usr/local/bin/bash"),
|| shell_path == PathBuf::from("/usr/local/bin/bash")
|| shell_path == PathBuf::from("/opt/homebrew/bin/bash"),
"shell path: {shell_path:?}",
);
}
@@ -451,4 +500,23 @@ mod tests {
assert!(shell_path.ends_with("pwsh.exe") || shell_path.ends_with("powershell.exe"));
}
#[test]
fn non_login_shell_heuristic_respects_allowlist() {
let shell = Shell {
shell_type: ShellType::Bash,
shell_path: PathBuf::from("/bin/bash"),
};
let allowlist = vec!["ls".to_string()];
assert!(!should_use_login_shell(
true, "ls -la", &shell, true, &allowlist
));
assert!(should_use_login_shell(
true,
"awk '{print $1}' file.txt",
&shell,
true,
&allowlist
));
}
}

View File

@@ -24,6 +24,7 @@ use crate::protocol::ExecCommandSource;
use crate::protocol::SandboxPolicy;
use crate::protocol::TaskStartedEvent;
use crate::sandboxing::ExecEnv;
use crate::shell::should_use_login_shell;
use crate::state::TaskKind;
use crate::tools::format_exec_output_str;
use crate::user_shell_command::user_shell_command_record_item;
@@ -66,7 +67,13 @@ impl SessionTask for UserShellCommandTask {
// Execute the user's script under their default shell when known; this
// allows commands that use shell features (pipes, &&, redirects, etc.).
// We do not source rc files or otherwise reformat the script.
let use_login_shell = true;
let use_login_shell = should_use_login_shell(
true,
&self.command,
session.user_shell(),
turn_context.non_login_shell_heuristic_enabled,
&turn_context.non_login_shell_allowlist,
);
let command = session
.user_shell()
.derive_exec_args(&self.command, use_login_shell);

View File

@@ -11,6 +11,7 @@ use crate::function_tool::FunctionCallError;
use crate::is_safe_command::is_known_safe_command;
use crate::protocol::ExecCommandSource;
use crate::sandboxing::SandboxPermissions;
use crate::shell::should_use_login_shell;
use crate::tools::context::ToolInvocation;
use crate::tools::context::ToolOutput;
use crate::tools::context::ToolPayload;
@@ -49,7 +50,13 @@ impl ShellCommandHandler {
turn_context: &TurnContext,
) -> ExecParams {
let shell = session.user_shell();
let use_login_shell = true;
let use_login_shell = should_use_login_shell(
true,
&params.command,
shell,
turn_context.non_login_shell_heuristic_enabled,
&turn_context.non_login_shell_allowlist,
);
let command = shell.derive_exec_args(&params.command, use_login_shell);
ExecParams {

View File

@@ -8,6 +8,7 @@ use crate::protocol::ExecCommandSource;
use crate::protocol::ExecOutputStream;
use crate::shell::default_user_shell;
use crate::shell::get_shell_by_model_provided_path;
use crate::shell::should_use_login_shell;
use crate::tools::context::ToolInvocation;
use crate::tools::context::ToolOutput;
use crate::tools::context::ToolPayload;
@@ -93,7 +94,15 @@ impl ToolHandler for UnifiedExecHandler {
let Ok(params) = serde_json::from_str::<ExecCommandArgs>(arguments) else {
return true;
};
let command = get_command(&params);
let shell = resolve_shell(&params);
let use_login_shell = should_use_login_shell(
params.login,
&params.cmd,
&shell,
invocation.turn.non_login_shell_heuristic_enabled,
&invocation.turn.non_login_shell_allowlist,
);
let command = shell.derive_exec_args(&params.cmd, use_login_shell);
!is_known_safe_command(&command)
}
@@ -130,7 +139,15 @@ impl ToolHandler for UnifiedExecHandler {
})?;
let process_id = manager.allocate_process_id().await;
let command = get_command(&args);
let shell = resolve_shell(&args);
let use_login_shell = should_use_login_shell(
args.login,
&args.cmd,
&shell,
turn.non_login_shell_heuristic_enabled,
&turn.non_login_shell_allowlist,
);
let command = shell.derive_exec_args(&args.cmd, use_login_shell);
let ExecCommandArgs {
workdir,
yield_time_ms,
@@ -253,14 +270,18 @@ impl ToolHandler for UnifiedExecHandler {
}
}
#[cfg(test)]
fn get_command(args: &ExecCommandArgs) -> Vec<String> {
let shell = if let Some(shell_str) = &args.shell {
let shell = resolve_shell(args);
shell.derive_exec_args(&args.cmd, args.login)
}
fn resolve_shell(args: &ExecCommandArgs) -> crate::shell::Shell {
if let Some(shell_str) = &args.shell {
get_shell_by_model_provided_path(&PathBuf::from(shell_str))
} else {
default_user_shell()
};
shell.derive_exec_args(&args.cmd, args.login)
}
}
fn format_response(response: &UnifiedExecResponse) -> String {

View File

@@ -0,0 +1,615 @@
#!/usr/bin/env python3
"""
Run codex exec end-to-end with and without the non_login_shell_heuristic feature.
This script repeatedly runs:
- codex exec with the heuristic disabled
- codex exec with the heuristic enabled
for one or more prompts and reports per-run wall times, per-prompt summaries,
and combined summary statistics.
Runs can be parallelized (concurrency > 1) to speed up sampling, but note that
parallel rollouts may contend for local/remote resources and slightly skew
latency compared to strictly serial runs.
Usage:
python scripts/bench_codex_exec_non_login.py \
--iterations 5 \
--prompt "Read and summarize codex_berry" \
--prompt "Explain the architecture" \
--workdir /path/to/repo \
--model gpt-5.1-codex-max \
--reasoning-effort high \
--concurrency 2 \
--codex-bin /path/to/codex \
--skip-feature-toggle # when benchmarking an older codex binary
Notes:
- Runs will incur network/LLM variance; prefer N >= 5–10.
- Requires the `codex` binary (local path supported) and valid credentials.
- Results are printed in seconds; failures are logged and included in the output.
"""
import argparse
import asyncio
import json
import math
import statistics
import sys
import time
from dataclasses import dataclass
from typing import List
from typing import Optional
@dataclass
class RunResult:
duration: float
exit_code: int
stderr: str
command_time: float
command_count: int
mcp_call_count: int
usage: Optional["Usage"]
@dataclass
class Usage:
input_tokens: int
cached_input_tokens: int
output_tokens: int
async def run_once(
base_cmd: List[str],
prompt: str,
feature_enabled: bool,
toggle_feature: bool,
sem: asyncio.Semaphore,
) -> RunResult:
cmd = list(base_cmd)
if toggle_feature:
cmd.extend(
[
"-c",
f"features.non_login_shell_heuristic={'true' if feature_enabled else 'false'}",
]
)
cmd.append(prompt)
async with sem:
start = time.perf_counter()
proc = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
)
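# Parse the streamed `--json` events: time each command_execution item from
# item.started to item.completed, count MCP tool calls, and capture token usage
# from turn.completed.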
async def read_stdout() -> tuple[float, int, int, Optional[Usage]]:
command_start_times: dict[str, float] = {}
command_time = 0.0
command_count = 0
mcp_call_count = 0
usage: Optional[Usage] = None
buffer = b""
def handle_line(raw_line: bytes) -> None:
nonlocal command_time, command_count, mcp_call_count, usage
ts = time.perf_counter()
line = raw_line.decode(errors="replace").strip()
if not line:
return
try:
event = json.loads(line)
except json.JSONDecodeError:
return
event_type = event.get("type")
if event_type == "turn.completed" and (usage_info := event.get("usage")):
usage = Usage(
input_tokens=usage_info.get("input_tokens", 0),
cached_input_tokens=usage_info.get("cached_input_tokens", 0),
output_tokens=usage_info.get("output_tokens", 0),
)
if not event_type or not event_type.startswith("item."):
return
item = event.get("item") or {}
item_type = item.get("type")
item_id = item.get("id")
if item_type == "command_execution":
if event_type == "item.started" and item_id:
command_start_times[item_id] = ts
command_count += 1
elif event_type == "item.completed" and item_id:
if start_ts := command_start_times.pop(item_id, None):
command_time += ts - start_ts
elif item_type == "mcp_tool_call":
if event_type == "item.started":
mcp_call_count += 1
assert proc.stdout is not None
try:
while True:
chunk = await proc.stdout.read(4096)
if not chunk:
break
buffer += chunk
lines = buffer.split(b"\n")
buffer = lines.pop() if lines else b""
for raw_line in lines:
handle_line(raw_line)
except asyncio.CancelledError:
# Allow cancellation to propagate but keep what we have so far.
raise
finally:
if buffer:
handle_line(buffer)
return command_time, command_count, mcp_call_count, usage
stdout_task = asyncio.create_task(read_stdout())
stderr_task = (
asyncio.create_task(proc.stderr.read()) if proc.stderr is not None else None
)
await proc.wait()
stderr_bytes = await stderr_task if stderr_task is not None else b""
command_time, command_count, mcp_call_count, usage = await stdout_task
duration = time.perf_counter() - start
stderr_text = stderr_bytes.decode(errors="replace").strip() if stderr_bytes else ""
return RunResult(
duration=duration,
exit_code=proc.returncode,
stderr=stderr_text,
command_time=command_time,
command_count=command_count,
mcp_call_count=mcp_call_count,
usage=usage,
)
async def run_case(
label: str,
base_cmd: List[str],
prompt: str,
iterations: int,
feature_enabled: bool,
toggle_feature: bool,
concurrency: int,
) -> tuple[
list[float], list[float], list[float], list[int], list[int], list[Usage], int
]:
sem = asyncio.Semaphore(concurrency)
tasks = [
asyncio.create_task(
run_once(base_cmd, prompt, feature_enabled, toggle_feature, sem)
)
for _ in range(iterations)
]
durations: List[float] = []
command_times: List[float] = []
command_times_per_cmd: List[float] = []
command_counts: List[int] = []
mcp_call_counts: List[int] = []
usages: List[Usage] = []
failures = 0
for idx, task in enumerate(asyncio.as_completed(tasks), start=1):
result = await task
durations.append(result.duration)
command_times.append(result.command_time)
if result.command_count > 0:
command_times_per_cmd.append(result.command_time / result.command_count)
command_counts.append(result.command_count)
mcp_call_counts.append(result.mcp_call_count)
if result.usage:
usages.append(result.usage)
status = "ok" if result.exit_code == 0 else f"fail ({result.exit_code})"
cmd_time_str = f" cmd_time={result.command_time:.3f}s cmds={result.command_count}"
mcp_str = f" mcp_calls={result.mcp_call_count}" if result.mcp_call_count else ""
print(
f"[{label}] run {idx}/{iterations}: {result.duration:.3f}s"
f" [{status}]{cmd_time_str}{mcp_str}"
)
if result.exit_code != 0:
failures += 1
if result.stderr:
print(f" stderr: {result.stderr}", file=sys.stderr)
return (
durations,
command_times,
command_times_per_cmd,
command_counts,
mcp_call_counts,
usages,
failures,
)
def summarize(label: str, durations: List[float]) -> None:
if not durations:
print(f"[{label}] no runs recorded")
return
mean = statistics.mean(durations)
median = statistics.median(durations)
if len(durations) < 2:
p95 = durations[0]
else:
p95 = statistics.quantiles(durations, n=100)[94]
margin = confidence_margin(durations)
print(
f"[{label}] n={len(durations)} "
f"mean={mean:.3f}{margin:.3f}s median={median:.3f}s p95={p95:.3f}s"
)
def summarize_command_time(label: str, command_times: List[float], command_counts: List[int]) -> None:
if not command_times:
print(f"[{label}] command time: no runs recorded")
return
mean = statistics.mean(command_times)
median = statistics.median(command_times)
p95 = command_times[0] if len(command_times) < 2 else statistics.quantiles(command_times, n=100)[94]
avg_cmds = statistics.mean(command_counts) if command_counts else 0
margin = confidence_margin(command_times)
print(
f"[{label}] command time n={len(command_times)} "
f"mean={mean:.3f}{margin:.3f}s median={median:.3f}s p95={p95:.3f}s avg_cmds={avg_cmds:.2f}"
)
def summarize_command_time_per_command(label: str, command_times_per_cmd: List[float]) -> None:
if not command_times_per_cmd:
print(f"[{label}] command time per command: no runs recorded")
return
mean = statistics.mean(command_times_per_cmd)
median = statistics.median(command_times_per_cmd)
p95 = (
command_times_per_cmd[0]
if len(command_times_per_cmd) < 2
else statistics.quantiles(command_times_per_cmd, n=100)[94]
)
margin = confidence_margin(command_times_per_cmd)
print(
f"[{label}] command time per command n={len(command_times_per_cmd)} "
f"mean={mean:.3f}{margin:.3f}s median={median:.3f}s p95={p95:.3f}s"
)
def summarize_usage(label: str, usages: List[Usage]) -> None:
if not usages:
print(f"[{label}] tokens: no runs recorded")
return
avg_input = statistics.mean(u.input_tokens for u in usages)
avg_cached = statistics.mean(u.cached_input_tokens for u in usages)
avg_output = statistics.mean(u.output_tokens for u in usages)
print(
f"[{label}] tokens avg input={avg_input:.1f} cached={avg_cached:.1f} output={avg_output:.1f}"
)
def summarize_mcp_calls(label: str, mcp_counts: List[int]) -> None:
if not mcp_counts:
print(f"[{label}] mcp calls: no runs recorded")
return
avg_mcp = statistics.mean(mcp_counts)
max_mcp = max(mcp_counts)
print(f"[{label}] mcp calls avg={avg_mcp:.2f} max={max_mcp}")
def format_prompt_label(prompt: str, idx: int) -> str:
snippet = prompt.strip()
if len(snippet) > 60:
snippet = snippet[:57] + "..."
return f"prompt {idx + 1}: {snippet}"
def confidence_margin(values: List[float]) -> float:
if len(values) < 2:
return 0.0
stdev = statistics.stdev(values)
return 1.96 * stdev / math.sqrt(len(values))
def stats(values: List[float]) -> dict:
if not values:
return {
"n": 0,
"mean": None,
"median": None,
"p95": None,
"margin": None,
}
mean = statistics.mean(values)
median = statistics.median(values)
if len(values) < 2:
p95 = values[0]
else:
p95 = statistics.quantiles(values, n=100)[94]
margin = confidence_margin(values)
return {"n": len(values), "mean": mean, "median": median, "p95": p95, "margin": margin}
def safe_avg(values: List[float]) -> Optional[float]:
return statistics.mean(values) if values else None
def fmt_stat(s: dict) -> str:
if not s["n"]:
return "-"
return f"{s['mean']:.3f}±{s['margin']:.3f}"
def fmt_avg(value: Optional[float]) -> str:
return f"{value:.2f}" if value is not None else "-"
def fmt_int(value: Optional[int]) -> str:
return str(value) if value is not None else "-"
def render_table(headers: List[str], rows: List[List[str]]) -> None:
widths = [len(h) for h in headers]
for row in rows:
for i, cell in enumerate(row):
widths[i] = max(widths[i], len(cell))
fmt = " ".join(f"{{:<{w}}}" for w in widths)
print(fmt.format(*headers))
print(fmt.format(*["-" * w for w in widths]))
for row in rows:
print(fmt.format(*row))
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Benchmark codex exec with/without non_login_shell_heuristic."
)
parser.add_argument(
"--codex-bin",
default="codex",
help="Path to the codex binary (default: codex on PATH).",
)
parser.add_argument(
"--iterations",
type=int,
default=5,
help="Number of runs per configuration (default: 5)",
)
parser.add_argument(
"--prompt",
dest="prompts",
action="append",
required=True,
help="Prompt to send to codex exec (repeat for multiple prompts).",
)
parser.add_argument(
"--workdir",
default=None,
help="Working directory for codex exec (--cd); defaults to current dir.",
)
parser.add_argument(
"--model",
default=None,
help="Optional model override passed to codex exec (--model).",
)
parser.add_argument(
"--reasoning-effort",
choices=["none", "minimal", "low", "medium", "high", "xhigh"],
default=None,
help="Optional reasoning effort override passed via -c model_reasoning_effort=...",
)
parser.add_argument(
"--extra-config",
action="append",
default=[],
help="Additional -c overrides passed to codex exec (repeatable).",
)
parser.add_argument(
"--concurrency",
type=int,
default=1,
help="Number of concurrent runs to launch per configuration (default: 1).",
)
parser.add_argument(
"--skip-feature-toggle",
action="store_true",
help=(
"Do not inject the non_login_shell_heuristic flag; "
"use when benchmarking an older codex binary."
),
)
parser.add_argument(
"--verbose",
action="store_true",
help="Print codex exec stdout/stderr for debugging.",
)
return parser.parse_args()
def main() -> int:
args = parse_args()
base_cmd: List[str] = [args.codex_bin, "exec"]
if args.workdir:
base_cmd.extend(["--cd", args.workdir])
if args.model:
base_cmd.extend(["--model", args.model])
if args.reasoning_effort:
base_cmd.extend(["-c", f"model_reasoning_effort={args.reasoning_effort}"])
for override in args.extra_config:
base_cmd.extend(["-c", override])
base_cmd.append("--json")
toggle_feature = not args.skip_feature_toggle
prompts = args.prompts
print(
f"Running {args.iterations} iterations per config for {len(prompts)} prompt(s)..."
)
all_off_durations: List[float] = []
all_on_durations: List[float] = []
all_off_command_times: List[float] = []
all_on_command_times: List[float] = []
all_off_command_times_per_cmd: List[float] = []
all_on_command_times_per_cmd: List[float] = []
all_off_command_counts: List[int] = []
all_on_command_counts: List[int] = []
all_off_mcp_calls: List[int] = []
all_on_mcp_calls: List[int] = []
all_off_usages: List[Usage] = []
all_on_usages: List[Usage] = []
total_off_failures = 0
total_on_failures = 0
per_prompt_results: list[dict] = []
for idx, prompt in enumerate(prompts):
label = format_prompt_label(prompt, idx)
print(f"\nPrompt {idx + 1}/{len(prompts)}: {label}")
(
off_durations,
off_cmd_times,
off_cmd_times_per_cmd,
off_cmd_counts,
off_mcp_calls,
off_usages,
off_failures,
) = asyncio.run(
run_case(
f"login-shell | {label}",
base_cmd,
prompt,
args.iterations,
feature_enabled=False,
toggle_feature=toggle_feature,
concurrency=args.concurrency,
)
)
(
on_durations,
on_cmd_times,
on_cmd_times_per_cmd,
on_cmd_counts,
on_mcp_calls,
on_usages,
on_failures,
) = asyncio.run(
run_case(
f"non-login-shell | {label}",
base_cmd,
prompt,
args.iterations,
feature_enabled=True,
toggle_feature=toggle_feature,
concurrency=args.concurrency,
)
)
per_prompt_results.append(
{
"label": label,
"off_durations": off_durations,
"on_durations": on_durations,
"off_cmd_times": off_cmd_times,
"on_cmd_times": on_cmd_times,
"off_cmd_times_per_cmd": off_cmd_times_per_cmd,
"on_cmd_times_per_cmd": on_cmd_times_per_cmd,
"off_cmd_counts": off_cmd_counts,
"on_cmd_counts": on_cmd_counts,
"off_mcp_calls": off_mcp_calls,
"on_mcp_calls": on_mcp_calls,
"off_usages": off_usages,
"on_usages": on_usages,
"off_failures": off_failures,
"on_failures": on_failures,
}
)
all_off_durations.extend(off_durations)
all_on_durations.extend(on_durations)
all_off_command_times.extend(off_cmd_times)
all_on_command_times.extend(on_cmd_times)
all_off_command_times_per_cmd.extend(off_cmd_times_per_cmd)
all_on_command_times_per_cmd.extend(on_cmd_times_per_cmd)
all_off_command_counts.extend(off_cmd_counts)
all_on_command_counts.extend(on_cmd_counts)
all_off_mcp_calls.extend(off_mcp_calls)
all_on_mcp_calls.extend(on_mcp_calls)
all_off_usages.extend(off_usages)
all_on_usages.extend(on_usages)
total_off_failures += off_failures
total_on_failures += on_failures
print("\nPer-prompt summary (means ±95% CI):")
per_prompt_rows: List[List[str]] = []
for s in per_prompt_results:
per_prompt_rows.append(
[
s["label"],
fmt_stat(stats(s["off_durations"])),
fmt_stat(stats(s["on_durations"])),
fmt_stat(stats(s["off_cmd_times"])),
fmt_stat(stats(s["on_cmd_times"])),
fmt_stat(stats(s["off_cmd_times_per_cmd"])),
fmt_stat(stats(s["on_cmd_times_per_cmd"])),
fmt_avg(safe_avg(s["off_cmd_counts"])),
fmt_avg(safe_avg(s["on_cmd_counts"])),
fmt_avg(safe_avg(s["off_mcp_calls"])),
fmt_avg(safe_avg(s["on_mcp_calls"])),
]
)
render_table(
[
"prompt",
"wall login",
"wall non",
"cmd login",
"cmd non",
"cmd/call login",
"cmd/call non",
"avg cmds login",
"avg cmds non",
"avg mcp login",
"avg mcp non",
],
per_prompt_rows,
)
print("\nCombined summary (means ±95% CI):")
combined_rows = [
[
"all prompts",
fmt_stat(stats(all_off_durations)),
fmt_stat(stats(all_on_durations)),
fmt_stat(stats(all_off_command_times)),
fmt_stat(stats(all_on_command_times)),
fmt_stat(stats(all_off_command_times_per_cmd)),
fmt_stat(stats(all_on_command_times_per_cmd)),
fmt_avg(safe_avg(all_off_command_counts)),
fmt_avg(safe_avg(all_on_command_counts)),
fmt_avg(safe_avg(all_off_mcp_calls)),
fmt_avg(safe_avg(all_on_mcp_calls)),
]
]
render_table(
[
"scope",
"wall login",
"wall non",
"cmd login",
"cmd non",
"cmd/call login",
"cmd/call non",
"avg cmds login",
"avg cmds non",
"avg mcp login",
"avg mcp non",
],
combined_rows,
)
print(f"\nlogin-shell failures (all prompts): {total_off_failures}")
print(f"non-login-shell failures (all prompts): {total_on_failures}")
return 0 if (total_off_failures + total_on_failures) == 0 else 1
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -48,6 +48,7 @@ Supported features:
| `web_search_request` | false | Stable | Allow the model to issue web searches |
| `experimental_sandbox_command_assessment` | false | Experimental | Enable model-based sandbox risk assessment |
| `ghost_commit` | false | Experimental | Create a ghost commit each turn |
| `non_login_shell_heuristic` | false | Experimental | Skip login shells for allow-listed safe commands |
| `enable_experimental_windows_sandbox` | false | Experimental | Use the Windows restricted-token sandbox |
Notes:
@@ -416,6 +417,28 @@ set = { PATH = "/usr/bin", MY_FLAG = "1" }
Currently, `CODEX_SANDBOX_NETWORK_DISABLED=1` is also added to the environment, assuming network is disabled. This is not configurable.
### shell.non_login_allowlist
When the `non_login_shell_heuristic` feature flag is enabled, Codex will skip `-l`
for obviously safe commands (e.g., `ls`, `rg --files`) to avoid loading login
profiles on every exec. You can extend the allow-list via `config.toml`:
```toml
[shell]
non_login_allowlist = ["ls", "rg", "git", "cat", "rm", "rmdir"]
```
The defaults cover common low-risk commands (`ls`, `cat`, `rg`, `git status`,
`sed -n`, plain `rm <file>`, `rmdir <dir>`, etc.); entries you add are merged on top.
The heuristic only applies when the feature flag is enabled, and it keeps using
login shells for commands that are not known-safe.
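Roughly, with the default allow-list the decision behaves like this (an illustrative
sketch drawn from the heuristic's tests, not an exhaustive spec):

```text
ls -la                      # allow-listed and known-safe       -> non-login shell
git status                  # allow-listed and known-safe       -> non-login shell
rm -rf build                # `rm` with flags is not known-safe -> login shell
awk '{print $1}' file.txt   # `awk` is not on the allow-list    -> login shell
```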
You can compare login vs non-login startup costs locally via:
```bash
cargo run -p codex-core --example non_login_shell_bench -- --iterations 5
```
## MCP integration
### mcp_servers