mirror of
https://github.com/openai/codex.git
synced 2026-02-01 22:47:52 +00:00
Compare commits
2 Commits
rust-v0.68
...
safe-calls
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9089c1aaec | ||
|
|
6a88b0f465 |
80
codex-rs/core/examples/non_login_shell_bench.rs
Normal file
80
codex-rs/core/examples/non_login_shell_bench.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use codex_core::exec::DEFAULT_EXEC_COMMAND_TIMEOUT_MS;
|
||||
use codex_core::exec::ExecExpiration;
|
||||
use codex_core::exec::ExecParams;
|
||||
use codex_core::exec::process_exec_tool_call;
|
||||
use codex_core::protocol::SandboxPolicy;
|
||||
use codex_core::shell::default_user_shell;
|
||||
|
||||
/// Reads the iteration count from the process arguments.
///
/// Every `--iterations <N>` pair is honoured in order, so the last pair with
/// a valid `usize` wins. A value that fails to parse is ignored, and a parsed
/// value of `0` is raised to `1`. Returns `5` when no valid pair is present.
fn parse_iterations() -> usize {
    // Skip argv[0] (the program name).
    let mut cli = std::env::args().skip(1);
    let mut iterations = 5usize;
    // `any` consumes arguments up to and including the next `--iterations`
    // flag, leaving the iterator positioned on the flag's value.
    while cli.any(|arg| arg == "--iterations") {
        let Some(raw) = cli.next() else {
            // Flag was the final argument: nothing left to read.
            break;
        };
        if let Ok(parsed) = raw.parse::<usize>() {
            iterations = parsed.max(1);
        }
    }
    iterations
}
|
||||
|
||||
async fn measure(command: &str, use_login_shell: bool) -> anyhow::Result<Duration> {
|
||||
let cwd = std::env::current_dir()?;
|
||||
let env: HashMap<String, String> = std::env::vars().collect();
|
||||
let shell = default_user_shell();
|
||||
let args = shell.derive_exec_args(command, use_login_shell);
|
||||
let params = ExecParams {
|
||||
command: args,
|
||||
cwd,
|
||||
expiration: ExecExpiration::from(DEFAULT_EXEC_COMMAND_TIMEOUT_MS),
|
||||
env,
|
||||
with_escalated_permissions: None,
|
||||
justification: None,
|
||||
arg0: None,
|
||||
};
|
||||
let start = Instant::now();
|
||||
let _ = process_exec_tool_call(
|
||||
params,
|
||||
&SandboxPolicy::DangerFullAccess,
|
||||
&std::env::current_dir()?,
|
||||
&None,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
Ok(start.elapsed())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let iterations = parse_iterations();
|
||||
let commands = ["ls", "rg --files", "git status"];
|
||||
|
||||
println!("Non-login shell heuristic benchmark (iterations: {iterations})");
|
||||
for command in commands {
|
||||
let mut login_times = Vec::with_capacity(iterations);
|
||||
let mut non_login_times = Vec::with_capacity(iterations);
|
||||
for _ in 0..iterations {
|
||||
login_times.push(measure(command, true).await?);
|
||||
non_login_times.push(measure(command, false).await?);
|
||||
}
|
||||
let avg_login_ms =
|
||||
login_times.iter().map(Duration::as_secs_f64).sum::<f64>() * 1000.0 / iterations as f64;
|
||||
let avg_non_login_ms = non_login_times
|
||||
.iter()
|
||||
.map(Duration::as_secs_f64)
|
||||
.sum::<f64>()
|
||||
* 1000.0
|
||||
/ iterations as f64;
|
||||
println!(
|
||||
"{command:12} login: {avg_login_ms:>8.3} ms | non-login: {avg_non_login_ms:>8.3} ms"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -289,6 +289,8 @@ pub(crate) struct TurnContext {
|
||||
pub(crate) approval_policy: AskForApproval,
|
||||
pub(crate) sandbox_policy: SandboxPolicy,
|
||||
pub(crate) shell_environment_policy: ShellEnvironmentPolicy,
|
||||
pub(crate) non_login_shell_allowlist: Vec<String>,
|
||||
pub(crate) non_login_shell_heuristic_enabled: bool,
|
||||
pub(crate) tools_config: ToolsConfig,
|
||||
pub(crate) final_output_json_schema: Option<Value>,
|
||||
pub(crate) codex_linux_sandbox_exe: Option<PathBuf>,
|
||||
@@ -405,6 +407,7 @@ impl Session {
|
||||
) -> TurnContext {
|
||||
let config = session_configuration.original_config_do_not_use.clone();
|
||||
let features = &config.features;
|
||||
let non_login_shell_heuristic_enabled = features.enabled(Feature::NonLoginShellHeuristic);
|
||||
let model_family = models_manager.construct_model_family(&session_configuration.model);
|
||||
let mut per_turn_config = (*config).clone();
|
||||
per_turn_config.model = session_configuration.model.clone();
|
||||
@@ -448,6 +451,8 @@ impl Session {
|
||||
approval_policy: session_configuration.approval_policy,
|
||||
sandbox_policy: session_configuration.sandbox_policy.clone(),
|
||||
shell_environment_policy: config.shell_environment_policy.clone(),
|
||||
non_login_shell_allowlist: config.non_login_shell_allowlist.clone(),
|
||||
non_login_shell_heuristic_enabled,
|
||||
tools_config,
|
||||
final_output_json_schema: None,
|
||||
codex_linux_sandbox_exe: config.codex_linux_sandbox_exe.clone(),
|
||||
@@ -1888,6 +1893,8 @@ async fn spawn_review_thread(
|
||||
approval_policy: parent_turn_context.approval_policy,
|
||||
sandbox_policy: parent_turn_context.sandbox_policy.clone(),
|
||||
shell_environment_policy: parent_turn_context.shell_environment_policy.clone(),
|
||||
non_login_shell_allowlist: parent_turn_context.non_login_shell_allowlist.clone(),
|
||||
non_login_shell_heuristic_enabled: parent_turn_context.non_login_shell_heuristic_enabled,
|
||||
cwd: parent_turn_context.cwd.clone(),
|
||||
final_output_json_schema: None,
|
||||
codex_linux_sandbox_exe: parent_turn_context.codex_linux_sandbox_exe.clone(),
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
use crate::bash::parse_shell_lc_plain_commands;
|
||||
use crate::command_safety::windows_safe_commands::is_safe_command_windows;
|
||||
|
||||
/// Built-in command names for the non-login shell allowlist.
///
/// Entries are bare program names, kept in ascending lexicographic order.
pub const DEFAULT_NON_LOGIN_SHELL_ALLOWLIST: &[&str] = &[
    "cat", "cd", "echo", "false", "find", "git", "grep", "head", "ls", "nl", "pwd", "rg", "rm",
    "rmdir", "sed", "tail", "true", "wc",
];
|
||||
|
||||
pub fn is_known_safe_command(command: &[String]) -> bool {
|
||||
let command: Vec<String> = command
|
||||
.iter()
|
||||
@@ -43,6 +48,20 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
|
||||
return false;
|
||||
};
|
||||
|
||||
if cmd0 == "rm" {
|
||||
return command
|
||||
.iter()
|
||||
.skip(1)
|
||||
.all(|arg| !arg.starts_with('-') && !arg.starts_with("--"));
|
||||
}
|
||||
|
||||
if cmd0 == "rmdir" {
|
||||
return command
|
||||
.iter()
|
||||
.skip(1)
|
||||
.all(|arg| !arg.starts_with('-') && !arg.starts_with("--"));
|
||||
}
|
||||
|
||||
match std::path::Path::new(&cmd0)
|
||||
.file_name()
|
||||
.and_then(|osstr| osstr.to_str())
|
||||
@@ -60,8 +79,7 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
|
||||
"pwd" |
|
||||
"tail" |
|
||||
"true" |
|
||||
"wc" |
|
||||
"which") => {
|
||||
"wc") => {
|
||||
true
|
||||
},
|
||||
|
||||
@@ -113,9 +131,6 @@ fn is_safe_to_call_with_exec(command: &[String]) -> bool {
|
||||
Some("branch" | "status" | "log" | "diff" | "show")
|
||||
),
|
||||
|
||||
// Rust
|
||||
Some("cargo") if command.get(1).map(String::as_str) == Some("check") => true,
|
||||
|
||||
// Special-case `sed -n {N|M,N}p`
|
||||
Some("sed")
|
||||
if {
|
||||
@@ -267,6 +282,17 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// `rm` and `rmdir` are accepted only when none of their arguments is a flag.
#[test]
fn rm_rules() {
    assert!(is_safe_to_call_with_exec(&vec_str(&["rm", "file.txt"])));
    assert!(!is_safe_to_call_with_exec(&vec_str(&["rm", "-rf", "/"])));

    assert!(is_safe_to_call_with_exec(&vec_str(&["rmdir", "tmp_dir"])));
    assert!(!is_safe_to_call_with_exec(&vec_str(&[
        "rmdir", "-p", "tmp_dir"
    ])));
}
|
||||
|
||||
#[test]
|
||||
fn windows_powershell_full_path_is_safe() {
|
||||
if !cfg!(windows) {
|
||||
@@ -313,6 +339,16 @@ mod tests {
|
||||
])));
|
||||
}
|
||||
|
||||
/// Scripts passed with `bash -c` (no login flag) are recognised as safe when
/// the script itself is a known-safe command.
#[test]
fn bash_c_safe_examples() {
    assert!(is_known_safe_command(&vec_str(&["bash", "-c", "ls"])));
    assert!(is_known_safe_command(&vec_str(&[
        "bash",
        "-c",
        "git status"
    ])));
}
|
||||
|
||||
#[test]
|
||||
fn bash_lc_safe_examples_with_operators() {
|
||||
assert!(is_known_safe_command(&vec_str(&[
|
||||
|
||||
@@ -48,12 +48,14 @@ use dunce::canonicalize;
|
||||
use serde::Deserialize;
|
||||
use similar::DiffableStr;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::io::ErrorKind;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::config::profile::ConfigProfile;
|
||||
use crate::is_safe_command::DEFAULT_NON_LOGIN_SHELL_ALLOWLIST;
|
||||
use toml::Value as TomlValue;
|
||||
use toml_edit::DocumentMut;
|
||||
|
||||
@@ -109,6 +111,9 @@ pub struct Config {
|
||||
pub forced_auto_mode_downgraded_on_windows: bool,
|
||||
|
||||
pub shell_environment_policy: ShellEnvironmentPolicy,
|
||||
/// Commands considered safe to run without a login shell when the heuristic
|
||||
/// feature flag is enabled.
|
||||
pub non_login_shell_allowlist: Vec<String>,
|
||||
|
||||
/// When `true`, `AgentReasoning` events emitted by the backend will be
|
||||
/// suppressed from the frontend output. This can reduce visual noise when
|
||||
@@ -584,6 +589,10 @@ pub struct ConfigToml {
|
||||
#[serde(default)]
|
||||
pub shell_environment_policy: ShellEnvironmentPolicyToml,
|
||||
|
||||
/// Shell-specific configuration.
|
||||
#[serde(default)]
|
||||
pub shell: Option<crate::config::types::ShellConfigToml>,
|
||||
|
||||
/// Sandbox mode to use.
|
||||
pub sandbox_mode: Option<SandboxMode>,
|
||||
|
||||
@@ -998,6 +1007,24 @@ impl Config {
|
||||
crate::safety::set_windows_sandbox_enabled(features.enabled(Feature::WindowsSandbox));
|
||||
}
|
||||
|
||||
let non_login_shell_allowlist: Vec<String> = {
|
||||
let mut merged = BTreeSet::new();
|
||||
for cmd in DEFAULT_NON_LOGIN_SHELL_ALLOWLIST {
|
||||
merged.insert((*cmd).to_string());
|
||||
}
|
||||
if let Some(shell_cfg) = &cfg.shell
|
||||
&& let Some(user_allowlist) = shell_cfg.non_login_allowlist.as_ref()
|
||||
{
|
||||
for cmd in user_allowlist {
|
||||
let trimmed = cmd.trim();
|
||||
if !trimmed.is_empty() {
|
||||
merged.insert(trimmed.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
merged.into_iter().collect()
|
||||
};
|
||||
|
||||
let resolved_cwd = {
|
||||
use std::env;
|
||||
|
||||
@@ -1183,6 +1210,7 @@ impl Config {
|
||||
did_user_set_custom_approval_policy_or_sandbox_mode,
|
||||
forced_auto_mode_downgraded_on_windows,
|
||||
shell_environment_policy,
|
||||
non_login_shell_allowlist,
|
||||
notify: cfg.notify,
|
||||
user_instructions,
|
||||
base_instructions,
|
||||
@@ -2963,6 +2991,10 @@ model_verbosity = "high"
|
||||
did_user_set_custom_approval_policy_or_sandbox_mode: true,
|
||||
forced_auto_mode_downgraded_on_windows: false,
|
||||
shell_environment_policy: ShellEnvironmentPolicy::default(),
|
||||
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect(),
|
||||
user_instructions: None,
|
||||
notify: None,
|
||||
cwd: fixture.cwd(),
|
||||
@@ -3037,6 +3069,10 @@ model_verbosity = "high"
|
||||
did_user_set_custom_approval_policy_or_sandbox_mode: true,
|
||||
forced_auto_mode_downgraded_on_windows: false,
|
||||
shell_environment_policy: ShellEnvironmentPolicy::default(),
|
||||
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect(),
|
||||
user_instructions: None,
|
||||
notify: None,
|
||||
cwd: fixture.cwd(),
|
||||
@@ -3126,6 +3162,10 @@ model_verbosity = "high"
|
||||
did_user_set_custom_approval_policy_or_sandbox_mode: true,
|
||||
forced_auto_mode_downgraded_on_windows: false,
|
||||
shell_environment_policy: ShellEnvironmentPolicy::default(),
|
||||
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect(),
|
||||
user_instructions: None,
|
||||
notify: None,
|
||||
cwd: fixture.cwd(),
|
||||
@@ -3201,6 +3241,10 @@ model_verbosity = "high"
|
||||
did_user_set_custom_approval_policy_or_sandbox_mode: true,
|
||||
forced_auto_mode_downgraded_on_windows: false,
|
||||
shell_environment_policy: ShellEnvironmentPolicy::default(),
|
||||
non_login_shell_allowlist: DEFAULT_NON_LOGIN_SHELL_ALLOWLIST
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect(),
|
||||
user_instructions: None,
|
||||
notify: None,
|
||||
cwd: fixture.cwd(),
|
||||
|
||||
@@ -459,6 +459,15 @@ pub struct ShellEnvironmentPolicyToml {
|
||||
pub experimental_use_profile: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, Default, PartialEq)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub struct ShellConfigToml {
|
||||
/// Additional commands that can skip login shells when the
|
||||
/// `non_login_shell_heuristic` feature flag is enabled.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub non_login_allowlist: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
pub type EnvironmentVariablePattern = WildMatchPattern<'*', '?'>;
|
||||
|
||||
/// Deriving the `env` based on this policy works as follows:
|
||||
|
||||
@@ -55,6 +55,8 @@ pub enum Feature {
|
||||
Skills,
|
||||
/// Send warnings to the model to correct it on the tool usage.
|
||||
ModelWarnings,
|
||||
/// Skip login shell for known-safe commands when enabled.
|
||||
NonLoginShellHeuristic,
|
||||
}
|
||||
|
||||
impl Feature {
|
||||
@@ -342,4 +344,10 @@ pub const FEATURES: &[FeatureSpec] = &[
|
||||
stage: Stage::Experimental,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::NonLoginShellHeuristic,
|
||||
key: "non_login_shell_heuristic",
|
||||
stage: Stage::Experimental,
|
||||
default_enabled: false,
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use crate::is_safe_command::is_known_safe_command;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use shlex::split as shlex_split;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
@@ -60,6 +63,51 @@ impl Shell {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn should_use_login_shell(
|
||||
requested_login_shell: bool,
|
||||
command: &str,
|
||||
shell: &Shell,
|
||||
heuristic_enabled: bool,
|
||||
allowlist: &[String],
|
||||
) -> bool {
|
||||
if !requested_login_shell {
|
||||
return false;
|
||||
}
|
||||
if !heuristic_enabled {
|
||||
return true;
|
||||
}
|
||||
if !matches!(
|
||||
shell.shell_type,
|
||||
ShellType::Bash | ShellType::Zsh | ShellType::Sh
|
||||
) {
|
||||
return requested_login_shell;
|
||||
}
|
||||
|
||||
let is_allowlisted = shlex_split(command)
|
||||
.and_then(|tokens| {
|
||||
tokens.first().map(|token| {
|
||||
let token_name = Path::new(token)
|
||||
.file_name()
|
||||
.and_then(|name| name.to_str())
|
||||
.unwrap_or(token.as_str());
|
||||
allowlist
|
||||
.iter()
|
||||
.any(|cmd| cmd == token || cmd == token_name)
|
||||
})
|
||||
})
|
||||
.unwrap_or(false);
|
||||
if !is_allowlisted {
|
||||
return true;
|
||||
}
|
||||
|
||||
let derived = shell.derive_exec_args(command, false);
|
||||
if is_known_safe_command(&derived) {
|
||||
return false;
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
fn get_user_shell_path() -> Option<PathBuf> {
|
||||
use libc::getpwuid;
|
||||
@@ -359,7 +407,8 @@ mod tests {
|
||||
assert!(
|
||||
shell_path == PathBuf::from("/bin/bash")
|
||||
|| shell_path == PathBuf::from("/usr/bin/bash")
|
||||
|| shell_path == PathBuf::from("/usr/local/bin/bash"),
|
||||
|| shell_path == PathBuf::from("/usr/local/bin/bash")
|
||||
|| shell_path == PathBuf::from("/opt/homebrew/bin/bash"),
|
||||
"shell path: {shell_path:?}",
|
||||
);
|
||||
}
|
||||
@@ -451,4 +500,23 @@ mod tests {
|
||||
|
||||
assert!(shell_path.ends_with("pwsh.exe") || shell_path.ends_with("powershell.exe"));
|
||||
}
|
||||
|
||||
/// With the heuristic enabled on Bash, an allowlisted command drops the login
/// shell while a command outside the allowlist keeps it.
#[test]
fn non_login_shell_heuristic_respects_allowlist() {
    let shell = Shell {
        shell_type: ShellType::Bash,
        shell_path: PathBuf::from("/bin/bash"),
    };
    let allowlist = vec!["ls".to_string()];

    assert!(!should_use_login_shell(
        true, "ls -la", &shell, true, &allowlist
    ));
    assert!(should_use_login_shell(
        true,
        "awk '{print $1}' file.txt",
        &shell,
        true,
        &allowlist
    ));
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ use crate::protocol::ExecCommandSource;
|
||||
use crate::protocol::SandboxPolicy;
|
||||
use crate::protocol::TaskStartedEvent;
|
||||
use crate::sandboxing::ExecEnv;
|
||||
use crate::shell::should_use_login_shell;
|
||||
use crate::state::TaskKind;
|
||||
use crate::tools::format_exec_output_str;
|
||||
use crate::user_shell_command::user_shell_command_record_item;
|
||||
@@ -66,7 +67,13 @@ impl SessionTask for UserShellCommandTask {
|
||||
// Execute the user's script under their default shell when known; this
|
||||
// allows commands that use shell features (pipes, &&, redirects, etc.).
|
||||
// We do not source rc files or otherwise reformat the script.
|
||||
let use_login_shell = true;
|
||||
let use_login_shell = should_use_login_shell(
|
||||
true,
|
||||
&self.command,
|
||||
session.user_shell(),
|
||||
turn_context.non_login_shell_heuristic_enabled,
|
||||
&turn_context.non_login_shell_allowlist,
|
||||
);
|
||||
let command = session
|
||||
.user_shell()
|
||||
.derive_exec_args(&self.command, use_login_shell);
|
||||
|
||||
@@ -11,6 +11,7 @@ use crate::function_tool::FunctionCallError;
|
||||
use crate::is_safe_command::is_known_safe_command;
|
||||
use crate::protocol::ExecCommandSource;
|
||||
use crate::sandboxing::SandboxPermissions;
|
||||
use crate::shell::should_use_login_shell;
|
||||
use crate::tools::context::ToolInvocation;
|
||||
use crate::tools::context::ToolOutput;
|
||||
use crate::tools::context::ToolPayload;
|
||||
@@ -49,7 +50,13 @@ impl ShellCommandHandler {
|
||||
turn_context: &TurnContext,
|
||||
) -> ExecParams {
|
||||
let shell = session.user_shell();
|
||||
let use_login_shell = true;
|
||||
let use_login_shell = should_use_login_shell(
|
||||
true,
|
||||
¶ms.command,
|
||||
shell,
|
||||
turn_context.non_login_shell_heuristic_enabled,
|
||||
&turn_context.non_login_shell_allowlist,
|
||||
);
|
||||
let command = shell.derive_exec_args(¶ms.command, use_login_shell);
|
||||
|
||||
ExecParams {
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::protocol::ExecCommandSource;
|
||||
use crate::protocol::ExecOutputStream;
|
||||
use crate::shell::default_user_shell;
|
||||
use crate::shell::get_shell_by_model_provided_path;
|
||||
use crate::shell::should_use_login_shell;
|
||||
use crate::tools::context::ToolInvocation;
|
||||
use crate::tools::context::ToolOutput;
|
||||
use crate::tools::context::ToolPayload;
|
||||
@@ -93,7 +94,15 @@ impl ToolHandler for UnifiedExecHandler {
|
||||
let Ok(params) = serde_json::from_str::<ExecCommandArgs>(arguments) else {
|
||||
return true;
|
||||
};
|
||||
let command = get_command(¶ms);
|
||||
let shell = resolve_shell(¶ms);
|
||||
let use_login_shell = should_use_login_shell(
|
||||
params.login,
|
||||
¶ms.cmd,
|
||||
&shell,
|
||||
invocation.turn.non_login_shell_heuristic_enabled,
|
||||
&invocation.turn.non_login_shell_allowlist,
|
||||
);
|
||||
let command = shell.derive_exec_args(¶ms.cmd, use_login_shell);
|
||||
!is_known_safe_command(&command)
|
||||
}
|
||||
|
||||
@@ -130,7 +139,15 @@ impl ToolHandler for UnifiedExecHandler {
|
||||
})?;
|
||||
let process_id = manager.allocate_process_id().await;
|
||||
|
||||
let command = get_command(&args);
|
||||
let shell = resolve_shell(&args);
|
||||
let use_login_shell = should_use_login_shell(
|
||||
args.login,
|
||||
&args.cmd,
|
||||
&shell,
|
||||
turn.non_login_shell_heuristic_enabled,
|
||||
&turn.non_login_shell_allowlist,
|
||||
);
|
||||
let command = shell.derive_exec_args(&args.cmd, use_login_shell);
|
||||
let ExecCommandArgs {
|
||||
workdir,
|
||||
yield_time_ms,
|
||||
@@ -253,14 +270,18 @@ impl ToolHandler for UnifiedExecHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn get_command(args: &ExecCommandArgs) -> Vec<String> {
|
||||
let shell = if let Some(shell_str) = &args.shell {
|
||||
let shell = resolve_shell(args);
|
||||
shell.derive_exec_args(&args.cmd, args.login)
|
||||
}
|
||||
|
||||
fn resolve_shell(args: &ExecCommandArgs) -> crate::shell::Shell {
|
||||
if let Some(shell_str) = &args.shell {
|
||||
get_shell_by_model_provided_path(&PathBuf::from(shell_str))
|
||||
} else {
|
||||
default_user_shell()
|
||||
};
|
||||
|
||||
shell.derive_exec_args(&args.cmd, args.login)
|
||||
}
|
||||
}
|
||||
|
||||
fn format_response(response: &UnifiedExecResponse) -> String {
|
||||
|
||||
615
codex-rs/scripts/bench_codex_exec_non_login.py
Normal file
615
codex-rs/scripts/bench_codex_exec_non_login.py
Normal file
@@ -0,0 +1,615 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Run codex exec end-to-end with and without the non_login_shell_heuristic feature.
|
||||
|
||||
This script repeatedly runs:
|
||||
- codex exec with the heuristic disabled
|
||||
- codex exec with the heuristic enabled
|
||||
|
||||
for one or more prompts and reports per-run wall times, per-prompt summaries,
|
||||
and combined summary statistics.
|
||||
|
||||
Runs can be parallelized (concurrency > 1) to speed up sampling, but note that
|
||||
parallel rollouts may contend for local/remote resources and slightly skew
|
||||
latency compared to strictly serial runs.
|
||||
|
||||
Usage:
|
||||
python scripts/bench_codex_exec_non_login.py \
|
||||
--iterations 5 \
|
||||
--prompt "Read and summarize codex_berry" \
|
||||
--prompt "Explain the architecture" \
|
||||
--workdir /path/to/repo \
|
||||
--model gpt-5.1-codex-max \
|
||||
--reasoning-effort high \
|
||||
--concurrency 2 \
|
||||
--codex-bin /path/to/codex \
|
||||
--skip-feature-toggle # when benchmarking an older codex binary
|
||||
|
||||
Notes:
|
||||
- Runs will incur network/LLM variance; prefer N >= 5–10.
|
||||
- Requires the `codex` binary (local path supported) and valid credentials.
|
||||
- Results are printed in seconds; failures are logged and included in the output.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import math
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class RunResult:
    """Outcome of one `codex exec` invocation."""

    # Wall time of the whole run, in seconds.
    duration: float
    # Exit code of the codex process.
    exit_code: int
    # Captured stderr, decoded and stripped.
    stderr: str
    # Summed time between started/completed events of command executions, in seconds.
    command_time: float
    # Number of command executions that were started.
    command_count: int
    # Number of MCP tool calls that were started.
    mcp_call_count: int
    # Token usage reported by the run, if any was seen.
    usage: Optional["Usage"]
|
||||
|
||||
|
||||
@dataclass
class Usage:
    """Token counts taken from a `turn.completed` event's `usage` payload."""

    input_tokens: int
    cached_input_tokens: int
    output_tokens: int
|
||||
|
||||
|
||||
async def run_once(
    base_cmd: List[str],
    prompt: str,
    feature_enabled: bool,
    toggle_feature: bool,
    sem: asyncio.Semaphore,
) -> RunResult:
    """Run the codex command once and collect timing and event statistics.

    The command line is `base_cmd`, optionally followed by
    `-c features.non_login_shell_heuristic=<true|false>` (only when
    `toggle_feature` is set), followed by `prompt`. The process is started
    while holding `sem`, which bounds how many runs execute concurrently.

    Stdout is parsed as newline-delimited JSON events; lines that are not
    valid JSON are ignored. Stderr is captured in full.
    """
    cmd = list(base_cmd)
    if toggle_feature:
        cmd.extend(
            [
                "-c",
                f"features.non_login_shell_heuristic={'true' if feature_enabled else 'false'}",
            ]
        )
    cmd.append(prompt)
    async with sem:
        start = time.perf_counter()
        proc = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )

        async def read_stdout() -> tuple[float, int, int, Optional[Usage]]:
            """Consume stdout and aggregate the event stream."""
            command_start_times: dict[str, float] = {}
            command_time = 0.0
            command_count = 0
            mcp_call_count = 0
            usage: Optional[Usage] = None
            buffer = b""

            def handle_line(raw_line: bytes) -> None:
                nonlocal command_time, command_count, mcp_call_count, usage
                # Timestamp on arrival so command durations reflect when the
                # events were observed by this script.
                ts = time.perf_counter()
                line = raw_line.decode(errors="replace").strip()
                if not line:
                    return
                try:
                    event = json.loads(line)
                except json.JSONDecodeError:
                    return

                event_type = event.get("type")
                if event_type == "turn.completed" and (usage_info := event.get("usage")):
                    usage = Usage(
                        input_tokens=usage_info.get("input_tokens", 0),
                        cached_input_tokens=usage_info.get("cached_input_tokens", 0),
                        output_tokens=usage_info.get("output_tokens", 0),
                    )

                if not event_type or not event_type.startswith("item."):
                    return

                item = event.get("item") or {}
                item_type = item.get("type")
                item_id = item.get("id")

                if item_type == "command_execution":
                    if event_type == "item.started" and item_id:
                        command_start_times[item_id] = ts
                        command_count += 1
                    elif event_type == "item.completed" and item_id:
                        start_ts = command_start_times.pop(item_id, None)
                        # Compare against None: a timestamp is a float and
                        # must not be judged by truthiness.
                        if start_ts is not None:
                            command_time += ts - start_ts
                elif item_type == "mcp_tool_call":
                    if event_type == "item.started":
                        mcp_call_count += 1

            assert proc.stdout is not None
            try:
                # Read raw chunks and split on newlines manually.
                while True:
                    chunk = await proc.stdout.read(4096)
                    if not chunk:
                        break
                    buffer += chunk
                    lines = buffer.split(b"\n")
                    # The last element is the (possibly empty) partial line.
                    buffer = lines.pop() if lines else b""
                    for raw_line in lines:
                        handle_line(raw_line)
            finally:
                # Process a trailing line without newline, also on cancellation.
                if buffer:
                    handle_line(buffer)

            return command_time, command_count, mcp_call_count, usage

        # Start draining both pipes before waiting for exit so the child
        # cannot block on a full pipe.
        stdout_task = asyncio.create_task(read_stdout())
        stderr_task = (
            asyncio.create_task(proc.stderr.read()) if proc.stderr is not None else None
        )

        await proc.wait()
        stderr_bytes = await stderr_task if stderr_task is not None else b""
        command_time, command_count, mcp_call_count, usage = await stdout_task

        duration = time.perf_counter() - start
        stderr_text = stderr_bytes.decode(errors="replace").strip() if stderr_bytes else ""
        return RunResult(
            duration=duration,
            exit_code=proc.returncode,
            stderr=stderr_text,
            command_time=command_time,
            command_count=command_count,
            mcp_call_count=mcp_call_count,
            usage=usage,
        )
|
||||
|
||||
|
||||
async def run_case(
    label: str,
    base_cmd: List[str],
    prompt: str,
    iterations: int,
    feature_enabled: bool,
    toggle_feature: bool,
    concurrency: int,
) -> tuple[
    list[float], list[float], list[float], list[int], list[int], list[Usage], int
]:
    """Run `iterations` invocations of one configuration and gather the samples.

    At most `concurrency` runs execute at a time. Results are reported in
    completion order, so the printed run index is not the start order.

    Returns, in order: wall durations, total command time per run, mean
    command time per command (only for runs that executed a command), command
    counts, MCP call counts, token usages (only for runs that reported one),
    and the number of runs with a non-zero exit code. Failed runs still
    contribute to the timing lists.
    """
    sem = asyncio.Semaphore(concurrency)
    tasks = [
        asyncio.create_task(
            run_once(base_cmd, prompt, feature_enabled, toggle_feature, sem)
        )
        for _ in range(iterations)
    ]
    durations: List[float] = []
    command_times: List[float] = []
    command_times_per_cmd: List[float] = []
    command_counts: List[int] = []
    mcp_call_counts: List[int] = []
    usages: List[Usage] = []
    failures = 0
    for idx, task in enumerate(asyncio.as_completed(tasks), start=1):
        result = await task
        durations.append(result.duration)
        command_times.append(result.command_time)
        if result.command_count > 0:
            command_times_per_cmd.append(result.command_time / result.command_count)
        command_counts.append(result.command_count)
        mcp_call_counts.append(result.mcp_call_count)
        if result.usage:
            usages.append(result.usage)
        status = "ok" if result.exit_code == 0 else f"fail ({result.exit_code})"
        cmd_time_str = f" cmd_time={result.command_time:.3f}s cmds={result.command_count}"
        mcp_str = f" mcp_calls={result.mcp_call_count}" if result.mcp_call_count else ""
        print(
            f"[{label}] run {idx}/{iterations}: {result.duration:.3f}s"
            f" [{status}]{cmd_time_str}{mcp_str}"
        )
        if result.exit_code != 0:
            failures += 1
            # NOTE(review): nesting reconstructed -- stderr is assumed to be
            # printed only for failed runs; confirm against the original file.
            if result.stderr:
                print(f" stderr: {result.stderr}", file=sys.stderr)
    return (
        durations,
        command_times,
        command_times_per_cmd,
        command_counts,
        mcp_call_counts,
        usages,
        failures,
    )
|
||||
|
||||
|
||||
def summarize(label: str, durations: List[float]) -> None:
    """Print n, mean with confidence margin, median and p95 of `durations`.

    Prints a "no runs recorded" line for an empty list. With a single sample
    the p95 is that sample, since `statistics.quantiles` needs two points.
    """
    if not durations:
        print(f"[{label}] no runs recorded")
        return
    mean = statistics.mean(durations)
    median = statistics.median(durations)
    if len(durations) < 2:
        p95 = durations[0]
    else:
        # 99 cut points for n=100; index 94 is the 95th percentile.
        p95 = statistics.quantiles(durations, n=100)[94]
    margin = confidence_margin(durations)
    print(
        f"[{label}] n={len(durations)} "
        f"mean={mean:.3f}s±{margin:.3f}s median={median:.3f}s p95={p95:.3f}s"
    )
|
||||
|
||||
|
||||
def summarize_command_time(label: str, command_times: List[float], command_counts: List[int]) -> None:
    """Print statistics of per-run command time plus the mean command count.

    Prints a "no runs recorded" line when `command_times` is empty. The
    average command count is 0 when `command_counts` is empty.
    """
    if not command_times:
        print(f"[{label}] command time: no runs recorded")
        return
    mean = statistics.mean(command_times)
    median = statistics.median(command_times)
    # A single sample is its own p95; quantiles() needs at least two points.
    p95 = command_times[0] if len(command_times) < 2 else statistics.quantiles(command_times, n=100)[94]
    avg_cmds = statistics.mean(command_counts) if command_counts else 0
    margin = confidence_margin(command_times)
    print(
        f"[{label}] command time n={len(command_times)} "
        f"mean={mean:.3f}s±{margin:.3f}s median={median:.3f}s p95={p95:.3f}s avg_cmds={avg_cmds:.2f}"
    )
|
||||
|
||||
|
||||
def summarize_command_time_per_command(label: str, command_times_per_cmd: List[float]) -> None:
    """Print statistics of the mean time per command across runs.

    Prints a "no runs recorded" line for an empty list.
    """
    if not command_times_per_cmd:
        print(f"[{label}] command time per command: no runs recorded")
        return
    mean = statistics.mean(command_times_per_cmd)
    median = statistics.median(command_times_per_cmd)
    # A single sample is its own p95; quantiles() needs at least two points.
    p95 = (
        command_times_per_cmd[0]
        if len(command_times_per_cmd) < 2
        else statistics.quantiles(command_times_per_cmd, n=100)[94]
    )
    margin = confidence_margin(command_times_per_cmd)
    print(
        f"[{label}] command time per command n={len(command_times_per_cmd)} "
        f"mean={mean:.3f}s±{margin:.3f}s median={median:.3f}s p95={p95:.3f}s"
    )
|
||||
|
||||
|
||||
def summarize_usage(label: str, usages: List["Usage"]) -> None:
    """Print the mean input, cached-input and output token counts.

    Prints a "no runs recorded" line for an empty list.
    """
    if not usages:
        print(f"[{label}] tokens: no runs recorded")
        return
    avg_input = statistics.mean(u.input_tokens for u in usages)
    avg_cached = statistics.mean(u.cached_input_tokens for u in usages)
    avg_output = statistics.mean(u.output_tokens for u in usages)
    print(
        f"[{label}] tokens avg input={avg_input:.1f} cached={avg_cached:.1f} output={avg_output:.1f}"
    )
|
||||
|
||||
|
||||
def summarize_mcp_calls(label: str, mcp_counts: List[int]) -> None:
    """Print the mean and maximum number of MCP calls per run.

    Prints a "no runs recorded" line for an empty list.
    """
    if not mcp_counts:
        print(f"[{label}] mcp calls: no runs recorded")
        return
    avg_mcp = statistics.mean(mcp_counts)
    max_mcp = max(mcp_counts)
    print(f"[{label}] mcp calls avg={avg_mcp:.2f} max={max_mcp}")
|
||||
|
||||
|
||||
def format_prompt_label(prompt: str, idx: int) -> str:
    """Build a short display label for the prompt at zero-based index `idx`.

    The prompt is stripped; anything longer than 60 characters is cut to its
    first 57 characters followed by "...". The label is numbered from 1.
    """
    snippet = prompt.strip()
    if len(snippet) > 60:
        snippet = snippet[:57] + "..."
    return f"prompt {idx + 1}: {snippet}"
|
||||
|
||||
|
||||
def confidence_margin(values: List[float]) -> float:
    """Return 1.96 * sample stdev / sqrt(n), or 0.0 with fewer than two values.

    The sample standard deviation is undefined for fewer than two points,
    hence the zero fallback.
    """
    if len(values) < 2:
        return 0.0
    stdev = statistics.stdev(values)
    return 1.96 * stdev / math.sqrt(len(values))
|
||||
|
||||
|
||||
def stats(values: List[float]) -> dict:
    """Return summary statistics of `values` as a dict.

    Keys are "n", "mean", "median", "p95" and "margin". For an empty list
    "n" is 0 and every other entry is None. With a single value the p95 is
    that value, since `statistics.quantiles` needs two points.
    """
    if not values:
        return {
            "n": 0,
            "mean": None,
            "median": None,
            "p95": None,
            "margin": None,
        }
    mean = statistics.mean(values)
    median = statistics.median(values)
    if len(values) < 2:
        p95 = values[0]
    else:
        # 99 cut points for n=100; index 94 is the 95th percentile.
        p95 = statistics.quantiles(values, n=100)[94]
    margin = confidence_margin(values)
    return {"n": len(values), "mean": mean, "median": median, "p95": p95, "margin": margin}
|
||||
|
||||
|
||||
def safe_avg(values: List[float]) -> Optional[float]:
    """Return the mean of ``values``, or ``None`` when ``values`` is empty."""
    if values:
        return statistics.mean(values)
    return None
|
||||
|
||||
|
||||
def fmt_stat(s: dict) -> str:
    """Render a stats dict as ``mean±margin`` with three decimals.

    Returns ``"-"`` when the dict's ``n`` entry is zero (no samples).
    """
    if s["n"]:
        return "{:.3f}±{:.3f}".format(s["mean"], s["margin"])
    return "-"
|
||||
|
||||
|
||||
def fmt_avg(value: Optional[float]) -> str:
    """Format ``value`` with two decimals, or return ``"-"`` when it is ``None``."""
    if value is None:
        return "-"
    return f"{value:.2f}"
|
||||
|
||||
|
||||
def fmt_int(value: Optional[int]) -> str:
    """Return ``str(value)``, or ``"-"`` when ``value`` is ``None``."""
    if value is None:
        return "-"
    return str(value)
|
||||
|
||||
|
||||
def render_table(headers: List[str], rows: List[List[str]]) -> None:
    """Print ``headers`` and ``rows`` as a left-aligned, space-separated table.

    Each column is as wide as its longest cell (header included). A rule of
    dashes matching each column width is printed between the header line and
    the data rows.
    """
    col_widths = list(map(len, headers))
    for cells in rows:
        for col, text in enumerate(cells):
            col_widths[col] = max(col_widths[col], len(text))

    # One "{:<width}" placeholder per column, joined by a single space.
    line_template = " ".join("{:<%d}" % width for width in col_widths)
    rule = ["-" * width for width in col_widths]
    for cells in (headers, rule, *rows):
        print(line_template.format(*cells))
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the benchmark's command-line options from ``sys.argv``.

    ``--iterations`` and ``--concurrency`` must be integers >= 1; zero or
    negative values are rejected at parse time with a usage error instead of
    producing a run with no samples or an invalid concurrency level.

    Returns:
        The populated ``argparse.Namespace``. ``prompts`` is a list with one
        entry per ``--prompt`` occurrence; ``extra_config`` is a (possibly
        empty) list of ``-c`` overrides.
    """

    def positive_int(raw: str) -> int:
        # argparse reports ArgumentTypeError messages as usage errors (exit status 2).
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
        if value < 1:
            raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
        return value

    parser = argparse.ArgumentParser(
        description="Benchmark codex exec with/without non_login_shell_heuristic."
    )
    parser.add_argument(
        "--codex-bin",
        default="codex",
        help="Path to the codex binary (default: codex on PATH).",
    )
    parser.add_argument(
        "--iterations",
        type=positive_int,
        default=5,
        help="Number of runs per configuration (default: 5)",
    )
    parser.add_argument(
        "--prompt",
        dest="prompts",
        action="append",
        required=True,
        help="Prompt to send to codex exec (repeat for multiple prompts).",
    )
    parser.add_argument(
        "--workdir",
        default=None,
        help="Working directory for codex exec (--cd); defaults to current dir.",
    )
    parser.add_argument(
        "--model",
        default=None,
        help="Optional model override passed to codex exec (--model).",
    )
    parser.add_argument(
        "--reasoning-effort",
        choices=["none", "minimal", "low", "medium", "high", "xhigh"],
        default=None,
        help="Optional reasoning effort override passed via -c model_reasoning_effort=...",
    )
    parser.add_argument(
        "--extra-config",
        action="append",
        default=[],
        help="Additional -c overrides passed to codex exec (repeatable).",
    )
    parser.add_argument(
        "--concurrency",
        type=positive_int,
        default=1,
        help="Number of concurrent runs to launch per configuration (default: 1).",
    )
    parser.add_argument(
        "--skip-feature-toggle",
        action="store_true",
        help=(
            "Do not inject the non_login_shell_heuristic flag; "
            "use when benchmarking an older codex binary."
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print codex exec stdout/stderr for debugging.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Benchmark each prompt with the feature off and on, then print summaries.

    For every prompt, ``run_case`` is invoked twice via ``asyncio.run``: once
    with ``feature_enabled=False`` (labelled "login-shell") and once with
    ``feature_enabled=True`` (labelled "non-login-shell"). A per-prompt table
    and a combined table are printed, followed by the failure counts.

    Returns:
        0 when no run failed in either configuration, otherwise 1.
    """
    args = parse_args()

    base_cmd: List[str] = [args.codex_bin, "exec"]
    if args.workdir:
        base_cmd += ["--cd", args.workdir]
    if args.model:
        base_cmd += ["--model", args.model]
    if args.reasoning_effort:
        base_cmd += ["-c", f"model_reasoning_effort={args.reasoning_effort}"]
    for override in args.extra_config:
        base_cmd += ["-c", override]
    base_cmd += ["--json"]

    toggle_feature = not args.skip_feature_toggle
    prompts = args.prompts

    print(
        f"Running {args.iterations} iterations per config for {len(prompts)} prompt(s)..."
    )

    # (key prefix, case label, feature_enabled) for the two configurations.
    modes = (("off", "login-shell", False), ("on", "non-login-shell", True))
    # Names of the list-valued results of run_case, in the order it returns them.
    series_names = (
        "durations",
        "cmd_times",
        "cmd_times_per_cmd",
        "cmd_counts",
        "mcp_calls",
        "usages",
    )

    combined: dict = {
        f"{mode}_{name}": [] for mode, _, _ in modes for name in series_names
    }
    failure_totals = {"off": 0, "on": 0}
    per_prompt_results: List[dict] = []

    for idx, prompt in enumerate(prompts):
        label = format_prompt_label(prompt, idx)
        print(f"\nPrompt {idx + 1}/{len(prompts)}: {label}")

        result: dict = {"label": label}
        for mode, case_name, enabled in modes:
            (
                durations,
                cmd_times,
                cmd_times_per_cmd,
                cmd_counts,
                mcp_calls,
                usages,
                failures,
            ) = asyncio.run(
                run_case(
                    f"{case_name} | {label}",
                    base_cmd,
                    prompt,
                    args.iterations,
                    feature_enabled=enabled,
                    toggle_feature=toggle_feature,
                    concurrency=args.concurrency,
                )
            )
            collected = (
                durations,
                cmd_times,
                cmd_times_per_cmd,
                cmd_counts,
                mcp_calls,
                usages,
            )
            for name, series in zip(series_names, collected):
                result[f"{mode}_{name}"] = series
            result[f"{mode}_failures"] = failures
        per_prompt_results.append(result)

        # Fold this prompt's samples into the cross-prompt totals only after
        # both configurations have finished.
        for key, bucket in combined.items():
            bucket.extend(result[key])
        for mode in failure_totals:
            failure_totals[mode] += result[f"{mode}_failures"]

    metric_headers = [
        "wall login",
        "wall non",
        "cmd login",
        "cmd non",
        "cmd/call login",
        "cmd/call non",
        "avg cmds login",
        "avg cmds non",
        "avg mcp login",
        "avg mcp non",
    ]

    def summary_cells(data: dict) -> List[str]:
        # Column order mirrors metric_headers: each metric as (login, non-login).
        timing = [
            fmt_stat(stats(data[f"{mode}_{name}"]))
            for name in ("durations", "cmd_times", "cmd_times_per_cmd")
            for mode in ("off", "on")
        ]
        counts = [
            fmt_avg(safe_avg(data[f"{mode}_{name}"]))
            for name in ("cmd_counts", "mcp_calls")
            for mode in ("off", "on")
        ]
        return timing + counts

    print("\nPer-prompt summary (means ±95% CI):")
    per_prompt_rows: List[List[str]] = [
        [entry["label"], *summary_cells(entry)] for entry in per_prompt_results
    ]
    render_table(["prompt", *metric_headers], per_prompt_rows)

    print("\nCombined summary (means ±95% CI):")
    combined_rows = [["all prompts", *summary_cells(combined)]]
    render_table(["scope", *metric_headers], combined_rows)

    print(f"\nlogin-shell failures (all prompts): {failure_totals['off']}")
    print(f"non-login-shell failures (all prompts): {failure_totals['on']}")

    return 1 if failure_totals["off"] + failure_totals["on"] else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
@@ -48,6 +48,7 @@ Supported features:
|
||||
| `web_search_request` | false | Stable | Allow the model to issue web searches |
|
||||
| `experimental_sandbox_command_assessment` | false | Experimental | Enable model-based sandbox risk assessment |
|
||||
| `ghost_commit` | false | Experimental | Create a ghost commit each turn |
|
||||
| `non_login_shell_heuristic` | false | Experimental | Skip login shells for allow-listed safe commands |
|
||||
| `enable_experimental_windows_sandbox` | false | Experimental | Use the Windows restricted-token sandbox |
|
||||
|
||||
Notes:
|
||||
@@ -416,6 +417,28 @@ set = { PATH = "/usr/bin", MY_FLAG = "1" }
|
||||
|
||||
Currently, `CODEX_SANDBOX_NETWORK_DISABLED=1` is also added to the environment, assuming network is disabled. This is not configurable.
|
||||
|
||||
### shell.non_login_allowlist
|
||||
|
||||
When the `non_login_shell_heuristic` feature flag is enabled, Codex will skip `-l`
|
||||
for obviously safe commands (e.g., `ls`, `rg --files`) to avoid loading login
|
||||
profiles on every exec. You can extend the allow-list via `config.toml`:
|
||||
|
||||
```toml
|
||||
[shell]
|
||||
non_login_allowlist = ["ls", "rg", "git", "cat", "rm", "rmdir"]
|
||||
```
|
||||
|
||||
The defaults cover common read-only commands and simple file removals (`ls`, `cat`, `rg`, `git status`,
|
||||
`sed -n`, `rm <file>`, `rmdir <dir>`, etc.); entries you add are merged on top.
|
||||
The heuristic only applies when the feature flag is enabled, and it keeps using
|
||||
login shells for commands that are not known-safe.
|
||||
|
||||
You can compare login vs non-login startup costs locally via:
|
||||
|
||||
```bash
|
||||
cargo run -p codex-core --example non_login_shell_bench -- --iterations 5
|
||||
```
|
||||
|
||||
## MCP integration
|
||||
|
||||
### mcp_servers
|
||||
|
||||
Reference in New Issue
Block a user