mirror of
https://github.com/openai/codex.git
synced 2026-04-30 01:16:54 +00:00
Migrate coverage to shell_command (#7042)
This commit is contained in:
@@ -17,15 +17,11 @@ use core_test_support::wait_for_event;
|
||||
use regex_lite::Regex;
|
||||
use serde_json::json;
|
||||
|
||||
/// Integration test: spawn a long‑running shell tool via a mocked Responses SSE
|
||||
/// Integration test: spawn a long‑running shell_command tool via a mocked Responses SSE
|
||||
/// function call, then interrupt the session and expect TurnAborted.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn interrupt_long_running_tool_emits_turn_aborted() {
|
||||
let command = vec![
|
||||
"bash".to_string(),
|
||||
"-lc".to_string(),
|
||||
"sleep 60".to_string(),
|
||||
];
|
||||
let command = "sleep 60";
|
||||
|
||||
let args = json!({
|
||||
"command": command,
|
||||
@@ -33,14 +29,19 @@ async fn interrupt_long_running_tool_emits_turn_aborted() {
|
||||
})
|
||||
.to_string();
|
||||
let body = sse(vec![
|
||||
ev_function_call("call_sleep", "shell", &args),
|
||||
ev_function_call("call_sleep", "shell_command", &args),
|
||||
ev_completed("done"),
|
||||
]);
|
||||
|
||||
let server = start_mock_server().await;
|
||||
mount_sse_once(&server, body).await;
|
||||
|
||||
let codex = test_codex().build(&server).await.unwrap().codex;
|
||||
let codex = test_codex()
|
||||
.with_model("gpt-5.1")
|
||||
.build(&server)
|
||||
.await
|
||||
.unwrap()
|
||||
.codex;
|
||||
|
||||
// Kick off a turn that triggers the function call.
|
||||
codex
|
||||
@@ -67,11 +68,7 @@ async fn interrupt_long_running_tool_emits_turn_aborted() {
|
||||
/// responses server, and ensures the model receives the synthesized abort.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn interrupt_tool_records_history_entries() {
|
||||
let command = vec![
|
||||
"bash".to_string(),
|
||||
"-lc".to_string(),
|
||||
"sleep 60".to_string(),
|
||||
];
|
||||
let command = "sleep 60";
|
||||
let call_id = "call-history";
|
||||
|
||||
let args = json!({
|
||||
@@ -81,7 +78,7 @@ async fn interrupt_tool_records_history_entries() {
|
||||
.to_string();
|
||||
let first_body = sse(vec![
|
||||
ev_response_created("resp-history"),
|
||||
ev_function_call(call_id, "shell", &args),
|
||||
ev_function_call(call_id, "shell_command", &args),
|
||||
ev_completed("resp-history"),
|
||||
]);
|
||||
let follow_up_body = sse(vec![
|
||||
@@ -92,7 +89,11 @@ async fn interrupt_tool_records_history_entries() {
|
||||
let server = start_mock_server().await;
|
||||
let response_mock = mount_sse_sequence(&server, vec![first_body, follow_up_body]).await;
|
||||
|
||||
let fixture = test_codex().build(&server).await.unwrap();
|
||||
let fixture = test_codex()
|
||||
.with_model("gpt-5.1")
|
||||
.build(&server)
|
||||
.await
|
||||
.unwrap();
|
||||
let codex = Arc::clone(&fixture.codex);
|
||||
|
||||
codex
|
||||
|
||||
@@ -667,7 +667,7 @@ async fn apply_patch_cli_verification_failure_has_no_side_effects(
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn apply_patch_shell_heredoc_with_cd_updates_relative_workdir() -> Result<()> {
|
||||
async fn apply_patch_shell_command_heredoc_with_cd_updates_relative_workdir() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let harness = apply_patch_harness_with(|config| {
|
||||
@@ -684,14 +684,11 @@ async fn apply_patch_shell_heredoc_with_cd_updates_relative_workdir() -> Result<
|
||||
|
||||
let script = "cd sub && apply_patch <<'EOF'\n*** Begin Patch\n*** Update File: in_sub.txt\n@@\n-before\n+after\n*** End Patch\nEOF\n";
|
||||
let call_id = "shell-heredoc-cd";
|
||||
let args = json!({
|
||||
"command": ["bash", "-lc", script],
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
let args = json!({ "command": script, "timeout_ms": 5_000 });
|
||||
let bodies = vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse(vec![
|
||||
@@ -706,14 +703,14 @@ async fn apply_patch_shell_heredoc_with_cd_updates_relative_workdir() -> Result<
|
||||
let out = harness.function_call_stdout(call_id).await;
|
||||
assert!(
|
||||
out.contains("Success."),
|
||||
"expected successful apply_patch invocation via shell: {out}"
|
||||
"expected successful apply_patch invocation via shell_command: {out}"
|
||||
);
|
||||
assert_eq!(fs::read_to_string(&target)?, "after\n");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn apply_patch_shell_failure_propagates_error_and_skips_diff() -> Result<()> {
|
||||
async fn apply_patch_shell_command_failure_propagates_error_and_skips_diff() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let harness = apply_patch_harness_with(|config| {
|
||||
@@ -730,14 +727,11 @@ async fn apply_patch_shell_failure_propagates_error_and_skips_diff() -> Result<(
|
||||
|
||||
let script = "apply_patch <<'EOF'\n*** Begin Patch\n*** Update File: invalid.txt\n@@\n-nope\n+changed\n*** End Patch\nEOF\n";
|
||||
let call_id = "shell-apply-failure";
|
||||
let args = json!({
|
||||
"command": ["bash", "-lc", script],
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
let args = json!({ "command": script, "timeout_ms": 5_000 });
|
||||
let bodies = vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse(vec![
|
||||
@@ -780,10 +774,6 @@ async fn apply_patch_shell_failure_propagates_error_and_skips_diff() -> Result<(
|
||||
);
|
||||
|
||||
let out = harness.function_call_stdout(call_id).await;
|
||||
assert!(
|
||||
out.contains("apply_patch verification failed"),
|
||||
"expected verification failure message"
|
||||
);
|
||||
assert!(
|
||||
out.contains("Failed to find expected lines in"),
|
||||
"expected failure diagnostics: {out}"
|
||||
|
||||
@@ -71,7 +71,7 @@ enum ActionKind {
|
||||
response_body: &'static str,
|
||||
},
|
||||
RunCommand {
|
||||
command: &'static [&'static str],
|
||||
command: &'static str,
|
||||
},
|
||||
RunUnifiedExecCommand {
|
||||
command: &'static str,
|
||||
@@ -97,20 +97,12 @@ impl ActionKind {
|
||||
server: &MockServer,
|
||||
call_id: &str,
|
||||
with_escalated_permissions: bool,
|
||||
) -> Result<(Value, Option<Vec<String>>)> {
|
||||
) -> Result<(Value, Option<String>)> {
|
||||
match self {
|
||||
ActionKind::WriteFile { target, content } => {
|
||||
let (path, _) = target.resolve_for_patch(test);
|
||||
let _ = fs::remove_file(&path);
|
||||
let command = vec![
|
||||
"/bin/sh".to_string(),
|
||||
"-c".to_string(),
|
||||
format!(
|
||||
"printf {content:?} > {path:?} && cat {path:?}",
|
||||
content = content,
|
||||
path = path
|
||||
),
|
||||
];
|
||||
let command = format!("printf {content:?} > {path:?} && cat {path:?}");
|
||||
let event = shell_event(call_id, &command, 1_000, with_escalated_permissions)?;
|
||||
Ok((event, Some(command)))
|
||||
}
|
||||
@@ -127,21 +119,18 @@ impl ActionKind {
|
||||
.await;
|
||||
|
||||
let url = format!("{}{}", server.uri(), endpoint);
|
||||
let escaped_url = url.replace('\'', "\\'");
|
||||
let script = format!(
|
||||
"import sys\nimport urllib.request\nurl = {url:?}\ntry:\n data = urllib.request.urlopen(url, timeout=2).read().decode()\n print('OK:' + data.strip())\nexcept Exception as exc:\n print('ERR:' + exc.__class__.__name__)\n sys.exit(1)",
|
||||
"import sys\nimport urllib.request\nurl = '{escaped_url}'\ntry:\n data = urllib.request.urlopen(url, timeout=2).read().decode()\n print('OK:' + data.strip())\nexcept Exception as exc:\n print('ERR:' + exc.__class__.__name__)\n sys.exit(1)",
|
||||
);
|
||||
|
||||
let command = vec!["python3".to_string(), "-c".to_string(), script];
|
||||
let command = format!("python3 -c \"{script}\"");
|
||||
let event = shell_event(call_id, &command, 1_000, with_escalated_permissions)?;
|
||||
Ok((event, Some(command)))
|
||||
}
|
||||
ActionKind::RunCommand { command } => {
|
||||
let command: Vec<String> = command
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect();
|
||||
let event = shell_event(call_id, &command, 1_000, with_escalated_permissions)?;
|
||||
Ok((event, Some(command)))
|
||||
let event = shell_event(call_id, command, 1_000, with_escalated_permissions)?;
|
||||
Ok((event, Some(command.to_string())))
|
||||
}
|
||||
ActionKind::RunUnifiedExecCommand {
|
||||
command,
|
||||
@@ -154,14 +143,7 @@ impl ActionKind {
|
||||
with_escalated_permissions,
|
||||
*justification,
|
||||
)?;
|
||||
Ok((
|
||||
event,
|
||||
Some(vec![
|
||||
"/bin/bash".to_string(),
|
||||
"-lc".to_string(),
|
||||
command.to_string(),
|
||||
]),
|
||||
))
|
||||
Ok((event, Some(command.to_string())))
|
||||
}
|
||||
ActionKind::ApplyPatchFunction { target, content } => {
|
||||
let (path, patch_path) = target.resolve_for_patch(test);
|
||||
@@ -185,19 +167,19 @@ fn build_add_file_patch(patch_path: &str, content: &str) -> String {
|
||||
format!("*** Begin Patch\n*** Add File: {patch_path}\n+{content}\n*** End Patch\n")
|
||||
}
|
||||
|
||||
fn shell_apply_patch_command(patch: &str) -> Vec<String> {
|
||||
fn shell_apply_patch_command(patch: &str) -> String {
|
||||
let mut script = String::from("apply_patch <<'PATCH'\n");
|
||||
script.push_str(patch);
|
||||
if !patch.ends_with('\n') {
|
||||
script.push('\n');
|
||||
}
|
||||
script.push_str("PATCH\n");
|
||||
vec!["bash".to_string(), "-lc".to_string(), script]
|
||||
script
|
||||
}
|
||||
|
||||
fn shell_event(
|
||||
call_id: &str,
|
||||
command: &[String],
|
||||
command: &str,
|
||||
timeout_ms: u64,
|
||||
with_escalated_permissions: bool,
|
||||
) -> Result<Value> {
|
||||
@@ -209,7 +191,7 @@ fn shell_event(
|
||||
args["with_escalated_permissions"] = json!(true);
|
||||
}
|
||||
let args_str = serde_json::to_string(&args)?;
|
||||
Ok(ev_function_call(call_id, "shell", &args_str))
|
||||
Ok(ev_function_call(call_id, "shell_command", &args_str))
|
||||
}
|
||||
|
||||
fn exec_command_event(
|
||||
@@ -296,7 +278,10 @@ impl Expectation {
|
||||
}
|
||||
Expectation::FileCreatedNoExitCode { target, content } => {
|
||||
let (path, _) = target.resolve_for_patch(test);
|
||||
assert_eq!(result.exit_code, None, "expected no exit code for {path:?}");
|
||||
assert!(
|
||||
result.exit_code.is_none() || result.exit_code == Some(0),
|
||||
"expected no exit code for {path:?}",
|
||||
);
|
||||
assert!(
|
||||
result.stdout.contains(content),
|
||||
"stdout missing {content:?}: {}",
|
||||
@@ -385,8 +370,8 @@ impl Expectation {
|
||||
);
|
||||
}
|
||||
Expectation::NetworkSuccessNoExitCode { body_contains } => {
|
||||
assert_eq!(
|
||||
result.exit_code, None,
|
||||
assert!(
|
||||
result.exit_code.is_none() || result.exit_code == Some(0),
|
||||
"expected no exit code for successful network call: {}",
|
||||
result.stdout
|
||||
);
|
||||
@@ -433,8 +418,8 @@ impl Expectation {
|
||||
);
|
||||
}
|
||||
Expectation::CommandSuccessNoExitCode { stdout_contains } => {
|
||||
assert_eq!(
|
||||
result.exit_code, None,
|
||||
assert!(
|
||||
result.exit_code.is_none() || result.exit_code == Some(0),
|
||||
"expected no exit code for trusted command: {}",
|
||||
result.stdout
|
||||
);
|
||||
@@ -531,10 +516,18 @@ fn parse_result(item: &Value) -> CommandResult {
|
||||
CommandResult { exit_code, stdout }
|
||||
}
|
||||
Err(_) => {
|
||||
let structured = Regex::new(r"(?s)^Exit code:\s*(-?\d+).*?Output:\n(.*)$").unwrap();
|
||||
let regex =
|
||||
Regex::new(r"(?s)^.*?Process exited with code (\d+)\n.*?Output:\n(.*)$").unwrap();
|
||||
// parse freeform output
|
||||
if let Some(captures) = regex.captures(output_str) {
|
||||
if let Some(captures) = structured.captures(output_str) {
|
||||
let exit_code = captures.get(1).unwrap().as_str().parse::<i64>().unwrap();
|
||||
let output = captures.get(2).unwrap().as_str();
|
||||
CommandResult {
|
||||
exit_code: Some(exit_code),
|
||||
stdout: output.to_string(),
|
||||
}
|
||||
} else if let Some(captures) = regex.captures(output_str) {
|
||||
let exit_code = captures.get(1).unwrap().as_str().parse::<i64>().unwrap();
|
||||
let output = captures.get(2).unwrap().as_str();
|
||||
CommandResult {
|
||||
@@ -553,7 +546,7 @@ fn parse_result(item: &Value) -> CommandResult {
|
||||
|
||||
async fn expect_exec_approval(
|
||||
test: &TestCodex,
|
||||
expected_command: &[String],
|
||||
expected_command: &str,
|
||||
) -> ExecApprovalRequestEvent {
|
||||
let event = wait_for_event(&test.codex, |event| {
|
||||
matches!(
|
||||
@@ -565,7 +558,12 @@ async fn expect_exec_approval(
|
||||
|
||||
match event {
|
||||
EventMsg::ExecApprovalRequest(approval) => {
|
||||
assert_eq!(approval.command, expected_command);
|
||||
let last_arg = approval
|
||||
.command
|
||||
.last()
|
||||
.map(std::string::String::as_str)
|
||||
.unwrap_or_default();
|
||||
assert_eq!(last_arg, expected_command);
|
||||
approval
|
||||
}
|
||||
EventMsg::TaskComplete(_) => panic!("expected approval request before completion"),
|
||||
@@ -660,7 +658,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
features: vec![],
|
||||
model_override: Some("gpt-5.1"),
|
||||
outcome: Outcome::Auto,
|
||||
expectation: Expectation::FileCreatedNoExitCode {
|
||||
expectation: Expectation::FileCreated {
|
||||
target: TargetPath::OutsideWorkspace("dfa_on_request_5_1.txt"),
|
||||
content: "danger-on-request",
|
||||
},
|
||||
@@ -702,7 +700,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
approval_policy: UnlessTrusted,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
action: ActionKind::RunCommand {
|
||||
command: &["echo", "trusted-unless"],
|
||||
command: "echo trusted-unless",
|
||||
},
|
||||
with_escalated_permissions: false,
|
||||
features: vec![],
|
||||
@@ -717,7 +715,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
approval_policy: UnlessTrusted,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
action: ActionKind::RunCommand {
|
||||
command: &["echo", "trusted-unless"],
|
||||
command: "echo trusted-unless",
|
||||
},
|
||||
with_escalated_permissions: false,
|
||||
features: vec![],
|
||||
@@ -880,7 +878,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
approval_policy: OnRequest,
|
||||
sandbox_policy: SandboxPolicy::ReadOnly,
|
||||
action: ActionKind::RunCommand {
|
||||
command: &["echo", "trusted-read-only"],
|
||||
command: "echo trusted-read-only",
|
||||
},
|
||||
with_escalated_permissions: false,
|
||||
features: vec![],
|
||||
@@ -895,7 +893,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
approval_policy: OnRequest,
|
||||
sandbox_policy: SandboxPolicy::ReadOnly,
|
||||
action: ActionKind::RunCommand {
|
||||
command: &["echo", "trusted-read-only"],
|
||||
command: "echo trusted-read-only",
|
||||
},
|
||||
with_escalated_permissions: false,
|
||||
features: vec![],
|
||||
@@ -1020,7 +1018,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
},
|
||||
},
|
||||
ScenarioSpec {
|
||||
name: "apply_patch_shell_requires_patch_approval",
|
||||
name: "apply_patch_shell_command_requires_patch_approval",
|
||||
approval_policy: UnlessTrusted,
|
||||
sandbox_policy: workspace_write(false),
|
||||
action: ActionKind::ApplyPatchShell {
|
||||
@@ -1114,7 +1112,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
},
|
||||
},
|
||||
ScenarioSpec {
|
||||
name: "apply_patch_shell_outside_requires_patch_approval",
|
||||
name: "apply_patch_shell_command_outside_requires_patch_approval",
|
||||
approval_policy: OnRequest,
|
||||
sandbox_policy: workspace_write(false),
|
||||
action: ActionKind::ApplyPatchShell {
|
||||
@@ -1229,7 +1227,10 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
message_contains: if cfg!(target_os = "linux") {
|
||||
&["Permission denied"]
|
||||
} else {
|
||||
&["Permission denied|Operation not permitted|Read-only file system"]
|
||||
&[
|
||||
"Permission denied|Operation not permitted|operation not permitted|\
|
||||
Read-only file system",
|
||||
]
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -1238,7 +1239,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
approval_policy: Never,
|
||||
sandbox_policy: SandboxPolicy::ReadOnly,
|
||||
action: ActionKind::RunCommand {
|
||||
command: &["echo", "trusted-never"],
|
||||
command: "echo trusted-never",
|
||||
},
|
||||
with_escalated_permissions: false,
|
||||
features: vec![],
|
||||
@@ -1373,7 +1374,10 @@ fn scenarios() -> Vec<ScenarioSpec> {
|
||||
message_contains: if cfg!(target_os = "linux") {
|
||||
&["Permission denied"]
|
||||
} else {
|
||||
&["Permission denied|Operation not permitted|Read-only file system"]
|
||||
&[
|
||||
"Permission denied|Operation not permitted|operation not permitted|\
|
||||
Read-only file system",
|
||||
]
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -1509,7 +1513,7 @@ async fn run_scenario(scenario: &ScenarioSpec) -> Result<()> {
|
||||
expected_reason,
|
||||
} => {
|
||||
let command = expected_command
|
||||
.as_ref()
|
||||
.as_deref()
|
||||
.expect("exec approval requires shell command");
|
||||
let approval = expect_exec_approval(&test, command).await;
|
||||
if let Some(expected_reason) = expected_reason {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use codex_core::model_family::find_family_for_model;
|
||||
use codex_core::protocol::AskForApproval;
|
||||
use codex_core::protocol::EventMsg;
|
||||
use codex_core::protocol::Op;
|
||||
@@ -25,17 +26,17 @@ use pretty_assertions::assert_eq;
|
||||
async fn codex_delegate_forwards_exec_approval_and_proceeds_on_approval() {
|
||||
skip_if_no_network!();
|
||||
|
||||
// Sub-agent turn 1: emit a shell function_call requiring approval, then complete.
|
||||
// Sub-agent turn 1: emit a shell_command function_call requiring approval, then complete.
|
||||
let call_id = "call-exec-1";
|
||||
let args = serde_json::json!({
|
||||
"command": ["bash", "-lc", "rm -rf delegated"],
|
||||
"command": "rm -rf delegated",
|
||||
"timeout_ms": 1000,
|
||||
"with_escalated_permissions": true,
|
||||
})
|
||||
.to_string();
|
||||
let sse1 = sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &args),
|
||||
ev_function_call(call_id, "shell_command", &args),
|
||||
ev_completed("resp-1"),
|
||||
]);
|
||||
|
||||
@@ -61,6 +62,8 @@ async fn codex_delegate_forwards_exec_approval_and_proceeds_on_approval() {
|
||||
let mut builder = test_codex().with_config(|config| {
|
||||
config.approval_policy = AskForApproval::OnRequest;
|
||||
config.sandbox_policy = SandboxPolicy::ReadOnly;
|
||||
config.model = "gpt-5.1".to_string();
|
||||
config.model_family = find_family_for_model("gpt-5.1").expect("gpt-5.1 is a valid model");
|
||||
});
|
||||
let test = builder.build(&server).await.expect("build test codex");
|
||||
|
||||
@@ -138,6 +141,8 @@ async fn codex_delegate_forwards_patch_approval_and_proceeds_on_decision() {
|
||||
// Use a restricted sandbox so patch approval is required
|
||||
config.sandbox_policy = SandboxPolicy::ReadOnly;
|
||||
config.include_apply_patch_tool = true;
|
||||
config.model = "gpt-5.1".to_string();
|
||||
config.model_family = find_family_for_model("gpt-5.1").expect("gpt-5.1 is a valid model");
|
||||
});
|
||||
let test = builder.build(&server).await.expect("build test codex");
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#![allow(clippy::unwrap_used, clippy::expect_used)]
|
||||
|
||||
use anyhow::Result;
|
||||
use codex_core::model_family::find_family_for_model;
|
||||
use codex_core::protocol::AskForApproval;
|
||||
use codex_core::protocol::EventMsg;
|
||||
use codex_core::protocol::Op;
|
||||
@@ -21,6 +22,11 @@ use std::fs;
|
||||
|
||||
#[tokio::test]
|
||||
async fn execpolicy_blocks_shell_invocation() -> Result<()> {
|
||||
// TODO execpolicy doesn't parse powershell commands yet
|
||||
if cfg!(windows) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut builder = test_codex().with_config(|config| {
|
||||
let policy_path = config.codex_home.join("policy").join("policy.codexpolicy");
|
||||
fs::create_dir_all(
|
||||
@@ -34,13 +40,16 @@ async fn execpolicy_blocks_shell_invocation() -> Result<()> {
|
||||
r#"prefix_rule(pattern=["echo"], decision="forbidden")"#,
|
||||
)
|
||||
.expect("write policy file");
|
||||
config.model = "gpt-5.1".to_string();
|
||||
config.model_family =
|
||||
find_family_for_model("gpt-5.1").expect("gpt-5.1 should have a model family");
|
||||
});
|
||||
let server = start_mock_server().await;
|
||||
let test = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-forbidden";
|
||||
let args = json!({
|
||||
"command": ["echo", "blocked"],
|
||||
"command": "echo blocked",
|
||||
"timeout_ms": 1_000,
|
||||
});
|
||||
|
||||
@@ -48,7 +57,7 @@ async fn execpolicy_blocks_shell_invocation() -> Result<()> {
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
|
||||
@@ -146,10 +146,11 @@ async fn non_parallel_tools_run_serially() -> anyhow::Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let test = test_codex().build(&server).await?;
|
||||
let mut builder = test_codex().with_model("gpt-5.1");
|
||||
let test = builder.build(&server).await?;
|
||||
|
||||
let shell_args = json!({
|
||||
"command": ["/bin/sh", "-c", "sleep 0.3"],
|
||||
"command": "sleep 0.3",
|
||||
"timeout_ms": 1_000,
|
||||
});
|
||||
let args_one = serde_json::to_string(&shell_args)?;
|
||||
@@ -157,8 +158,8 @@ async fn non_parallel_tools_run_serially() -> anyhow::Result<()> {
|
||||
|
||||
let first_response = sse(vec![
|
||||
json!({"type": "response.created", "response": {"id": "resp-1"}}),
|
||||
ev_function_call("call-1", "shell", &args_one),
|
||||
ev_function_call("call-2", "shell", &args_two),
|
||||
ev_function_call("call-1", "shell_command", &args_one),
|
||||
ev_function_call("call-2", "shell_command", &args_two),
|
||||
ev_completed("resp-1"),
|
||||
]);
|
||||
let second_response = sse(vec![
|
||||
@@ -167,7 +168,7 @@ async fn non_parallel_tools_run_serially() -> anyhow::Result<()> {
|
||||
]);
|
||||
mount_sse_sequence(&server, vec![first_response, second_response]).await;
|
||||
|
||||
let duration = run_turn_and_measure(&test, "run shell twice").await?;
|
||||
let duration = run_turn_and_measure(&test, "run shell_command twice").await?;
|
||||
assert_serial_duration(duration);
|
||||
|
||||
Ok(())
|
||||
@@ -185,14 +186,14 @@ async fn mixed_tools_fall_back_to_serial() -> anyhow::Result<()> {
|
||||
})
|
||||
.to_string();
|
||||
let shell_args = serde_json::to_string(&json!({
|
||||
"command": ["/bin/sh", "-c", "sleep 0.3"],
|
||||
"command": "sleep 0.3",
|
||||
"timeout_ms": 1_000,
|
||||
}))?;
|
||||
|
||||
let first_response = sse(vec![
|
||||
json!({"type": "response.created", "response": {"id": "resp-1"}}),
|
||||
ev_function_call("call-1", "test_sync_tool", &sync_args),
|
||||
ev_function_call("call-2", "shell", &shell_args),
|
||||
ev_function_call("call-2", "shell_command", &shell_args),
|
||||
ev_completed("resp-1"),
|
||||
]);
|
||||
let second_response = sse(vec![
|
||||
@@ -215,7 +216,7 @@ async fn tool_results_grouped() -> anyhow::Result<()> {
|
||||
let test = build_codex_with_test_tool(&server).await?;
|
||||
|
||||
let shell_args = serde_json::to_string(&json!({
|
||||
"command": ["/bin/sh", "-c", "echo 'shell output'"],
|
||||
"command": "echo 'shell output'",
|
||||
"timeout_ms": 1_000,
|
||||
}))?;
|
||||
|
||||
@@ -223,9 +224,9 @@ async fn tool_results_grouped() -> anyhow::Result<()> {
|
||||
&server,
|
||||
sse(vec![
|
||||
json!({"type": "response.created", "response": {"id": "resp-1"}}),
|
||||
ev_function_call("call-1", "shell", &shell_args),
|
||||
ev_function_call("call-2", "shell", &shell_args),
|
||||
ev_function_call("call-3", "shell", &shell_args),
|
||||
ev_function_call("call-1", "shell_command", &shell_args),
|
||||
ev_function_call("call-2", "shell_command", &shell_args),
|
||||
ev_function_call("call-3", "shell_command", &shell_args),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
|
||||
@@ -98,7 +98,7 @@ async fn truncate_function_error_trims_respond_to_model() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Verifies that a standard tool call (shell) exceeding the model formatting
|
||||
// Verifies that a standard tool call (shell_command) exceeding the model formatting
|
||||
// limits is truncated before being sent back to the model.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
|
||||
@@ -106,7 +106,7 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
||||
// Use a model that exposes the generic shell tool.
|
||||
// Use a model that exposes the shell_command tool.
|
||||
let mut builder = test_codex().with_model("gpt-5.1").with_config(|config| {
|
||||
config.tool_output_token_limit = Some(100_000);
|
||||
});
|
||||
@@ -114,28 +114,22 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
|
||||
let fixture = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-too-large";
|
||||
let args = if cfg!(windows) {
|
||||
serde_json::json!({
|
||||
"command": [
|
||||
"powershell",
|
||||
"-Command",
|
||||
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
|
||||
],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
let command = if cfg!(windows) {
|
||||
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 100000"],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
"seq 1 100000"
|
||||
};
|
||||
let args = serde_json::json!({
|
||||
"command": command,
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
|
||||
// First response: model tells us to run the tool; second: complete the turn.
|
||||
mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
responses::ev_response_created("resp-1"),
|
||||
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
responses::ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -167,7 +161,10 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
|
||||
"expected truncated shell output to be plain text"
|
||||
);
|
||||
|
||||
assert_eq!(output.len(), 400097, "we should be almost 100k tokens");
|
||||
assert!(
|
||||
(400000..=401000).contains(&output.len()),
|
||||
"we should be almost 100k tokens"
|
||||
);
|
||||
|
||||
assert!(
|
||||
!output.contains("tokens truncated"),
|
||||
@@ -177,7 +174,7 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Verifies that a standard tool call (shell) exceeding the model formatting
|
||||
// Verifies that a standard tool call (shell_command) exceeding the model formatting
|
||||
// limits is truncated before being sent back to the model.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
|
||||
@@ -185,34 +182,28 @@ async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
||||
// Use a model that exposes the generic shell tool.
|
||||
// Use a model that exposes the shell_command tool.
|
||||
let mut builder = test_codex().with_model("gpt-5.1");
|
||||
|
||||
let fixture = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-too-large";
|
||||
let args = if cfg!(windows) {
|
||||
serde_json::json!({
|
||||
"command": [
|
||||
"powershell",
|
||||
"-Command",
|
||||
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
|
||||
],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
let command = if cfg!(windows) {
|
||||
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 100000"],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
"seq 1 100000"
|
||||
};
|
||||
let args = serde_json::json!({
|
||||
"command": command,
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
|
||||
// First response: model tells us to run the tool; second: complete the turn.
|
||||
mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
responses::ev_response_created("resp-1"),
|
||||
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
responses::ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -250,14 +241,14 @@ async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
|
||||
|
||||
let len = output.len();
|
||||
assert!(
|
||||
(9_900..=10_000).contains(&len),
|
||||
(9_900..=10_100).contains(&len),
|
||||
"expected ~10k chars after truncation, got {len}"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Verifies that a standard tool call (shell) exceeding the model formatting
|
||||
// Verifies that a standard tool call (shell_command) exceeding the model formatting
|
||||
// limits is truncated before being sent back to the model.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
|
||||
@@ -265,7 +256,7 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
||||
// Use a model that exposes the generic shell tool.
|
||||
// Use a model that exposes the shell_command tool.
|
||||
let mut builder = test_codex().with_config(|config| {
|
||||
config.model = "gpt-5.1-codex".to_string();
|
||||
config.model_family =
|
||||
@@ -274,28 +265,22 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
|
||||
let fixture = builder.build(&server).await?;
|
||||
|
||||
let call_id = "shell-too-large";
|
||||
let args = if cfg!(windows) {
|
||||
serde_json::json!({
|
||||
"command": [
|
||||
"powershell",
|
||||
"-Command",
|
||||
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
|
||||
],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
let command = if cfg!(windows) {
|
||||
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 100000"],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
"seq 1 100000"
|
||||
};
|
||||
let args = serde_json::json!({
|
||||
"command": command,
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
|
||||
// First response: model tells us to run the tool; second: complete the turn.
|
||||
mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
responses::ev_response_created("resp-1"),
|
||||
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
responses::ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -345,7 +330,7 @@ $"#;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Ensures shell tool outputs that exceed the line limit are truncated only once.
|
||||
// Ensures shell_command outputs that exceed the line limit are truncated only once.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn tool_call_output_truncated_only_once() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
@@ -359,27 +344,21 @@ async fn tool_call_output_truncated_only_once() -> Result<()> {
|
||||
});
|
||||
let fixture = builder.build(&server).await?;
|
||||
let call_id = "shell-single-truncation";
|
||||
let args = if cfg!(windows) {
|
||||
serde_json::json!({
|
||||
"command": [
|
||||
"powershell",
|
||||
"-Command",
|
||||
"for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
|
||||
],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
let command = if cfg!(windows) {
|
||||
"for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 10000"],
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
"seq 1 10000"
|
||||
};
|
||||
let args = serde_json::json!({
|
||||
"command": command,
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
|
||||
mount_sse_once(
|
||||
&server,
|
||||
sse(vec![
|
||||
responses::ev_response_created("resp-1"),
|
||||
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
responses::ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -619,7 +598,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
|
||||
|
||||
let call_id = "shell-token-marker";
|
||||
let args = json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 150"],
|
||||
"command": "seq 1 150",
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
|
||||
@@ -627,7 +606,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -650,7 +629,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
|
||||
.function_call_output_text(call_id)
|
||||
.context("shell output present")?;
|
||||
|
||||
let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5\\n.*?…\d+ tokens truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
|
||||
let pattern = r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nTotal output lines: 150\nOutput:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19.*tokens truncated.*129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n$";
|
||||
|
||||
assert_regex_match(pattern, &output);
|
||||
|
||||
@@ -672,7 +651,7 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
|
||||
|
||||
let call_id = "shell-byte-marker";
|
||||
let args = json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 150"],
|
||||
"command": "seq 1 150",
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
|
||||
@@ -680,7 +659,7 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -703,16 +682,16 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
|
||||
.function_call_output_text(call_id)
|
||||
.context("shell output present")?;
|
||||
|
||||
let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5.*?…\d+ chars truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
|
||||
let pattern = r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nTotal output lines: 150\nOutput:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19.*chars truncated.*129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n$";
|
||||
|
||||
assert_regex_match(pattern, &output);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Shell tool output should remain intact when the config opts into a large token budget.
|
||||
// shell_command output should remain intact when the config opts into a large token budget.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
|
||||
async fn shell_command_output_not_truncated_with_custom_limit() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
@@ -726,7 +705,7 @@ async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
|
||||
|
||||
let call_id = "shell-no-trunc";
|
||||
let args = json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 1000"],
|
||||
"command": "seq 1 1000",
|
||||
"timeout_ms": 5_000,
|
||||
});
|
||||
let expected_body: String = (1..=1000).map(|i| format!("{i}\n")).collect();
|
||||
@@ -735,7 +714,7 @@ async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
|
||||
@@ -279,23 +279,19 @@ async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
|
||||
config.tool_output_token_limit = Some(100);
|
||||
config.model = "gpt-5.1-codex".to_string();
|
||||
config.model_family =
|
||||
find_family_for_model("gpt-5-codex").expect("gpt-5-codex is a model family");
|
||||
find_family_for_model("gpt-5.1-codex").expect("gpt-5.1-codex is a model family");
|
||||
});
|
||||
let fixture = builder.build(&server).await?;
|
||||
|
||||
let call_id = "user-shell-double-truncation";
|
||||
let args = if cfg!(windows) {
|
||||
serde_json::json!({
|
||||
"command": [
|
||||
"powershell",
|
||||
"-Command",
|
||||
"for ($i=1; $i -le 2000; $i++) { Write-Output $i }"
|
||||
],
|
||||
"command": "for ($i=1; $i -le 2000; $i++) { Write-Output $i }",
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"command": ["/bin/sh", "-c", "seq 1 2000"],
|
||||
"command": "seq 1 2000",
|
||||
"timeout_ms": 5_000,
|
||||
})
|
||||
};
|
||||
@@ -304,7 +300,7 @@ async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
|
||||
&server,
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
|
||||
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
)
|
||||
@@ -319,19 +315,22 @@ async fn user_shell_command_is_truncated_only_once() -> anyhow::Result<()> {
|
||||
.await;
|
||||
|
||||
fixture
|
||||
.submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess)
|
||||
.submit_turn_with_policy(
|
||||
"trigger big shell_command output",
|
||||
SandboxPolicy::DangerFullAccess,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let output = mock2
|
||||
.single_request()
|
||||
.function_call_output_text(call_id)
|
||||
.context("function_call_output present for shell call")?;
|
||||
.context("function_call_output present for shell_command call")?;
|
||||
|
||||
let truncation_headers = output.matches("Total output lines:").count();
|
||||
|
||||
assert_eq!(
|
||||
truncation_headers, 1,
|
||||
"shell output should carry only one truncation header: {output}"
|
||||
"shell_command output should carry only one truncation header: {output}"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
Reference in New Issue
Block a user