Merge remote-tracking branch 'origin/rhan/surface-updates' into rhan/emittance

This commit is contained in:
Roy Han
2026-03-12 11:08:32 -07:00
437 changed files with 65096 additions and 43445 deletions

View File

@@ -1321,7 +1321,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
expectation: Expectation::FileNotCreated {
target: TargetPath::Workspace("ro_never.txt"),
message_contains: if cfg!(target_os = "linux") {
&["Permission denied"]
&["Permission denied|Read-only file system"]
} else {
&[
"Permission denied|Operation not permitted|operation not permitted|\
@@ -1468,7 +1468,7 @@ fn scenarios() -> Vec<ScenarioSpec> {
expectation: Expectation::FileNotCreated {
target: TargetPath::OutsideWorkspace("ww_never.txt"),
message_contains: if cfg!(target_os = "linux") {
&["Permission denied"]
&["Permission denied|Read-only file system"]
} else {
&[
"Permission denied|Operation not permitted|operation not permitted|\
@@ -2290,20 +2290,16 @@ allow_local_binding = true
test.config.permissions.network.is_some(),
"expected managed network proxy config to be present"
);
let runtime_proxy = test
.session_configured
test.session_configured
.network_proxy
.as_ref()
.expect("expected runtime managed network proxy addresses");
let proxy_addr = runtime_proxy.http_addr.as_str();
let call_id_first = "allow-network-first";
// Use the same urllib-based pattern as the other network integration tests,
// but point it at the runtime proxy directly so the blocked host reliably
// produces a network approval request without relying on curl.
let fetch_command = format!(
"python3 -c \"import urllib.request; proxy = urllib.request.ProxyHandler({{'http': 'http://{proxy_addr}'}}); opener = urllib.request.build_opener(proxy); print('OK:' + opener.open('http://codex-network-test.invalid', timeout=30).read().decode(errors='replace'))\""
);
// Use urllib without overriding proxy settings so managed-network sessions
// continue to exercise the env-based proxy routing path under bubblewrap.
let fetch_command = r#"python3 -c "import urllib.request; opener = urllib.request.build_opener(urllib.request.ProxyHandler()); print('OK:' + opener.open('http://codex-network-test.invalid', timeout=30).read().decode(errors='replace'))""#
.to_string();
let first_event = shell_event(
call_id_first,
&fetch_command,

View File

@@ -519,6 +519,7 @@ async fn resume_replays_image_tool_outputs_with_detail() {
item: RolloutItem::ResponseItem(ResponseItem::FunctionCall {
id: None,
name: "view_image".to_string(),
namespace: None,
arguments: "{\"path\":\"/tmp/example.webp\"}".to_string(),
call_id: function_call_id.to_string(),
}),
@@ -1883,6 +1884,7 @@ async fn azure_responses_request_includes_store_and_reasoning_ids() {
prompt.input.push(ResponseItem::FunctionCall {
id: Some("function-id".into()),
name: "do_thing".into(),
namespace: None,
arguments: "{}".into(),
call_id: "function-call-id".into(),
});

View File

@@ -21,6 +21,7 @@ use pretty_assertions::assert_eq;
use serde_json::Value;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::time::Duration;
use wiremock::MockServer;
@@ -32,6 +33,16 @@ fn custom_tool_output_items(req: &ResponsesRequest, call_id: &str) -> Vec<Value>
.clone()
}
/// Returns the content items of a function tool output for `call_id`,
/// normalizing a plain-string output into a single `input_text` item so
/// callers can treat both serialized forms uniformly.
fn function_tool_output_items(req: &ResponsesRequest, call_id: &str) -> Vec<Value> {
    let output = req.function_call_output(call_id);
    if let Some(Value::Array(items)) = output.get("output") {
        return items.clone();
    }
    if let Some(Value::String(text)) = output.get("output") {
        return vec![serde_json::json!({ "type": "input_text", "text": text })];
    }
    panic!("function tool output should be serialized as text or content items")
}
fn text_item(items: &[Value], index: usize) -> &str {
items[index]
.get("text")
@@ -39,6 +50,23 @@ fn text_item(items: &[Value], index: usize) -> &str {
.expect("content item should be input_text")
}
/// Parses the session ID out of a "Script running with session ID N" header
/// line and returns it as an `i32`. Panics when the header prefix is missing
/// or the ID is not a valid integer.
fn extract_running_session_id(text: &str) -> i32 {
    let after_prefix = text
        .strip_prefix("Script running with session ID ")
        .expect("running header should contain a session ID");
    // `split('\n')` always yields at least one element, so this cannot fail.
    let first_line = after_prefix
        .split('\n')
        .next()
        .expect("running header should contain a session ID");
    first_line
        .parse()
        .expect("session ID should parse as i32")
}
/// Builds a JavaScript source snippet that busy-waits (via repeated
/// `exec_command` shell probes) until `path` exists on disk.
///
/// The path is shell-quoted with `shlex` so paths containing spaces or
/// metacharacters remain a single argument inside the probe command.
fn wait_for_file_source(path: &Path) -> Result<String> {
    let quoted = shlex::try_join([path.to_string_lossy().as_ref()])?;
    // The probe prints "ready" only once the file exists; the JS loop keeps
    // re-running it until that output is observed.
    let probe = format!("if [ -f {quoted} ]; then printf ready; fi");
    let source = format!(
        r#"while ((await exec_command({{ cmd: {probe:?} }})).output !== "ready") {{
}}"#
    );
    Ok(source)
}
fn custom_tool_output_body_and_success(
req: &ResponsesRequest,
call_id: &str,
@@ -289,6 +317,922 @@ Error:\ boom\n
Ok(())
}
// Verifies that a code-mode `exec` script can yield mid-run (via
// `set_yield_time`) and then be resumed twice with the `exec_wait` tool:
// each wait reports the same session ID plus the output produced since the
// last yield, and the final wait reports "Script completed".
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_yield_and_resume_with_exec_wait() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
// Gate files block the script between phases until the test writes them.
let phase_2_gate = test.workspace_path("code-mode-phase-2.ready");
let phase_3_gate = test.workspace_path("code-mode-phase-3.ready");
let phase_2_wait = wait_for_file_source(&phase_2_gate)?;
let phase_3_wait = wait_for_file_source(&phase_3_gate)?;
let code = format!(
r#"
import {{ output_text, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("phase 1");
set_yield_time(10);
{phase_2_wait}
output_text("phase 2");
{phase_3_wait}
output_text("phase 3");
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &code),
ev_completed("resp-1"),
]),
)
.await;
let first_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "waiting"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("start the long exec").await?;
// First turn: the script should have yielded after "phase 1" with a
// running-session header as item 0 and the phase output as item 1.
let first_request = first_completion.single_request();
let first_items = custom_tool_output_items(&first_request, "call-1");
assert_eq!(first_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script running with session ID \d+\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&first_items, 0),
);
assert_eq!(text_item(&first_items, 1), "phase 1");
let session_id = extract_running_session_id(text_item(&first_items, 0));
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-3"),
responses::ev_function_call(
"call-2",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_id,
"yield_time_ms": 1_000,
}))?,
),
ev_completed("resp-3"),
]),
)
.await;
let second_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-2", "still waiting"),
ev_completed("resp-4"),
]),
)
.await;
// Unblock phase 2 before issuing the wait turn.
fs::write(&phase_2_gate, "ready")?;
test.submit_turn("wait again").await?;
let second_request = second_completion.single_request();
let second_items = function_tool_output_items(&second_request, "call-2");
assert_eq!(second_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script running with session ID \d+\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&second_items, 0),
);
// The resumed wait must refer to the same exec session as the first yield.
assert_eq!(
extract_running_session_id(text_item(&second_items, 0)),
session_id
);
assert_eq!(text_item(&second_items, 1), "phase 2");
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-5"),
responses::ev_function_call(
"call-3",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_id,
"yield_time_ms": 1_000,
}))?,
),
ev_completed("resp-5"),
]),
)
.await;
let third_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-3", "done"),
ev_completed("resp-6"),
]),
)
.await;
// Unblock the final phase; this wait should observe script completion.
fs::write(&phase_3_gate, "ready")?;
test.submit_turn("wait for completion").await?;
let third_request = third_completion.single_request();
let third_items = function_tool_output_items(&third_request, "call-3");
assert_eq!(third_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&third_items, 0),
);
assert_eq!(text_item(&third_items, 1), "phase 3");
Ok(())
}
// Verifies that two independently started code-mode `exec` scripts yield into
// distinct sessions (distinct session IDs) and that each can be waited on
// separately via `exec_wait`, completing in the order the test unblocks them.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_run_multiple_yielded_sessions() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
// Per-session gate files keep each script blocked until explicitly released.
let session_a_gate = test.workspace_path("code-mode-session-a.ready");
let session_b_gate = test.workspace_path("code-mode-session-b.ready");
let session_a_wait = wait_for_file_source(&session_a_gate)?;
let session_b_wait = wait_for_file_source(&session_b_gate)?;
let session_a_code = format!(
r#"
import {{ output_text, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("session a start");
set_yield_time(10);
{session_a_wait}
output_text("session a done");
"#
);
let session_b_code = format!(
r#"
import {{ output_text, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("session b start");
set_yield_time(10);
{session_b_wait}
output_text("session b done");
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &session_a_code),
ev_completed("resp-1"),
]),
)
.await;
let first_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "session a waiting"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("start session a").await?;
let first_request = first_completion.single_request();
let first_items = custom_tool_output_items(&first_request, "call-1");
assert_eq!(first_items.len(), 2);
let session_a_id = extract_running_session_id(text_item(&first_items, 0));
assert_eq!(text_item(&first_items, 1), "session a start");
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-3"),
ev_custom_tool_call("call-2", "exec", &session_b_code),
ev_completed("resp-3"),
]),
)
.await;
let second_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-2", "session b waiting"),
ev_completed("resp-4"),
]),
)
.await;
test.submit_turn("start session b").await?;
let second_request = second_completion.single_request();
let second_items = custom_tool_output_items(&second_request, "call-2");
assert_eq!(second_items.len(), 2);
let session_b_id = extract_running_session_id(text_item(&second_items, 0));
assert_eq!(text_item(&second_items, 1), "session b start");
// The two yielded scripts must not share a session.
assert_ne!(session_a_id, session_b_id);
// Release session A and wait for it to finish.
fs::write(&session_a_gate, "ready")?;
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-5"),
responses::ev_function_call(
"call-3",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_a_id,
"yield_time_ms": 1_000,
}))?,
),
ev_completed("resp-5"),
]),
)
.await;
let third_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-3", "session a done"),
ev_completed("resp-6"),
]),
)
.await;
test.submit_turn("wait session a").await?;
let third_request = third_completion.single_request();
let third_items = function_tool_output_items(&third_request, "call-3");
assert_eq!(third_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&third_items, 0),
);
assert_eq!(text_item(&third_items, 1), "session a done");
// Release session B and wait for it to finish as well.
fs::write(&session_b_gate, "ready")?;
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-7"),
responses::ev_function_call(
"call-4",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_b_id,
"yield_time_ms": 1_000,
}))?,
),
ev_completed("resp-7"),
]),
)
.await;
let fourth_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-4", "session b done"),
ev_completed("resp-8"),
]),
)
.await;
test.submit_turn("wait session b").await?;
let fourth_request = fourth_completion.single_request();
let fourth_items = function_tool_output_items(&fourth_request, "call-4");
assert_eq!(fourth_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&fourth_items, 0),
);
assert_eq!(text_item(&fourth_items, 1), "session b done");
Ok(())
}
// Verifies that `exec_wait` with `terminate: true` kills a yielded session
// (reporting "Script terminated") and that a fresh `exec` call afterwards
// still runs to completion normally.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_wait_can_terminate_and_continue() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
// The gate file is never written, so the script stays blocked in phase 2
// until it is terminated.
let termination_gate = test.workspace_path("code-mode-terminate.ready");
let termination_wait = wait_for_file_source(&termination_gate)?;
let code = format!(
r#"
import {{ output_text, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("phase 1");
set_yield_time(10);
{termination_wait}
output_text("phase 2");
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &code),
ev_completed("resp-1"),
]),
)
.await;
let first_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "waiting"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("start the long exec").await?;
let first_request = first_completion.single_request();
let first_items = custom_tool_output_items(&first_request, "call-1");
assert_eq!(first_items.len(), 2);
let session_id = extract_running_session_id(text_item(&first_items, 0));
assert_eq!(text_item(&first_items, 1), "phase 1");
// Second turn: terminate the still-blocked session.
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-3"),
responses::ev_function_call(
"call-2",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_id,
"terminate": true,
}))?,
),
ev_completed("resp-3"),
]),
)
.await;
let second_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-2", "terminated"),
ev_completed("resp-4"),
]),
)
.await;
test.submit_turn("terminate it").await?;
let second_request = second_completion.single_request();
let second_items = function_tool_output_items(&second_request, "call-2");
// Only the termination header — "phase 2" was never produced.
assert_eq!(second_items.len(), 1);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script terminated\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&second_items, 0),
);
// Third turn: a brand-new exec should be unaffected by the termination.
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-5"),
ev_custom_tool_call(
"call-3",
"exec",
r#"
import { output_text } from "@openai/code_mode";
output_text("after terminate");
"#,
),
ev_completed("resp-5"),
]),
)
.await;
let third_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-3", "done"),
ev_completed("resp-6"),
]),
)
.await;
test.submit_turn("run another exec").await?;
let third_request = third_completion.single_request();
let third_items = custom_tool_output_items(&third_request, "call-3");
assert_eq!(third_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&third_items, 0),
);
assert_eq!(text_item(&third_items, 1), "after terminate");
Ok(())
}
// Verifies that `exec_wait` on a session ID that was never created reports a
// failure: the tool output is marked unsuccessful and carries a
// "Script failed" header plus an explanatory "session not found" error.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_wait_returns_error_for_unknown_session() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
responses::ev_function_call(
"call-1",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": 999_999,
"yield_time_ms": 1_000,
}))?,
),
ev_completed("resp-1"),
]),
)
.await;
let completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "done"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("wait on an unknown exec session").await?;
let request = completion.single_request();
// The output must not be flagged as a success.
let (_, success) = request
.function_call_output_content_and_success("call-1")
.expect("function tool output should be present");
assert_ne!(success, Some(true));
let items = function_tool_output_items(&request, "call-1");
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script failed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert_eq!(
text_item(&items, 1),
"Script error:\nexec session 999999 not found"
);
Ok(())
}
// Verifies that terminating a session which already finished in the
// background is handled gracefully: while the agent waits on session B,
// session A runs to completion (proven via a marker file it writes), and a
// subsequent `exec_wait { terminate: true }` on A reports either a plain
// termination header or the completed output — both are accepted because the
// terminate request races with background completion.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_wait_terminate_returns_completed_session_if_it_finished_in_background()
-> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
let session_a_gate = test.workspace_path("code-mode-session-a-finished.ready");
let session_b_gate = test.workspace_path("code-mode-session-b-blocked.ready");
// Written by session A's final command — signals A completed in background.
let session_a_done_marker = test.workspace_path("code-mode-session-a-done.txt");
let session_a_wait = wait_for_file_source(&session_a_gate)?;
let session_b_wait = wait_for_file_source(&session_b_gate)?;
let session_a_done_marker_quoted =
shlex::try_join([session_a_done_marker.to_string_lossy().as_ref()])?;
let session_a_done_command = format!("printf done > {session_a_done_marker_quoted}");
let session_a_code = format!(
r#"
import {{ output_text, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("session a start");
set_yield_time(10);
{session_a_wait}
output_text("session a done");
await exec_command({{ cmd: {session_a_done_command:?} }});
"#
);
let session_b_code = format!(
r#"
import {{ output_text, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("session b start");
set_yield_time(10);
{session_b_wait}
output_text("session b done");
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &session_a_code),
ev_completed("resp-1"),
]),
)
.await;
let first_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "session a waiting"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("start session a").await?;
let first_request = first_completion.single_request();
let first_items = custom_tool_output_items(&first_request, "call-1");
assert_eq!(first_items.len(), 2);
let session_a_id = extract_running_session_id(text_item(&first_items, 0));
assert_eq!(text_item(&first_items, 1), "session a start");
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-3"),
ev_custom_tool_call("call-2", "exec", &session_b_code),
ev_completed("resp-3"),
]),
)
.await;
let second_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-2", "session b waiting"),
ev_completed("resp-4"),
]),
)
.await;
test.submit_turn("start session b").await?;
let second_request = second_completion.single_request();
let second_items = custom_tool_output_items(&second_request, "call-2");
assert_eq!(second_items.len(), 2);
let session_b_id = extract_running_session_id(text_item(&second_items, 0));
assert_eq!(text_item(&second_items, 1), "session b start");
// Let session A finish in the background while we wait on session B.
fs::write(&session_a_gate, "ready")?;
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-5"),
responses::ev_function_call(
"call-3",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_b_id,
"yield_time_ms": 1_000,
}))?,
),
ev_completed("resp-5"),
]),
)
.await;
let third_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-3", "session b still waiting"),
ev_completed("resp-6"),
]),
)
.await;
test.submit_turn("wait session b").await?;
let third_request = third_completion.single_request();
let third_items = function_tool_output_items(&third_request, "call-3");
// Session B is still blocked, so only the running header comes back.
assert_eq!(third_items.len(), 1);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script running with session ID \d+\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&third_items, 0),
);
assert_eq!(
extract_running_session_id(text_item(&third_items, 0)),
session_b_id
);
// Poll (up to ~5s) until session A's marker file proves it finished.
for _ in 0..100 {
if session_a_done_marker.exists() {
break;
}
tokio::time::sleep(Duration::from_millis(50)).await;
}
assert!(session_a_done_marker.exists());
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-7"),
responses::ev_function_call(
"call-4",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_a_id,
"terminate": true,
}))?,
),
ev_completed("resp-7"),
]),
)
.await;
let fourth_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-4", "session a already done"),
ev_completed("resp-8"),
]),
)
.await;
test.submit_turn("terminate session a").await?;
let fourth_request = fourth_completion.single_request();
let fourth_items = function_tool_output_items(&fourth_request, "call-4");
// Both outcomes of the terminate/completion race are acceptable here.
match fourth_items.len() {
1 => {
assert_regex_match(
concat!(
r"(?s)\A",
r"Script terminated\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&fourth_items, 0),
);
}
2 => {
assert_regex_match(
concat!(
r"(?s)\A",
r"Script (?:completed|terminated)\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&fourth_items, 0),
);
assert_eq!(text_item(&fourth_items, 1), "session a done");
}
other => panic!("unexpected number of content items: {other}"),
}
Ok(())
}
// Verifies that a script backgrounded via `background()` keeps executing on
// later turns even when the agent never calls `exec_wait`: a follow-up
// `exec_command` turn observes the file the backgrounded script writes.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_background_keeps_running_on_later_turn_without_exec_wait() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
// Written by the backgrounded script; the second turn polls for it.
let resumed_file = test.workspace_path("code-mode-yield-resumed.txt");
let resumed_file_quoted = shlex::try_join([resumed_file.to_string_lossy().as_ref()])?;
let write_file_command = format!("printf resumed > {resumed_file_quoted}");
let wait_for_file_command =
format!("while [ ! -f {resumed_file_quoted} ]; do sleep 0.01; done; printf ready");
let code = format!(
r#"
import {{ background, output_text }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("before yield");
background();
await exec_command({{ cmd: {write_file_command:?} }});
output_text("after yield");
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &code),
ev_completed("resp-1"),
]),
)
.await;
let first_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "exec yielded"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("start yielded exec").await?;
let first_request = first_completion.single_request();
let first_items = custom_tool_output_items(&first_request, "call-1");
assert_eq!(first_items.len(), 2);
// Only the pre-background output is reported in the first turn.
assert_regex_match(
concat!(
r"(?s)\A",
r"Script running with session ID \d+\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&first_items, 0),
);
assert_eq!(text_item(&first_items, 1), "before yield");
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-3"),
responses::ev_function_call(
"call-2",
"exec_command",
&serde_json::to_string(&serde_json::json!({
"cmd": wait_for_file_command,
}))?,
),
ev_completed("resp-3"),
]),
)
.await;
let second_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-2", "file appeared"),
ev_completed("resp-4"),
]),
)
.await;
test.submit_turn("wait for resumed file").await?;
let second_request = second_completion.single_request();
// The polling command ends with "ready" only once the background script
// has written the file.
assert!(
second_request
.function_call_output_text("call-2")
.is_some_and(|output| output.ends_with("ready"))
);
assert_eq!(fs::read_to_string(&resumed_file)?, "resumed");
Ok(())
}
// Verifies that `exec_wait` honors its own `max_tokens` argument rather than
// the budget set inside the script via `set_max_output_tokens_per_exec_call`:
// the script emits a long line, and the wait's small budget (6) forces a
// truncation marker in the returned output.
#[cfg_attr(windows, ignore = "no exec_command on Windows")]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_exec_wait_uses_its_own_max_tokens_budget() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mut builder = test_codex().with_config(move |config| {
let _ = config.features.enable(Feature::CodeMode);
});
let test = builder.build(&server).await?;
let completion_gate = test.workspace_path("code-mode-max-tokens.ready");
let completion_wait = wait_for_file_source(&completion_gate)?;
let code = format!(
r#"
import {{ output_text, set_max_output_tokens_per_exec_call, set_yield_time }} from "@openai/code_mode";
import {{ exec_command }} from "tools.js";
output_text("phase 1");
set_max_output_tokens_per_exec_call(100);
set_yield_time(10);
{completion_wait}
output_text("token one token two token three token four token five token six token seven");
"#
);
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-1"),
ev_custom_tool_call("call-1", "exec", &code),
ev_completed("resp-1"),
]),
)
.await;
let first_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-1", "waiting"),
ev_completed("resp-2"),
]),
)
.await;
test.submit_turn("start the long exec").await?;
let first_request = first_completion.single_request();
let first_items = custom_tool_output_items(&first_request, "call-1");
assert_eq!(first_items.len(), 2);
assert_eq!(text_item(&first_items, 1), "phase 1");
let session_id = extract_running_session_id(text_item(&first_items, 0));
// Unblock the script, then wait with a deliberately tiny token budget.
fs::write(&completion_gate, "ready")?;
responses::mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-3"),
responses::ev_function_call(
"call-2",
"exec_wait",
&serde_json::to_string(&serde_json::json!({
"session_id": session_id,
"yield_time_ms": 1_000,
"max_tokens": 6,
}))?,
),
ev_completed("resp-3"),
]),
)
.await;
let second_completion = responses::mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-2", "done"),
ev_completed("resp-4"),
]),
)
.await;
test.submit_turn("wait for completion").await?;
let second_request = second_completion.single_request();
let second_items = function_tool_output_items(&second_request, "call-2");
assert_eq!(second_items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&second_items, 0),
);
// The long output line must carry a "tokens truncated" marker, proving the
// wait-level budget of 6 was applied instead of the script's 100.
let expected_pattern = r#"(?sx)
\A
Total\ output\ lines:\ 1\n
\n
.*…\d+\ tokens\ truncated….*
\z
"#;
assert_regex_match(expected_pattern, text_item(&second_items, 1));
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_output_serialized_text_via_openai_code_mode_module() -> Result<()> {
skip_if_no_network!(Ok(()));

View File

@@ -687,7 +687,7 @@ async fn resume_conversation(
let auth_manager = codex_core::test_support::auth_manager_from_auth(
codex_core::CodexAuth::from_api_key("dummy"),
);
Box::pin(manager.resume_thread_from_rollout(config.clone(), path, auth_manager))
Box::pin(manager.resume_thread_from_rollout(config.clone(), path, auth_manager, None))
.await
.expect("resume conversation")
.thread
@@ -700,7 +700,7 @@ async fn fork_thread(
path: std::path::PathBuf,
nth_user_message: usize,
) -> Arc<CodexThread> {
Box::pin(manager.fork_thread(nth_user_message, config.clone(), path, false))
Box::pin(manager.fork_thread(nth_user_message, config.clone(), path, false, None))
.await
.expect("fork conversation")
.thread

View File

@@ -110,7 +110,7 @@ async fn fork_thread_twice_drops_to_first_message() {
thread: codex_fork1,
..
} = thread_manager
.fork_thread(1, config_for_fork.clone(), base_path.clone(), false)
.fork_thread(1, config_for_fork.clone(), base_path.clone(), false, None)
.await
.expect("fork 1");
@@ -129,7 +129,7 @@ async fn fork_thread_twice_drops_to_first_message() {
thread: codex_fork2,
..
} = thread_manager
.fork_thread(0, config_for_fork.clone(), fork1_path.clone(), false)
.fork_thread(0, config_for_fork.clone(), fork1_path.clone(), false, None)
.await
.expect("fork 2");

View File

@@ -659,6 +659,34 @@ async fn js_repl_does_not_expose_process_global() -> Result<()> {
Ok(())
}
// Verifies that the js_repl environment exposes the `codex` path helpers:
// `codex.cwd` is a non-empty string and `codex.homeDir` is either null or a
// string. The checks run inside the REPL and are asserted via its stdout.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn js_repl_exposes_codex_path_helpers() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let mock = run_js_repl_turn(
&server,
"check codex path helpers",
&[(
"call-1",
"console.log(`cwd:${typeof codex.cwd}:${codex.cwd.length > 0}`); console.log(`home:${codex.homeDir === null || typeof codex.homeDir === \"string\"}`);",
)],
)
.await?;
let req = mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
// The REPL call itself must not have failed before checking its output.
assert_ne!(
success,
Some(false),
"js_repl call failed unexpectedly: {output}"
);
assert!(output.contains("cwd:string:true"));
assert!(output.contains("home:true"));
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn js_repl_blocks_sensitive_builtin_imports() -> Result<()> {
skip_if_no_network!(Ok(()));

View File

@@ -416,7 +416,7 @@ async fn resume_and_fork_append_permissions_messages() -> Result<()> {
fork_config.permissions.approval_policy = Constrained::allow_any(AskForApproval::UnlessTrusted);
let forked = initial
.thread_manager
.fork_thread(usize::MAX, fork_config, rollout_path, false)
.fork_thread(usize::MAX, fork_config, rollout_path, false, None)
.await?;
forked
.thread

View File

@@ -277,7 +277,7 @@ async fn explicit_plugin_mentions_inject_plugin_guidance() -> Result<()> {
assert!(
request_tools
.iter()
.any(|name| name == "mcp__codex_apps__calendar_create_event"),
.any(|name| name == "mcp__codex_apps__google-calendar-create-event"),
"expected plugin app tools to become visible for this turn: {request_tools:?}"
);
let echo_description = tool_description(&request_body, "mcp__sample__echo")
@@ -286,9 +286,11 @@ async fn explicit_plugin_mentions_inject_plugin_guidance() -> Result<()> {
echo_description.contains("This tool is part of plugin `sample`."),
"expected plugin MCP provenance in tool description: {echo_description:?}"
);
let calendar_description =
tool_description(&request_body, "mcp__codex_apps__calendar_create_event")
.expect("plugin app tool description should be present");
let calendar_description = tool_description(
&request_body,
"mcp__codex_apps__google-calendar-create-event",
)
.expect("plugin app tool description should be present");
assert!(
calendar_description.contains("This tool is part of plugin `sample`."),
"expected plugin app provenance in tool description: {calendar_description:?}"

View File

@@ -98,7 +98,7 @@ async fn emits_warning_when_resumed_model_differs() {
thread: conversation,
..
} = thread_manager
.resume_thread_with_history(config, initial_history, auth_manager, false)
.resume_thread_with_history(config, initial_history, auth_manager, false, None)
.await
.expect("resume conversation");

File diff suppressed because it is too large Load Diff

View File

@@ -180,6 +180,24 @@ async fn spawn_agent_description_lists_visible_models_and_reasoning_efforts() ->
!description.contains("Hidden Model"),
"hidden picker model should be omitted from spawn_agent description: {description:?}"
);
assert!(
description.contains(
"Only use `spawn_agent` if and only if the user explicitly asks for sub-agents, delegation, or parallel agent work."
),
"expected explicit authorization rule in spawn_agent description: {description:?}"
);
assert!(
description.contains(
"Requests for depth, thoroughness, research, investigation, or detailed codebase analysis do not count as permission to spawn."
),
"expected non-authorization clarification in spawn_agent description: {description:?}"
);
assert!(
description.contains(
"Agent-role guidance below only helps choose which agent to use after spawning is already authorized; it never authorizes spawning by itself."
),
"expected agent-role clarification in spawn_agent description: {description:?}"
);
Ok(())
}

View File

@@ -42,7 +42,7 @@ async fn emits_warning_when_unstable_features_enabled_via_config() {
thread: conversation,
..
} = thread_manager
.resume_thread_with_history(config, InitialHistory::New, auth_manager, false)
.resume_thread_with_history(config, InitialHistory::New, auth_manager, false, None)
.await
.expect("spawn conversation");
@@ -83,7 +83,7 @@ async fn suppresses_warning_when_configured() {
thread: conversation,
..
} = thread_manager
.resume_thread_with_history(config, InitialHistory::New, auth_manager, false)
.resume_thread_with_history(config, InitialHistory::New, auth_manager, false, None)
.await
.expect("spawn conversation");

View File

@@ -296,7 +296,8 @@ async fn view_image_tool_attaches_local_image() -> anyhow::Result<()> {
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_can_preserve_original_resolution_on_gpt5_3_codex() -> anyhow::Result<()> {
async fn view_image_tool_can_preserve_original_resolution_when_requested_on_gpt5_3_codex()
-> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
@@ -326,7 +327,7 @@ async fn view_image_tool_can_preserve_original_resolution_on_gpt5_3_codex() -> a
image.save(&abs_path)?;
let call_id = "view-image-original";
let arguments = serde_json::json!({ "path": rel_path }).to_string();
let arguments = serde_json::json!({ "path": rel_path, "detail": "original" }).to_string();
let first_response = sse(vec![
ev_response_created("resp-1"),
@@ -400,7 +401,191 @@ async fn view_image_tool_can_preserve_original_resolution_on_gpt5_3_codex() -> a
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_keeps_legacy_behavior_below_gpt5_3_codex() -> anyhow::Result<()> {
async fn view_image_tool_errors_clearly_for_unsupported_detail_values() -> anyhow::Result<()> {
    // Verifies that `view_image` rejects unsupported `detail` values (anything
    // other than `original` or omission) with a clear error message and does
    // NOT attach an image to the follow-up request.
    skip_if_no_network!(Ok(()));
    let server = start_mock_server().await;
    // Use a model that supports `detail: "original"` and enable the gating
    // feature, so the rejection below is about the unsupported *value* ("low"),
    // not about missing model/feature support.
    let mut builder = test_codex()
        .with_model("gpt-5.3-codex")
        .with_config(|config| {
            config
                .features
                .enable(Feature::ImageDetailOriginal)
                .expect("test config should allow feature update");
        });
    let TestCodex {
        codex,
        cwd,
        session_configured,
        ..
    } = builder.build(&server).await?;
    // Write a small real PNG inside the workspace for the tool call to reference.
    let rel_path = "assets/unsupported-detail.png";
    let abs_path = cwd.path().join(rel_path);
    if let Some(parent) = abs_path.parent() {
        std::fs::create_dir_all(parent)?;
    }
    let image = ImageBuffer::from_pixel(256, 128, Rgba([0u8, 80, 255, 255]));
    image.save(&abs_path)?;
    let call_id = "view-image-unsupported-detail";
    // `detail: "low"` is the deliberately unsupported value under test.
    let arguments = serde_json::json!({ "path": rel_path, "detail": "low" }).to_string();
    // First SSE turn: the model emits the `view_image` function call.
    let first_response = sse(vec![
        ev_response_created("resp-1"),
        ev_function_call(call_id, "view_image", &arguments),
        ev_completed("resp-1"),
    ]);
    responses::mount_sse_once(&server, first_response).await;
    // Second SSE turn: captured so we can inspect the request that carries the
    // tool's function_call_output back to the model.
    let second_response = sse(vec![
        ev_assistant_message("msg-1", "done"),
        ev_completed("resp-2"),
    ]);
    let mock = responses::mount_sse_once(&server, second_response).await;
    let session_model = session_configured.model.clone();
    codex
        .submit(Op::UserTurn {
            items: vec![UserInput::Text {
                text: "please attach the image at low detail".into(),
                text_elements: Vec::new(),
            }],
            final_output_json_schema: None,
            cwd: cwd.path().to_path_buf(),
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::DangerFullAccess,
            model: session_model,
            effort: None,
            service_tier: None,
            summary: None,
            collaboration_mode: None,
            personality: None,
        })
        .await?;
    wait_for_event(&codex, |event| matches!(event, EventMsg::TurnComplete(_))).await;
    let req = mock.single_request();
    let body_with_tool_output = req.body_json();
    // The tool output must be the exact actionable error text, not a resized image.
    let output_text = req
        .function_call_output_content_and_success(call_id)
        .and_then(|(content, _)| content)
        .expect("output text present");
    assert_eq!(
        output_text,
        "view_image.detail only supports `original`; omit `detail` for default resized behavior, got `low`"
    );
    // A rejected call must not smuggle an input_image message into the request.
    assert!(
        find_image_message(&body_with_tool_output).is_none(),
        "unsupported detail values should not produce an input_image message"
    );
    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_treats_null_detail_as_omitted() -> anyhow::Result<()> {
    // Verifies that an explicit `"detail": null` behaves exactly like omitting
    // the field: the image is resized to the default bounds and no `detail`
    // key appears on the outgoing content item.
    skip_if_no_network!(Ok(()));
    let server = start_mock_server().await;
    // Model + feature both support `original`, so any resizing observed below
    // is attributable to the null/omitted `detail`, not lack of support.
    let mut builder = test_codex()
        .with_model("gpt-5.3-codex")
        .with_config(|config| {
            config
                .features
                .enable(Feature::ImageDetailOriginal)
                .expect("test config should allow feature update");
        });
    let TestCodex {
        codex,
        cwd,
        session_configured,
        ..
    } = builder.build(&server).await?;
    let rel_path = "assets/null-detail.png";
    let abs_path = cwd.path().join(rel_path);
    if let Some(parent) = abs_path.parent() {
        std::fs::create_dir_all(parent)?;
    }
    // Dimensions chosen to exceed the default resize bounds asserted below
    // (2048x768), so the resize is observable.
    let original_width = 2304;
    let original_height = 864;
    let image = ImageBuffer::from_pixel(original_width, original_height, Rgba([0u8, 80, 255, 255]));
    image.save(&abs_path)?;
    let call_id = "view-image-null-detail";
    // JSON null for `detail` — the case under test.
    let arguments = serde_json::json!({ "path": rel_path, "detail": null }).to_string();
    // First SSE turn: the model emits the `view_image` function call.
    let first_response = sse(vec![
        ev_response_created("resp-1"),
        ev_function_call(call_id, "view_image", &arguments),
        ev_completed("resp-1"),
    ]);
    responses::mount_sse_once(&server, first_response).await;
    // Second SSE turn: captured to inspect the request carrying the tool output.
    let second_response = sse(vec![
        ev_assistant_message("msg-1", "done"),
        ev_completed("resp-2"),
    ]);
    let mock = responses::mount_sse_once(&server, second_response).await;
    let session_model = session_configured.model.clone();
    codex
        .submit(Op::UserTurn {
            items: vec![UserInput::Text {
                text: "please attach the image with a null detail".into(),
                text_elements: Vec::new(),
            }],
            final_output_json_schema: None,
            cwd: cwd.path().to_path_buf(),
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::DangerFullAccess,
            model: session_model,
            effort: None,
            service_tier: None,
            summary: None,
            collaboration_mode: None,
            personality: None,
        })
        .await?;
    wait_for_event(&codex, |event| matches!(event, EventMsg::TurnComplete(_))).await;
    let req = mock.single_request();
    let function_output = req.function_call_output(call_id);
    let output_items = function_output
        .get("output")
        .and_then(Value::as_array)
        .expect("function_call_output should be a content item array");
    assert_eq!(output_items.len(), 1);
    // Null detail must be dropped entirely, not serialized as a `detail` key.
    assert_eq!(output_items[0].get("detail"), None);
    let image_url = output_items[0]
        .get("image_url")
        .and_then(Value::as_str)
        .expect("image_url present");
    // Decode the `data:<mime>;base64,<payload>` URL and inspect the real pixels.
    let (_, encoded) = image_url
        .split_once(',')
        .expect("image url contains data prefix");
    let decoded = BASE64_STANDARD
        .decode(encoded)
        .expect("image data decodes from base64 for request");
    let resized = load_from_memory(&decoded).expect("load resized image");
    let (width, height) = resized.dimensions();
    // Default resized behavior: fits within 2048x768 and strictly smaller than
    // the oversized source in both dimensions.
    assert!(width <= 2048);
    assert!(height <= 768);
    assert!(width < original_width);
    assert!(height < original_height);
    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_resizes_when_model_lacks_original_detail_support() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
@@ -503,6 +688,110 @@ async fn view_image_tool_keeps_legacy_behavior_below_gpt5_3_codex() -> anyhow::R
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn view_image_tool_does_not_force_original_resolution_with_capability_feature_only()
-> anyhow::Result<()> {
    // Verifies that merely having the model capability plus the
    // ImageDetailOriginal feature enabled does NOT opt the tool into original
    // resolution: with no `detail` argument the image is still resized.
    skip_if_no_network!(Ok(()));
    let server = start_mock_server().await;
    // Capability (gpt-5.3-codex) and feature are both on; only the per-call
    // `detail` argument should be able to request original resolution.
    let mut builder = test_codex()
        .with_model("gpt-5.3-codex")
        .with_config(|config| {
            config
                .features
                .enable(Feature::ImageDetailOriginal)
                .expect("test config should allow feature update");
        });
    let TestCodex {
        codex,
        cwd,
        session_configured,
        ..
    } = builder.build(&server).await?;
    let rel_path = "assets/original-example-capability-only.png";
    let abs_path = cwd.path().join(rel_path);
    if let Some(parent) = abs_path.parent() {
        std::fs::create_dir_all(parent)?;
    }
    // Oversized relative to the default 2048x768 bounds so resizing is observable.
    let original_width = 2304;
    let original_height = 864;
    let image = ImageBuffer::from_pixel(original_width, original_height, Rgba([0u8, 80, 255, 255]));
    image.save(&abs_path)?;
    let call_id = "view-image-capability-only";
    // No `detail` argument at all — the case under test.
    let arguments = serde_json::json!({ "path": rel_path }).to_string();
    // First SSE turn: the model emits the `view_image` function call.
    let first_response = sse(vec![
        ev_response_created("resp-1"),
        ev_function_call(call_id, "view_image", &arguments),
        ev_completed("resp-1"),
    ]);
    responses::mount_sse_once(&server, first_response).await;
    // Second SSE turn: captured to inspect the request carrying the tool output.
    let second_response = sse(vec![
        ev_assistant_message("msg-1", "done"),
        ev_completed("resp-2"),
    ]);
    let mock = responses::mount_sse_once(&server, second_response).await;
    let session_model = session_configured.model.clone();
    codex
        .submit(Op::UserTurn {
            items: vec![UserInput::Text {
                text: "please add the screenshot".into(),
                text_elements: Vec::new(),
            }],
            final_output_json_schema: None,
            cwd: cwd.path().to_path_buf(),
            approval_policy: AskForApproval::Never,
            sandbox_policy: SandboxPolicy::DangerFullAccess,
            model: session_model,
            effort: None,
            service_tier: None,
            summary: None,
            collaboration_mode: None,
            personality: None,
        })
        .await?;
    // Explicit timeout here (unlike sibling tests) to bound the wait for the turn.
    wait_for_event_with_timeout(
        &codex,
        |event| matches!(event, EventMsg::TurnComplete(_)),
        Duration::from_secs(10),
    )
    .await;
    let req = mock.single_request();
    let function_output = req.function_call_output(call_id);
    let output_items = function_output
        .get("output")
        .and_then(Value::as_array)
        .expect("function_call_output should be a content item array");
    assert_eq!(output_items.len(), 1);
    // Capability/feature alone must not emit a `detail` key on the content item.
    assert_eq!(output_items[0].get("detail"), None);
    let image_url = output_items[0]
        .get("image_url")
        .and_then(Value::as_str)
        .expect("image_url present");
    // Decode the data URL and check the actual pixel dimensions of the payload.
    let (_, encoded) = image_url
        .split_once(',')
        .expect("image url contains data prefix");
    let decoded = BASE64_STANDARD
        .decode(encoded)
        .expect("image data decodes from base64 for request");
    let resized = load_from_memory(&decoded).expect("load resized image");
    let (resized_width, resized_height) = resized.dimensions();
    // Default resized behavior still applies: bounded by 2048x768 and strictly
    // smaller than the oversized source.
    assert!(resized_width <= 2048);
    assert!(resized_height <= 768);
    assert!(resized_width < original_width);
    assert!(resized_height < original_height);
    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn js_repl_emit_image_attaches_local_image() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));