mirror of
https://github.com/openai/codex.git
synced 2026-04-24 14:45:27 +00:00
add failing tests
This commit is contained in:
@@ -1,16 +1,22 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use codex_core::CodexAuth;
|
||||
use codex_core::ConversationManager;
|
||||
use codex_core::ModelProviderInfo;
|
||||
use codex_core::NewConversation;
|
||||
use codex_core::built_in_model_providers;
|
||||
use codex_core::protocol::AskForApproval;
|
||||
use codex_core::protocol::ErrorEvent;
|
||||
use codex_core::protocol::EventMsg;
|
||||
use codex_core::protocol::InputItem;
|
||||
use codex_core::protocol::Op;
|
||||
use codex_core::protocol::RolloutItem;
|
||||
use codex_core::protocol::RolloutLine;
|
||||
use codex_core::protocol::SandboxPolicy;
|
||||
use codex_protocol::config_types::ReasoningSummary;
|
||||
use core_test_support::load_default_config_for_test;
|
||||
use core_test_support::skip_if_no_network;
|
||||
use core_test_support::test_codex::test_codex;
|
||||
use core_test_support::wait_for_event;
|
||||
use tempfile::TempDir;
|
||||
|
||||
@@ -19,6 +25,7 @@ use core_test_support::responses::ev_assistant_message;
|
||||
use core_test_support::responses::ev_completed;
|
||||
use core_test_support::responses::ev_completed_with_tokens;
|
||||
use core_test_support::responses::ev_function_call;
|
||||
use core_test_support::responses::ev_response_created;
|
||||
use core_test_support::responses::mount_sse_once_match;
|
||||
use core_test_support::responses::mount_sse_sequence;
|
||||
use core_test_support::responses::sse;
|
||||
@@ -749,6 +756,148 @@ async fn manual_compact_retries_after_context_window_error() {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn compact_retry_drops_orphan_tool_outputs() {
|
||||
skip_if_no_network!();
|
||||
let server = start_mock_server().await;
|
||||
let test = test_codex().build(&server).await.unwrap();
|
||||
|
||||
let call_id = "compact-orphan-call";
|
||||
let shell_args = serde_json::json!({
|
||||
"command": ["/bin/echo", "orphan check"],
|
||||
"timeout_ms": 1_000
|
||||
});
|
||||
let shell_args_json = serde_json::to_string(&shell_args).expect("serialize shell args");
|
||||
|
||||
let initial_turn = sse(vec![
|
||||
ev_response_created("resp-initial"),
|
||||
ev_function_call(call_id, "shell", &shell_args_json),
|
||||
ev_completed("resp-initial"),
|
||||
]);
|
||||
let tool_follow_up = sse(vec![
|
||||
ev_response_created("resp-tool"),
|
||||
ev_assistant_message("msg-tool", "tool output acknowledged"),
|
||||
ev_completed("resp-tool"),
|
||||
]);
|
||||
const CONTEXT_FAILS: usize = 4;
|
||||
let compact_failures: Vec<String> = (0..CONTEXT_FAILS)
|
||||
.map(|idx| {
|
||||
let id = format!("resp-compact-fail-{idx}");
|
||||
sse_failed(&id, "context_length_exceeded", CONTEXT_LIMIT_MESSAGE)
|
||||
})
|
||||
.collect();
|
||||
let compact_success = sse(vec![
|
||||
ev_response_created("resp-compact-success"),
|
||||
ev_assistant_message("msg-compact", SUMMARY_TEXT),
|
||||
ev_completed("resp-compact-success"),
|
||||
]);
|
||||
|
||||
let mut responses = vec![initial_turn, tool_follow_up];
|
||||
responses.extend(compact_failures.clone());
|
||||
responses.push(compact_success);
|
||||
let response_log = mount_sse_sequence(&server, responses).await;
|
||||
|
||||
let session_model = test.session_configured.model.clone();
|
||||
let turn = |prompt: &str| Op::UserTurn {
|
||||
items: vec![InputItem::Text {
|
||||
text: prompt.into(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
model: session_model.clone(),
|
||||
effort: None,
|
||||
summary: ReasoningSummary::Auto,
|
||||
};
|
||||
|
||||
test.codex
|
||||
.submit(turn("trigger tool call"))
|
||||
.await
|
||||
.expect("initial turn submission");
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
||||
|
||||
test.codex
|
||||
.submit(Op::Compact)
|
||||
.await
|
||||
.expect("compact submission");
|
||||
|
||||
let EventMsg::BackgroundEvent(background_event) =
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::BackgroundEvent(_))).await
|
||||
else {
|
||||
unreachable!("expected background event after compact retries");
|
||||
};
|
||||
assert!(
|
||||
background_event.message.contains(&format!(
|
||||
"Trimmed {CONTEXT_FAILS} older conversation item(s)"
|
||||
)),
|
||||
"unexpected trim count reported: {}",
|
||||
background_event.message
|
||||
);
|
||||
wait_for_event(&test.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
||||
|
||||
let requests = response_log.requests();
|
||||
assert_eq!(
|
||||
requests.len(),
|
||||
CONTEXT_FAILS + 3,
|
||||
"expected initial turn, tool output, {CONTEXT_FAILS} failed compact attempts, and one success"
|
||||
);
|
||||
|
||||
let first_compact_input = requests[2].input();
|
||||
let first_call_ids = collect_call_ids(&first_compact_input);
|
||||
let first_output_ids = collect_output_ids(&first_compact_input);
|
||||
assert!(
|
||||
first_call_ids.contains(call_id),
|
||||
"first compact attempt should include the tool call"
|
||||
);
|
||||
assert!(
|
||||
first_output_ids.contains(call_id),
|
||||
"first compact attempt should include the tool output"
|
||||
);
|
||||
|
||||
let final_retry_index = 2 + CONTEXT_FAILS;
|
||||
let retry_input = requests[final_retry_index].input();
|
||||
let retry_call_ids = collect_call_ids(&retry_input);
|
||||
let retry_output_ids = collect_output_ids(&retry_input);
|
||||
assert_eq!(
|
||||
retry_output_ids, retry_call_ids,
|
||||
"compact retry should remove tool outputs when their call was trimmed"
|
||||
);
|
||||
}
|
||||
|
||||
/// Gathers the identifiers of every tool-call item in `items`.
///
/// An item counts as a tool call when its `"type"` field is the string
/// `"function_call"`, `"custom_tool_call"`, or `"local_shell_call"`. The
/// identifier is read from `"call_id"`, or from `"id"` when `"call_id"` is
/// absent. Items whose chosen field is not a string are ignored.
fn collect_call_ids(items: &[serde_json::Value]) -> BTreeSet<String> {
    let mut call_ids = BTreeSet::new();
    for entry in items {
        let Some(kind) = entry.get("type").and_then(serde_json::Value::as_str) else {
            continue;
        };
        if !matches!(
            kind,
            "function_call" | "custom_tool_call" | "local_shell_call"
        ) {
            continue;
        }
        // Fall back to "id" only when the "call_id" key is absent.
        let id_field = entry.get("call_id").or_else(|| entry.get("id"));
        if let Some(id) = id_field.and_then(serde_json::Value::as_str) {
            call_ids.insert(id.to_string());
        }
    }
    call_ids
}
|
||||
|
||||
/// Gathers the `"call_id"` of every tool-output item in `items`.
///
/// An item counts as a tool output when its `"type"` field is the string
/// `"function_call_output"` or `"custom_tool_call_output"`. Items without a
/// string `"call_id"` are ignored.
fn collect_output_ids(items: &[serde_json::Value]) -> BTreeSet<String> {
    let mut output_ids = BTreeSet::new();
    for entry in items {
        let Some(kind) = entry.get("type").and_then(serde_json::Value::as_str) else {
            continue;
        };
        if !matches!(kind, "function_call_output" | "custom_tool_call_output") {
            continue;
        }
        if let Some(id) = entry.get("call_id").and_then(serde_json::Value::as_str) {
            output_ids.insert(id.to_string());
        }
    }
    output_ids
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_events() {
|
||||
skip_if_no_network!();
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#![allow(clippy::unwrap_used, clippy::expect_used)]
|
||||
|
||||
use anyhow::Result;
|
||||
use codex_core::error::CodexErr;
|
||||
use codex_core::model_family::find_family_for_model;
|
||||
use codex_core::protocol::AskForApproval;
|
||||
use codex_core::protocol::EventMsg;
|
||||
@@ -18,14 +19,17 @@ use core_test_support::responses::ev_response_created;
|
||||
use core_test_support::responses::mount_sse_once;
|
||||
use core_test_support::responses::mount_sse_sequence;
|
||||
use core_test_support::responses::sse;
|
||||
use core_test_support::responses::sse_failed;
|
||||
use core_test_support::responses::start_mock_server;
|
||||
use core_test_support::skip_if_no_network;
|
||||
use core_test_support::test_codex::TestCodex;
|
||||
use core_test_support::test_codex::test_codex;
|
||||
use core_test_support::wait_for_event;
|
||||
use core_test_support::wait_for_event_with_timeout;
|
||||
use regex_lite::Regex;
|
||||
use serde_json::Value;
|
||||
use serde_json::json;
|
||||
use std::time::Duration;
|
||||
|
||||
async fn submit_turn(
|
||||
test: &TestCodex,
|
||||
@@ -50,9 +54,11 @@ async fn submit_turn(
|
||||
})
|
||||
.await?;
|
||||
|
||||
wait_for_event(&test.codex, |event| {
|
||||
matches!(event, EventMsg::TaskComplete(_))
|
||||
})
|
||||
wait_for_event_with_timeout(
|
||||
&test.codex,
|
||||
|event| matches!(event, EventMsg::TaskComplete(_) | EventMsg::TurnAborted(_)),
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
@@ -80,8 +86,7 @@ async fn custom_tool_unknown_returns_custom_output_error() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex();
|
||||
let test = builder.build(&server).await?;
|
||||
let test = test_codex().build(&server).await?;
|
||||
|
||||
let call_id = "custom-unsupported";
|
||||
let tool_name = "unsupported_tool";
|
||||
@@ -231,8 +236,7 @@ async fn local_shell_missing_ids_maps_to_function_output_error() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let mut builder = test_codex();
|
||||
let test = builder.build(&server).await?;
|
||||
let test = test_codex().build(&server).await?;
|
||||
|
||||
let local_shell_event = json!({
|
||||
"type": "response.output_item.done",
|
||||
@@ -403,6 +407,242 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn context_error_keeps_tool_output_in_history() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let test = test_codex().build(&server).await?;
|
||||
|
||||
let call_id = "context-shell";
|
||||
let shell_args = json!({
|
||||
"command": ["/bin/echo", "context tool output"],
|
||||
"timeout_ms": 1_000
|
||||
});
|
||||
|
||||
let sequence = mount_sse_sequence(
|
||||
&server,
|
||||
vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(
|
||||
call_id,
|
||||
"shell",
|
||||
&serde_json::to_string(&shell_args)?,
|
||||
),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse_failed(
|
||||
"resp-context-error",
|
||||
"context_length_exceeded",
|
||||
"Your input exceeds the context window of this model. Please adjust your input and try again.",
|
||||
),
|
||||
sse(vec![
|
||||
ev_response_created("resp-3"),
|
||||
ev_assistant_message("msg-1", "handled"),
|
||||
ev_completed("resp-3"),
|
||||
]),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
let turn = |prompt: &str| Op::UserTurn {
|
||||
items: vec![InputItem::Text {
|
||||
text: prompt.into(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
model: test.session_configured.model.clone(),
|
||||
effort: None,
|
||||
summary: ReasoningSummary::Auto,
|
||||
};
|
||||
|
||||
test.codex.submit(turn("trigger tool call")).await?;
|
||||
|
||||
let error_event =
|
||||
wait_for_event(&test.codex, |event| matches!(event, EventMsg::Error(_))).await;
|
||||
let EventMsg::Error(error_payload) = error_event else {
|
||||
unreachable!("wait_for_event returned unexpected event");
|
||||
};
|
||||
assert_eq!(
|
||||
error_payload.message,
|
||||
CodexErr::ContextWindowExceeded.to_string(),
|
||||
"expected context window error after second SSE"
|
||||
);
|
||||
wait_for_event_with_timeout(
|
||||
&test.codex,
|
||||
|event| matches!(event, EventMsg::TaskComplete(_)),
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
let failure_request = sequence
|
||||
.requests()
|
||||
.get(1)
|
||||
.expect("failed request missing")
|
||||
.clone();
|
||||
let tool_output_item = failure_request.function_call_output(call_id);
|
||||
let tool_output_str = tool_output_item
|
||||
.get("output")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default();
|
||||
assert!(
|
||||
!tool_output_str.is_empty(),
|
||||
"tool call output missing from follow-up request"
|
||||
);
|
||||
assert!(
|
||||
tool_output_str.contains("Exit code: 0"),
|
||||
"expected shell tool output to include exit code: {tool_output_str}"
|
||||
);
|
||||
let tool_stdout = tool_output_str;
|
||||
assert!(
|
||||
tool_stdout.contains("context tool output"),
|
||||
"unexpected shell stdout: {tool_stdout}"
|
||||
);
|
||||
|
||||
test.codex.submit(turn("send another message")).await?;
|
||||
wait_for_event_with_timeout(
|
||||
&test.codex,
|
||||
|event| matches!(event, EventMsg::TaskComplete(_)),
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
let follow_up_request = sequence
|
||||
.requests()
|
||||
.last()
|
||||
.expect("follow-up request missing")
|
||||
.clone();
|
||||
let follow_up_output = follow_up_request.function_call_output(call_id);
|
||||
let follow_up_output_str = follow_up_output
|
||||
.get("output")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default();
|
||||
assert_eq!(
|
||||
follow_up_output_str, tool_output_str,
|
||||
"conversation history should retain tool output after context error"
|
||||
);
|
||||
|
||||
let inputs = follow_up_request.input();
|
||||
let user_message = inputs
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|value| value.get("role").is_some_and(|role| role == "user"))
|
||||
.expect("follow-up user message missing from request");
|
||||
let user_text = user_message["content"][0]["text"]
|
||||
.as_str()
|
||||
.unwrap_or_default();
|
||||
assert_eq!(
|
||||
user_text, "send another message",
|
||||
"unexpected follow-up user prompt"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn context_error_replays_tool_output_on_follow_up() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_mock_server().await;
|
||||
let test = test_codex().build(&server).await?;
|
||||
|
||||
let call_id = "context-shell-aborted";
|
||||
let shell_args = json!({
|
||||
"command": ["/bin/echo", "context tool output"],
|
||||
"timeout_ms": 1_000
|
||||
});
|
||||
|
||||
let sequence = mount_sse_sequence(
|
||||
&server,
|
||||
vec![
|
||||
sse(vec![
|
||||
ev_response_created("resp-1"),
|
||||
ev_function_call(
|
||||
call_id,
|
||||
"shell",
|
||||
&serde_json::to_string(&shell_args)?,
|
||||
),
|
||||
ev_completed("resp-1"),
|
||||
]),
|
||||
sse_failed(
|
||||
"resp-context-error",
|
||||
"context_length_exceeded",
|
||||
"Your input exceeds the context window of this model. Please adjust your input and try again.",
|
||||
),
|
||||
sse(vec![
|
||||
ev_response_created("resp-2"),
|
||||
ev_assistant_message("msg-2", "follow-up handled"),
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
let turn = |prompt: &str| Op::UserTurn {
|
||||
items: vec![InputItem::Text {
|
||||
text: prompt.into(),
|
||||
}],
|
||||
final_output_json_schema: None,
|
||||
cwd: test.cwd.path().to_path_buf(),
|
||||
approval_policy: AskForApproval::Never,
|
||||
sandbox_policy: SandboxPolicy::DangerFullAccess,
|
||||
model: test.session_configured.model.clone(),
|
||||
effort: None,
|
||||
summary: ReasoningSummary::Auto,
|
||||
};
|
||||
|
||||
test.codex
|
||||
.submit(turn("trigger context window failure"))
|
||||
.await?;
|
||||
|
||||
let error_event =
|
||||
wait_for_event(&test.codex, |event| matches!(event, EventMsg::Error(_))).await;
|
||||
let EventMsg::Error(error_payload) = error_event else {
|
||||
unreachable!("wait_for_event returned unexpected event");
|
||||
};
|
||||
assert_eq!(
|
||||
error_payload.message,
|
||||
CodexErr::ContextWindowExceeded.to_string(),
|
||||
"expected context window error after failed SSE"
|
||||
);
|
||||
|
||||
wait_for_event_with_timeout(
|
||||
&test.codex,
|
||||
|event| matches!(event, EventMsg::TaskComplete(_)),
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
test.codex.submit(turn("resume after failure")).await?;
|
||||
|
||||
wait_for_event_with_timeout(
|
||||
&test.codex,
|
||||
|event| matches!(event, EventMsg::TaskComplete(_)),
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
let follow_up_request = sequence
|
||||
.requests()
|
||||
.last()
|
||||
.expect("follow-up request missing")
|
||||
.clone();
|
||||
let output_item = follow_up_request.function_call_output(call_id);
|
||||
let output = output_item
|
||||
.get("output")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default();
|
||||
assert!(
|
||||
output.contains("context tool output"),
|
||||
"failed tool call should still record a response"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
Reference in New Issue
Block a user