Prefix code mode output with success or failure message and include error stack (#14272)

This commit is contained in:
pakrym-oai
2026-03-10 18:33:52 -07:00
committed by Michael Bolin
parent da74da6684
commit 01792a4c61
5 changed files with 211 additions and 89 deletions

View File

@@ -24,14 +24,35 @@ use std::fs;
use std::time::Duration;
use wiremock::MockServer;
fn custom_tool_output_text_and_success(
fn custom_tool_output_items(req: &ResponsesRequest, call_id: &str) -> Vec<Value> {
req.custom_tool_call_output(call_id)
.get("output")
.and_then(Value::as_array)
.expect("custom tool output should be serialized as content items")
.clone()
}
fn text_item(items: &[Value], index: usize) -> &str {
items[index]
.get("text")
.and_then(Value::as_str)
.expect("content item should be input_text")
}
fn custom_tool_output_body_and_success(
req: &ResponsesRequest,
call_id: &str,
) -> (String, Option<bool>) {
let (output, success) = req
let (_, success) = req
.custom_tool_call_output_content_and_success(call_id)
.expect("custom tool output should be present");
(output.unwrap_or_default(), success)
let items = custom_tool_output_items(req, call_id);
let output = items
.iter()
.skip(1)
.filter_map(|item| item.get("text").and_then(Value::as_str))
.collect();
(output, success)
}
async fn run_code_mode_turn(
@@ -152,13 +173,16 @@ add_content(JSON.stringify(await exec_command({ cmd: "printf code_mode_exec_mark
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"exec call failed unexpectedly: {output}"
let items = custom_tool_output_items(&req, "call-1");
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
let parsed: Value = serde_json::from_str(&output)?;
let parsed: Value = serde_json::from_str(text_item(&items, 1))?;
assert!(
parsed
.get("chunk_id")
@@ -201,22 +225,66 @@ add_content(JSON.stringify(await exec_command({
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"exec call failed unexpectedly: {output}"
let items = custom_tool_output_items(&req, "call-1");
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
let expected_pattern = r#"(?sx)
\A
Original\ token\ count:\ \d+\n
Output:\n
Total\ output\ lines:\ 1\n
\n
\{"chunk_id".*…\d+\ tokens\ truncated….*
.*…\d+\ tokens\ truncated….*
\z
"#;
assert_regex_match(expected_pattern, &output);
assert_regex_match(expected_pattern, text_item(&items, 1));
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_returns_accumulated_output_when_script_fails() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let (_test, second_mock) = run_code_mode_turn(
&server,
"use code_mode to surface script failures",
r#"
add_content("before crash");
add_content("still before crash");
throw new Error("boom");
"#,
false,
)
.await?;
let req = second_mock.single_request();
let items = custom_tool_output_items(&req, "call-1");
assert_eq!(items.len(), 4);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script failed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert_eq!(text_item(&items, 1), "before crash");
assert_eq!(text_item(&items, 2), "still before crash");
assert_regex_match(
r#"(?sx)
\A
Script\ error:\n
Error:\ boom\n
(?:\s+at\ .+\n?)+
\z
"#,
text_item(&items, 3),
);
Ok(())
}
@@ -239,7 +307,7 @@ output_text({ json: true });
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -270,14 +338,25 @@ output_text(circular);
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = req
.custom_tool_call_output_content_and_success("call-1")
.expect("custom tool output should be present");
assert_ne!(
success,
Some(true),
"circular stringify unexpectedly succeeded"
);
assert!(output.contains("exec execution failed"));
assert!(output.contains("Converting circular structure to JSON"));
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script failed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert!(text_item(&items, 1).contains("Script error:"));
assert!(text_item(&items, 1).contains("Converting circular structure to JSON"));
Ok(())
}
@@ -301,28 +380,34 @@ output_image("data:image/png;base64,AAA");
.await?;
let req = second_mock.single_request();
let (_, success) = custom_tool_output_text_and_success(&req, "call-1");
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"code_mode image output failed unexpectedly"
);
assert_eq!(items.len(), 3);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert_eq!(
req.custom_tool_call_output("call-1"),
items[1],
serde_json::json!({
"type": "custom_tool_call_output",
"call_id": "call-1",
"output": [
{
"type": "input_image",
"image_url": "https://example.com/image.jpg"
},
{
"type": "input_image",
"image_url": "data:image/png;base64,AAA"
}
]
})
"type": "input_image",
"image_url": "https://example.com/image.jpg"
}),
);
assert_eq!(
items[2],
serde_json::json!({
"type": "input_image",
"image_url": "data:image/png;base64,AAA"
}),
);
Ok(())
@@ -345,11 +430,22 @@ async fn code_mode_can_apply_patch_via_nested_tool() -> Result<()> {
run_code_mode_turn(&server, "use exec to run apply_patch", &code, true).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = req
.custom_tool_call_output_content_and_success("call-1")
.expect("custom tool output should be present");
assert_ne!(
success,
Some(false),
"exec apply_patch call failed unexpectedly: {output}"
"exec apply_patch call failed unexpectedly: {items:?}"
);
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
let file_path = test.cwd_path().join(file_name);
@@ -381,7 +477,7 @@ add_content(
run_code_mode_turn_with_rmcp(&server, "use exec to run the rmcp echo tool", code).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -420,7 +516,7 @@ add_content(
run_code_mode_turn_with_rmcp(&server, "use exec to run the rmcp echo tool", code).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -464,7 +560,7 @@ add_content(
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -505,7 +601,7 @@ add_content(
run_code_mode_turn_with_rmcp(&server, "use exec to call rmcp echo badly", code).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -562,7 +658,7 @@ add_content("stored");
let first_request = first_follow_up.single_request();
let (first_output, first_success) =
custom_tool_output_text_and_success(&first_request, "call-1");
custom_tool_output_body_and_success(&first_request, "call-1");
assert_ne!(
first_success,
Some(false),
@@ -600,7 +696,7 @@ add_content(JSON.stringify(load("nb")));
let second_request = second_follow_up.single_request();
let (second_output, second_success) =
custom_tool_output_text_and_success(&second_request, "call-2");
custom_tool_output_body_and_success(&second_request, "call-2");
assert_ne!(
second_success,
Some(false),