Prefix code mode output with success or failure message and include error stack (#14272)

This commit is contained in:
pakrym-oai
2026-03-10 18:33:52 -07:00
committed by GitHub
parent cec211cabc
commit 24b8d443b8
5 changed files with 211 additions and 89 deletions

View File

@@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::process::ExitStatus;
use std::sync::Arc;
use std::time::Duration;
use crate::client_common::tools::ToolSpec;
use crate::codex::Session;
@@ -10,6 +10,7 @@ use crate::exec_env::create_env;
use crate::features::Feature;
use crate::function_tool::FunctionCallError;
use crate::tools::ToolRouter;
use crate::tools::context::FunctionToolOutput;
use crate::tools::context::SharedTurnDiffTracker;
use crate::tools::context::ToolPayload;
use crate::tools::js_repl::resolve_compatible_node;
@@ -81,6 +82,8 @@ enum NodeToHostMessage {
content_items: Vec<JsonValue>,
stored_values: HashMap<String, JsonValue>,
#[serde(default)]
error_text: Option<String>,
#[serde(default)]
max_output_tokens_per_exec_call: Option<usize>,
},
}
@@ -105,7 +108,7 @@ pub(crate) fn instructions(config: &Config) -> Option<String> {
));
section.push_str("- Import nested tools from `tools.js`, for example `import { exec_command } from \"tools.js\"` or `import { tools } from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import { append_notebook_logs_chart } from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await exec_command(args)` remain available for compatibility. Nested tool calls resolve to their code-mode result values.\n");
section.push_str(&format!(
"- Import `{{ output_text, output_image, set_max_output_tokens_per_exec_call, store, load }}` from `@openai/code_mode` (or `\"openai/code_mode\"`). `output_text(value)` surfaces text back to the model and stringifies non-string objects with `JSON.stringify(...)` when possible. `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs. `store(key, value)` persists JSON-serializable values across `{PUBLIC_TOOL_NAME}` calls in the current session, and `load(key)` returns a cloned stored value or `undefined`. `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `{PUBLIC_TOOL_NAME}` execution; the default is `10000`. This guards the overall `{PUBLIC_TOOL_NAME}` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker.\n",
"- Import `{{ output_text, output_image, set_max_output_tokens_per_exec_call, store, load }}` from `@openai/code_mode` (or `\"openai/code_mode\"`). `output_text(value)` surfaces text back to the model and stringifies non-string objects with `JSON.stringify(...)` when possible. `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs. `store(key, value)` persists JSON-serializable values across `{PUBLIC_TOOL_NAME}` calls in the current session, and `load(key)` returns a cloned stored value or `undefined`. `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `{PUBLIC_TOOL_NAME}` execution; the default is `10000`. This guards the overall `{PUBLIC_TOOL_NAME}` output, not individual nested tool invocations. The returned content starts with a separate `Script completed` or `Script failed` text item that includes wall time. When truncation happens, the final text may include `Total output lines:` and the usual `…N tokens truncated…` marker.\n",
));
section.push_str(
"- Function tools require JSON object arguments. Freeform tools require raw strings.\n",
@@ -121,7 +124,7 @@ pub(crate) async fn execute(
turn: Arc<TurnContext>,
tracker: SharedTurnDiffTracker,
code: String,
) -> Result<Vec<FunctionCallOutputContentItem>, FunctionCallError> {
) -> Result<FunctionToolOutput, FunctionCallError> {
let exec = ExecContext {
session,
turn,
@@ -140,8 +143,9 @@ async fn execute_node(
source: String,
enabled_tools: Vec<EnabledTool>,
stored_values: HashMap<String, JsonValue>,
) -> Result<Vec<FunctionCallOutputContentItem>, String> {
) -> Result<FunctionToolOutput, String> {
let node_path = resolve_compatible_node(exec.turn.config.js_repl_node_path.as_deref()).await?;
let started_at = std::time::Instant::now();
let env = create_env(&exec.turn.shell_environment_policy, None);
let mut cmd = tokio::process::Command::new(&node_path);
@@ -190,7 +194,7 @@ async fn execute_node(
.await?;
let mut stdout_lines = BufReader::new(stdout).lines();
let mut final_content_items = None;
let mut pending_result = None;
while let Some(line) = stdout_lines
.next_line()
.await
@@ -213,6 +217,7 @@ async fn execute_node(
NodeToHostMessage::Result {
content_items,
stored_values,
error_text,
max_output_tokens_per_exec_call,
} => {
exec.session
@@ -220,8 +225,9 @@ async fn execute_node(
.code_mode_store
.replace_stored_values(stored_values)
.await;
final_content_items = Some(truncate_code_mode_result(
pending_result = Some((
output_content_items_from_json_values(content_items)?,
error_text,
max_output_tokens_per_exec_call,
));
break;
@@ -238,20 +244,39 @@ async fn execute_node(
let stderr = stderr_task
.await
.map_err(|err| format!("failed to collect {PUBLIC_TOOL_NAME} stderr: {err}"))?;
let wall_time = started_at.elapsed();
let success = status.success();
match final_content_items {
Some(content_items) if status.success() => Ok(content_items),
Some(_) => Err(format_runner_failure(
&format!("{PUBLIC_TOOL_NAME} execution failed"),
status,
&stderr,
)),
None => Err(format_runner_failure(
&format!("{PUBLIC_TOOL_NAME} runner exited without returning a result"),
status,
&stderr,
)),
let Some((mut content_items, error_text, max_output_tokens_per_exec_call)) = pending_result
else {
let message = if stderr.is_empty() {
format!("{PUBLIC_TOOL_NAME} runner exited without returning a result (status {status})")
} else {
stderr
};
return Err(message);
};
if !success {
let error_text = error_text.unwrap_or_else(|| {
if stderr.is_empty() {
format!("Process exited with status {status}")
} else {
stderr
}
});
content_items.push(FunctionCallOutputContentItem::InputText {
text: format!("Script error:\n{error_text}"),
});
}
let mut content_items =
truncate_code_mode_result(content_items, max_output_tokens_per_exec_call);
prepend_script_status(&mut content_items, success, wall_time);
Ok(FunctionToolOutput::from_content(
content_items,
Some(success),
))
}
async fn write_message(
@@ -274,15 +299,21 @@ async fn write_message(
.map_err(|err| format!("failed to flush {PUBLIC_TOOL_NAME} message: {err}"))
}
/// Appends captured Node stderr to `message` under a `node stderr:` heading.
///
/// `message` is returned untouched when `stderr` is empty or contains only
/// whitespace.
fn append_stderr(message: String, stderr: &str) -> String {
    match stderr.trim() {
        "" => message,
        // The untrimmed stderr is appended so its original layout is kept.
        _ => format!("{message}\n\nnode stderr:\n{stderr}"),
    }
}
fn format_runner_failure(message: &str, status: ExitStatus, stderr: &str) -> String {
append_stderr(format!("{message} (status {status})"), stderr)
/// Inserts a status header as the first content item of a code-mode result.
///
/// The header reads `Script completed` or `Script failed` (chosen by
/// `success`), followed by the wall time in seconds with one decimal place
/// and a trailing `Output:` line.
fn prepend_script_status(
    content_items: &mut Vec<FunctionCallOutputContentItem>,
    success: bool,
    wall_time: Duration,
) {
    let status_line = if success {
        "Script completed"
    } else {
        "Script failed"
    };
    // Round to the nearest tenth of a second before formatting.
    let wall_time_seconds = (wall_time.as_secs_f32() * 10.0).round() / 10.0;
    let header_item = FunctionCallOutputContentItem::InputText {
        text: format!(
            "{}\nWall time {wall_time_seconds:.1} seconds\nOutput:\n",
            status_line
        ),
    };
    content_items.insert(0, header_item);
}
fn build_source(user_code: &str, enabled_tools: &[EnabledTool]) -> Result<String, String> {
@@ -301,25 +332,17 @@ fn truncate_code_mode_result(
max_output_tokens_per_exec_call: Option<usize>,
) -> Vec<FunctionCallOutputContentItem> {
let max_output_tokens = resolve_max_tokens(max_output_tokens_per_exec_call);
let policy = TruncationPolicy::Tokens(max_output_tokens);
if items
.iter()
.all(|item| matches!(item, FunctionCallOutputContentItem::InputText { .. }))
{
let (mut truncated_items, original_token_count) =
formatted_truncate_text_content_items_with_policy(
&items,
TruncationPolicy::Tokens(max_output_tokens),
);
if let Some(original_token_count) = original_token_count
&& let Some(FunctionCallOutputContentItem::InputText { text }) =
truncated_items.first_mut()
{
*text = format!("Original token count: {original_token_count}\nOutput:\n{text}");
}
let (truncated_items, _) =
formatted_truncate_text_content_items_with_policy(&items, policy);
return truncated_items;
}
truncate_function_output_items_with_policy(&items, TruncationPolicy::Tokens(max_output_tokens))
truncate_function_output_items_with_policy(&items, policy)
}
async fn build_enabled_tools(exec: &ExecContext) -> Vec<EnabledTool> {

View File

@@ -104,6 +104,10 @@ function readContentItems(context) {
}
}
// Renders a thrown value as text: the `stack` property when the value is
// truthy and has a truthy `stack`, otherwise the value itself stringified.
function formatErrorText(error) {
  const hasStack = Boolean(error && error.stack);
  return String(hasStack ? error.stack : error);
}
// Reports whether `name` consists of a leading ASCII letter, `_` or `$`
// followed only by ASCII letters, digits, `_` or `$`.
function isValidIdentifier(name) {
  const identifierPattern = /^[A-Za-z_$][0-9A-Za-z_$]*$/;
  return identifierPattern.test(name);
}
@@ -378,11 +382,11 @@ async function main() {
});
process.exit(0);
} catch (error) {
process.stderr.write(`${String(error && error.stack ? error.stack : error)}\n`);
await protocol.send({
type: 'result',
content_items: readContentItems(context),
stored_values: state.storedValues,
error_text: formatErrorText(error),
max_output_tokens_per_exec_call: state.maxOutputTokensPerExecCall,
});
process.exit(1);
@@ -391,7 +395,7 @@ async function main() {
void main().catch(async (error) => {
try {
process.stderr.write(`${String(error && error.stack ? error.stack : error)}\n`);
process.stderr.write(`${formatErrorText(error)}\n`);
} finally {
process.exitCode = 1;
}

View File

@@ -48,7 +48,6 @@ impl ToolHandler for CodeModeHandler {
}
};
let content_items = code_mode::execute(session, turn, tracker, code).await?;
Ok(FunctionToolOutput::from_content(content_items, Some(true)))
code_mode::execute(session, turn, tracker, code).await
}
}

View File

@@ -1621,7 +1621,7 @@ source: /[\s\S]+/
enabled_tool_names.join(", ")
};
let description = format!(
"Runs JavaScript in a Node-backed `node:vm` context. This is a freeform tool: send raw JavaScript source text (no JSON/quotes/markdown fences). Direct tool calls remain available while `{PUBLIC_TOOL_NAME}` is enabled. Inside JavaScript, import nested tools from `tools.js`, for example `import {{ exec_command }} from \"tools.js\"` or `import {{ tools }} from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import {{ append_notebook_logs_chart }} from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await shell(args)` remain available for compatibility when the tool name is a valid JS identifier. Nested tool calls resolve to their code-mode result values. Import `{{ output_text, output_image, set_max_output_tokens_per_exec_call, store, load }}` from `\"@openai/code_mode\"` (or `\"openai/code_mode\"`); `output_text(value)` surfaces text back to the model and stringifies non-string objects when possible, `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs, `store(key, value)` persists JSON-serializable values across `{PUBLIC_TOOL_NAME}` calls in the current session, `load(key)` returns a cloned stored value or `undefined`, and `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `{PUBLIC_TOOL_NAME}` execution. The default is `10000`. This guards the overall `{PUBLIC_TOOL_NAME}` output, not individual nested tool invocations. When truncation happens, the final text uses the unified-exec style `Original token count:` / `Output:` wrapper and the usual `…N tokens truncated…` marker. Function tools require JSON object arguments. Freeform tools require raw strings. `add_content(value)` remains available for compatibility with a content item, content-item array, or string. 
Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`. Only content passed to `output_text(...)`, `output_image(...)`, or `add_content(value)` is surfaced back to the model. Enabled nested tools: {enabled_list}."
"Runs JavaScript in a Node-backed `node:vm` context. This is a freeform tool: send raw JavaScript source text (no JSON/quotes/markdown fences). Direct tool calls remain available while `{PUBLIC_TOOL_NAME}` is enabled. Inside JavaScript, import nested tools from `tools.js`, for example `import {{ exec_command }} from \"tools.js\"` or `import {{ tools }} from \"tools.js\"`. Namespaced tools are also available from `tools/<namespace...>.js`; MCP tools use `tools/mcp/<server>.js`, for example `import {{ append_notebook_logs_chart }} from \"tools/mcp/ologs.js\"`. `tools[name]` and identifier wrappers like `await shell(args)` remain available for compatibility when the tool name is a valid JS identifier. Nested tool calls resolve to their code-mode result values. Import `{{ output_text, output_image, set_max_output_tokens_per_exec_call, store, load }}` from `\"@openai/code_mode\"` (or `\"openai/code_mode\"`); `output_text(value)` surfaces text back to the model and stringifies non-string objects when possible, `output_image(imageUrl)` appends an `input_image` content item for `http(s)` or `data:` URLs, `store(key, value)` persists JSON-serializable values across `{PUBLIC_TOOL_NAME}` calls in the current session, `load(key)` returns a cloned stored value or `undefined`, and `set_max_output_tokens_per_exec_call(value)` sets the token budget used to truncate the final Rust-side result of the current `{PUBLIC_TOOL_NAME}` execution. The default is `10000`. This guards the overall `{PUBLIC_TOOL_NAME}` output, not individual nested tool invocations. The returned content starts with a separate `Script completed` or `Script failed` text item that includes wall time. When truncation happens, the final text may include `Total output lines:` and the usual `…N tokens truncated…` marker. Function tools require JSON object arguments. Freeform tools require raw strings. `add_content(value)` remains available for compatibility with a content item, content-item array, or string. 
Structured nested-tool results should be converted to text first, for example with `JSON.stringify(...)`. Only content passed to `output_text(...)`, `output_image(...)`, or `add_content(value)` is surfaced back to the model. Enabled nested tools: {enabled_list}."
);
ToolSpec::Freeform(FreeformTool {

View File

@@ -24,14 +24,35 @@ use std::fs;
use std::time::Duration;
use wiremock::MockServer;
fn custom_tool_output_text_and_success(
/// Returns a copy of the `output` array from the custom tool call output
/// recorded for `call_id`.
///
/// Panics when the `output` field is missing or is not an array.
fn custom_tool_output_items(req: &ResponsesRequest, call_id: &str) -> Vec<Value> {
    let call_output = req.custom_tool_call_output(call_id);
    let output_items = call_output
        .get("output")
        .and_then(Value::as_array)
        .expect("custom tool output should be serialized as content items");
    output_items.clone()
}
/// Returns the string stored under the `text` key of the content item at
/// `index`.
///
/// Panics when `index` is out of bounds or the item has no string `text`.
fn text_item(items: &[Value], index: usize) -> &str {
    let entry = &items[index];
    let text = entry.get("text").and_then(Value::as_str);
    text.expect("content item should be input_text")
}
fn custom_tool_output_body_and_success(
req: &ResponsesRequest,
call_id: &str,
) -> (String, Option<bool>) {
let (output, success) = req
let (_, success) = req
.custom_tool_call_output_content_and_success(call_id)
.expect("custom tool output should be present");
(output.unwrap_or_default(), success)
let items = custom_tool_output_items(req, call_id);
let output = items
.iter()
.skip(1)
.filter_map(|item| item.get("text").and_then(Value::as_str))
.collect();
(output, success)
}
async fn run_code_mode_turn(
@@ -152,13 +173,16 @@ add_content(JSON.stringify(await exec_command({ cmd: "printf code_mode_exec_mark
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"exec call failed unexpectedly: {output}"
let items = custom_tool_output_items(&req, "call-1");
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
let parsed: Value = serde_json::from_str(&output)?;
let parsed: Value = serde_json::from_str(text_item(&items, 1))?;
assert!(
parsed
.get("chunk_id")
@@ -201,22 +225,66 @@ add_content(JSON.stringify(await exec_command({
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"exec call failed unexpectedly: {output}"
let items = custom_tool_output_items(&req, "call-1");
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
let expected_pattern = r#"(?sx)
\A
Original\ token\ count:\ \d+\n
Output:\n
Total\ output\ lines:\ 1\n
\n
\{"chunk_id".*…\d+\ tokens\ truncated….*
.*…\d+\ tokens\ truncated….*
\z
"#;
assert_regex_match(expected_pattern, &output);
assert_regex_match(expected_pattern, text_item(&items, 1));
Ok(())
}
/// Verifies that a throwing script still returns the content it emitted
/// before the failure, preceded by a `Script failed` header and followed by a
/// `Script error:` item carrying the error message and stack frames.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_returns_accumulated_output_when_script_fails() -> Result<()> {
    skip_if_no_network!(Ok(()));

    // Script under test: two content items, then an uncaught exception.
    let script = r#"
add_content("before crash");
add_content("still before crash");
throw new Error("boom");
"#;
    let header_pattern = concat!(
        r"(?s)\A",
        r"Script failed\nWall time \d+\.\d seconds\nOutput:\n\z"
    );
    let error_pattern = r#"(?sx)
\A
Script\ error:\n
Error:\ boom\n
(?:\s+at\ .+\n?)+
\z
"#;

    let server = responses::start_mock_server().await;
    let (_test, second_mock) = run_code_mode_turn(
        &server,
        "use code_mode to surface script failures",
        script,
        false,
    )
    .await?;

    let request = second_mock.single_request();
    let output_items = custom_tool_output_items(&request, "call-1");

    // Expected layout: status header, the two script outputs, the error item.
    assert_eq!(output_items.len(), 4);
    assert_regex_match(header_pattern, text_item(&output_items, 0));
    assert_eq!(text_item(&output_items, 1), "before crash");
    assert_eq!(text_item(&output_items, 2), "still before crash");
    assert_regex_match(error_pattern, text_item(&output_items, 3));

    Ok(())
}
@@ -239,7 +307,7 @@ output_text({ json: true });
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -270,14 +338,25 @@ output_text(circular);
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = req
.custom_tool_call_output_content_and_success("call-1")
.expect("custom tool output should be present");
assert_ne!(
success,
Some(true),
"circular stringify unexpectedly succeeded"
);
assert!(output.contains("exec execution failed"));
assert!(output.contains("Converting circular structure to JSON"));
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script failed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert!(text_item(&items, 1).contains("Script error:"));
assert!(text_item(&items, 1).contains("Converting circular structure to JSON"));
Ok(())
}
@@ -301,28 +380,34 @@ output_image("data:image/png;base64,AAA");
.await?;
let req = second_mock.single_request();
let (_, success) = custom_tool_output_text_and_success(&req, "call-1");
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
"code_mode image output failed unexpectedly"
);
assert_eq!(items.len(), 3);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
assert_eq!(
req.custom_tool_call_output("call-1"),
items[1],
serde_json::json!({
"type": "custom_tool_call_output",
"call_id": "call-1",
"output": [
{
"type": "input_image",
"image_url": "https://example.com/image.jpg"
},
{
"type": "input_image",
"image_url": "data:image/png;base64,AAA"
}
]
})
"type": "input_image",
"image_url": "https://example.com/image.jpg"
}),
);
assert_eq!(
items[2],
serde_json::json!({
"type": "input_image",
"image_url": "data:image/png;base64,AAA"
}),
);
Ok(())
@@ -345,11 +430,22 @@ async fn code_mode_can_apply_patch_via_nested_tool() -> Result<()> {
run_code_mode_turn(&server, "use exec to run apply_patch", &code, true).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let items = custom_tool_output_items(&req, "call-1");
let (_, success) = req
.custom_tool_call_output_content_and_success("call-1")
.expect("custom tool output should be present");
assert_ne!(
success,
Some(false),
"exec apply_patch call failed unexpectedly: {output}"
"exec apply_patch call failed unexpectedly: {items:?}"
);
assert_eq!(items.len(), 2);
assert_regex_match(
concat!(
r"(?s)\A",
r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
),
text_item(&items, 0),
);
let file_path = test.cwd_path().join(file_name);
@@ -381,7 +477,7 @@ add_content(
run_code_mode_turn_with_rmcp(&server, "use exec to run the rmcp echo tool", code).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -420,7 +516,7 @@ add_content(
run_code_mode_turn_with_rmcp(&server, "use exec to run the rmcp echo tool", code).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -464,7 +560,7 @@ add_content(
.await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -505,7 +601,7 @@ add_content(
run_code_mode_turn_with_rmcp(&server, "use exec to call rmcp echo badly", code).await?;
let req = second_mock.single_request();
let (output, success) = custom_tool_output_text_and_success(&req, "call-1");
let (output, success) = custom_tool_output_body_and_success(&req, "call-1");
assert_ne!(
success,
Some(false),
@@ -562,7 +658,7 @@ add_content("stored");
let first_request = first_follow_up.single_request();
let (first_output, first_success) =
custom_tool_output_text_and_success(&first_request, "call-1");
custom_tool_output_body_and_success(&first_request, "call-1");
assert_ne!(
first_success,
Some(false),
@@ -600,7 +696,7 @@ add_content(JSON.stringify(load("nb")));
let second_request = second_follow_up.single_request();
let (second_output, second_success) =
custom_tool_output_text_and_success(&second_request, "call-2");
custom_tool_output_body_and_success(&second_request, "call-2");
assert_ne!(
second_success,
Some(false),