Compare commits

...

1 Commits

Author SHA1 Message Date
Curtis 'Fjord' Hawthorne
b42b2ff9ec Support audio input 2026-05-15 22:11:50 -07:00
37 changed files with 2112 additions and 110 deletions

View File

@@ -997,6 +997,26 @@
],
"title": "InputImageFunctionCallOutputContentItem",
"type": "object"
},
{
"properties": {
"input_audio": {
"$ref": "#/definitions/InputAudio"
},
"type": {
"enum": [
"input_audio"
],
"title": "InputAudioFunctionCallOutputContentItemType",
"type": "string"
}
},
"required": [
"input_audio",
"type"
],
"title": "InputAudioFunctionCallOutputContentItem",
"type": "object"
}
]
},
@@ -1111,6 +1131,21 @@
],
"type": "object"
},
"InputAudio": {
"properties": {
"data": {
"type": "string"
},
"format": {
"type": "string"
}
},
"required": [
"data",
"format"
],
"type": "object"
},
"ListMcpServerStatusParams": {
"properties": {
"cursor": {

View File

@@ -9137,6 +9137,26 @@
],
"title": "InputImageFunctionCallOutputContentItem",
"type": "object"
},
{
"properties": {
"input_audio": {
"$ref": "#/definitions/v2/InputAudio"
},
"type": {
"enum": [
"input_audio"
],
"title": "InputAudioFunctionCallOutputContentItemType",
"type": "string"
}
},
"required": [
"input_audio",
"type"
],
"title": "InputAudioFunctionCallOutputContentItem",
"type": "object"
}
]
},
@@ -9925,6 +9945,21 @@
],
"type": "string"
},
"InputAudio": {
"properties": {
"data": {
"type": "string"
},
"format": {
"type": "string"
}
},
"required": [
"data",
"format"
],
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
@@ -9941,6 +9976,13 @@
"image"
],
"type": "string"
},
{
"description": "Audio content included in tool payloads.",
"enum": [
"audio"
],
"type": "string"
}
]
},

View File

@@ -5526,6 +5526,26 @@
],
"title": "InputImageFunctionCallOutputContentItem",
"type": "object"
},
{
"properties": {
"input_audio": {
"$ref": "#/definitions/InputAudio"
},
"type": {
"enum": [
"input_audio"
],
"title": "InputAudioFunctionCallOutputContentItemType",
"type": "string"
}
},
"required": [
"input_audio",
"type"
],
"title": "InputAudioFunctionCallOutputContentItem",
"type": "object"
}
]
},
@@ -6474,6 +6494,21 @@
"title": "InitializeParams",
"type": "object"
},
"InputAudio": {
"properties": {
"data": {
"type": "string"
},
"format": {
"type": "string"
}
},
"required": [
"data",
"format"
],
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
@@ -6490,6 +6525,13 @@
"image"
],
"type": "string"
},
{
"description": "Audio content included in tool payloads.",
"enum": [
"audio"
],
"type": "string"
}
]
},

View File

@@ -17,6 +17,13 @@
"image"
],
"type": "string"
},
{
"description": "Audio content included in tool payloads.",
"enum": [
"audio"
],
"type": "string"
}
]
},

View File

@@ -140,6 +140,26 @@
],
"title": "InputImageFunctionCallOutputContentItem",
"type": "object"
},
{
"properties": {
"input_audio": {
"$ref": "#/definitions/InputAudio"
},
"type": {
"enum": [
"input_audio"
],
"title": "InputAudioFunctionCallOutputContentItemType",
"type": "string"
}
},
"required": [
"input_audio",
"type"
],
"title": "InputAudioFunctionCallOutputContentItem",
"type": "object"
}
]
},
@@ -150,6 +170,21 @@
],
"type": "string"
},
"InputAudio": {
"properties": {
"data": {
"type": "string"
},
"format": {
"type": "string"
}
},
"required": [
"data",
"format"
],
"type": "object"
},
"LocalShellAction": {
"oneOf": [
{

View File

@@ -199,6 +199,26 @@
],
"title": "InputImageFunctionCallOutputContentItem",
"type": "object"
},
{
"properties": {
"input_audio": {
"$ref": "#/definitions/InputAudio"
},
"type": {
"enum": [
"input_audio"
],
"title": "InputAudioFunctionCallOutputContentItemType",
"type": "string"
}
},
"required": [
"input_audio",
"type"
],
"title": "InputAudioFunctionCallOutputContentItem",
"type": "object"
}
]
},
@@ -209,6 +229,21 @@
],
"type": "string"
},
"InputAudio": {
"properties": {
"data": {
"type": "string"
},
"format": {
"type": "string"
}
},
"required": [
"data",
"format"
],
"type": "object"
},
"LocalShellAction": {
"oneOf": [
{

View File

@@ -2,9 +2,10 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
import type { ImageDetail } from "./ImageDetail";
import type { InputAudio } from "./InputAudio";
/**
* Responses API compatible content items that can be returned by a tool call.
* This is a subset of ContentItem with the types we support as function call outputs.
*/
export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, };
export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, } | { "type": "input_audio", input_audio: InputAudio, };

View File

@@ -0,0 +1,5 @@
// GENERATED CODE! DO NOT MODIFY BY HAND!
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
export type InputAudio = { data: string, format: string, };

View File

@@ -5,4 +5,4 @@
/**
* Canonical user-input modality tags advertised by a model.
*/
export type InputModality = "text" | "image";
export type InputModality = "text" | "image" | "audio";

View File

@@ -36,6 +36,7 @@ export type { ImageDetail } from "./ImageDetail";
export type { InitializeCapabilities } from "./InitializeCapabilities";
export type { InitializeParams } from "./InitializeParams";
export type { InitializeResponse } from "./InitializeResponse";
export type { InputAudio } from "./InputAudio";
export type { InputModality } from "./InputModality";
export type { InternalSessionSource } from "./InternalSessionSource";
export type { LocalShellAction } from "./LocalShellAction";

View File

@@ -9,7 +9,7 @@ use crate::PUBLIC_TOOL_NAME;
const MAX_JS_SAFE_INTEGER: u64 = (1_u64 << 53) - 1;
const DEFERRED_NESTED_TOOLS_GUIDANCE: &str = r#"Some nested MCP/app tools may be omitted from this description. They are still available on the global `tools` object and listed in `ALL_TOOLS`.
To find one, filter `ALL_TOOLS` by `name` and `description`."#;
const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/compose tool calls
const EXEC_DESCRIPTION_TEMPLATE_PREFIX: &str = r#"Run JavaScript code to orchestrate/compose tool calls
- Evaluates the provided JavaScript code in a fresh V8 isolate as an async module.
- All nested tools are available on the global `tools` object, for example `await tools.exec_command(...)`. Tool names are exposed as normalized JavaScript identifiers, for example `await tools.mcp__ologs__get_profile(...)`.
- Nested tool methods take either a string or an object as their input argument.
@@ -24,8 +24,9 @@ const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/co
- Global helpers:
- `exit()`: Immediately ends the current script successfully (like an early return from the top level).
- `text(value: string | number | boolean | undefined | null)`: Appends a text item. Non-string values are stringified with `JSON.stringify(...)` when possible.
- `image(imageUrlOrItem: string | { image_url: string; detail?: "high" | "original" | null } | ImageContent, detail?: "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument.
- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
- `image(imageUrlOrItem: string | { image_url: string; detail?: "high" | "original" | null } | ImageContent, detail?: "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument."#;
const AUDIO_HELPER_DESCRIPTION: &str = r#"- `audio(audioItem: { data: string; format?: string | null; mimeType?: string | null; mime_type?: string | null } | AudioContent)`: Appends an audio item. `data` can be raw base64 audio or a base64-encoded `data:audio/...` URL. To forward an MCP tool audio block, pass an individual `AudioContent` block from `result.content`, for example `audio(result.content[0])`."#;
const EXEC_DESCRIPTION_TEMPLATE_SUFFIX: &str = r#"- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
- `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing.
- `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`.
- `setTimeout(callback: () => void, delayMs?: number)`: schedules a callback to run later and returns a timeout id. Pending timeouts do not keep `exec` alive by themselves; await an explicit promise if you need to wait for one.
@@ -41,7 +42,7 @@ const WAIT_DESCRIPTION_TEMPLATE: &str = r#"- Use `wait` only after `exec` return
- If the cell is still running, `wait` may yield again with the same `cell_id`.
- If the cell has already finished, `wait` returns the completed result and closes the cell."#;
// Based off of https://modelcontextprotocol.io/specification/draft/schema#calltoolresult
const MCP_TYPESCRIPT_PREAMBLE: &str = r#"type Role = "user" | "assistant";
const MCP_TYPESCRIPT_PREAMBLE_PREFIX: &str = r#"type Role = "user" | "assistant";
type MetaObject = Record<string, unknown>;
type Annotations = {
audience?: Role[];
@@ -79,14 +80,16 @@ type ImageContent = {
annotations?: Annotations;
_meta?: MetaObject;
};
type AudioContent = {
"#;
const MCP_AUDIO_CONTENT_TYPE: &str = r#"type AudioContent = {
type: "audio";
data: string;
mimeType: string;
annotations?: Annotations;
_meta?: MetaObject;
};
type ResourceLink = {
"#;
const MCP_TYPESCRIPT_PREAMBLE_SUFFIX: &str = r#"type ResourceLink = {
icons?: Icon[];
name: string;
title?: string;
@@ -106,8 +109,10 @@ type EmbeddedResource = {
};
type ContentBlock =
| TextContent
| ImageContent
| AudioContent
| ImageContent"#;
const MCP_AUDIO_CONTENT_BLOCK_VARIANT: &str = r#"
| AudioContent"#;
const MCP_TYPESCRIPT_PREAMBLE_END: &str = r#"
| ResourceLink
| EmbeddedResource;
type CallToolResult<TStructured = { [key: string]: unknown }> = {
@@ -143,6 +148,13 @@ pub struct ToolNamespaceDescription {
pub description: String,
}
/// Switches consumed by `build_exec_tool_description` to decide which
/// sections are rendered into the `exec` tool description.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ExecToolDescriptionOptions {
/// When `false`, the description is returned right after the base sections
/// (helper text plus the optional deferred-tools guidance).
pub code_mode_only: bool,
/// When `true`, the deferred nested-tools guidance is appended as its own
/// section.
pub deferred_tools_available: bool,
/// When `true`, the `audio(...)` helper entry is listed and the shared MCP
/// types include `AudioContent`.
pub supports_audio_input: bool,
}
#[derive(Debug, Default, Deserialize, PartialEq, Eq)]
#[serde(deny_unknown_fields)]
struct CodeModeExecPragma {
@@ -250,15 +262,21 @@ pub fn is_code_mode_nested_tool(tool_name: &str) -> bool {
pub fn build_exec_tool_description(
enabled_tools: &[ToolDefinition],
namespace_descriptions: &BTreeMap<String, ToolNamespaceDescription>,
code_mode_only: bool,
deferred_tools_available: bool,
options: ExecToolDescriptionOptions,
) -> String {
let mut sections = Vec::new();
sections.push(EXEC_DESCRIPTION_TEMPLATE.to_string());
if deferred_tools_available {
let mut exec_description = String::from(EXEC_DESCRIPTION_TEMPLATE_PREFIX);
if options.supports_audio_input {
exec_description.push('\n');
exec_description.push_str(AUDIO_HELPER_DESCRIPTION);
}
exec_description.push('\n');
exec_description.push_str(EXEC_DESCRIPTION_TEMPLATE_SUFFIX);
sections.push(exec_description);
if options.deferred_tools_available {
sections.push(DEFERRED_NESTED_TOOLS_GUIDANCE.to_string());
}
if !code_mode_only {
if !options.code_mode_only {
return sections.join("\n\n");
}
@@ -305,8 +323,18 @@ pub fn build_exec_tool_description(
}
if has_mcp_tools {
let mut mcp_typescript_preamble = String::from(MCP_TYPESCRIPT_PREAMBLE_PREFIX);
if options.supports_audio_input {
mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_TYPE);
}
mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_SUFFIX);
if options.supports_audio_input {
mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_BLOCK_VARIANT);
}
mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_END);
sections.push(format!(
"Shared MCP Types:\n```ts\n{MCP_TYPESCRIPT_PREAMBLE}\n```"
"Shared MCP Types:\n```ts\n{mcp_typescript_preamble}\n```"
));
}
let nested_tool_reference = nested_tool_sections.join("\n\n");
@@ -706,6 +734,7 @@ fn render_json_schema_literal(value: &JsonValue) -> String {
#[cfg(test)]
mod tests {
use super::CodeModeToolKind;
use super::ExecToolDescriptionOptions;
use super::ParsedExecSource;
use super::ToolDefinition;
use super::ToolNamespaceDescription;
@@ -863,8 +892,11 @@ mod tests {
output_schema: None,
}],
&BTreeMap::new(),
/*code_mode_only*/ true,
/*deferred_tools_available*/ false,
ExecToolDescriptionOptions {
code_mode_only: true,
deferred_tools_available: false,
supports_audio_input: false,
},
);
assert!(description.contains(
"### `foo`
@@ -878,13 +910,41 @@ bar"
let description = build_exec_tool_description(
&[],
&BTreeMap::new(),
/*code_mode_only*/ false,
/*deferred_tools_available*/ false,
ExecToolDescriptionOptions {
code_mode_only: false,
deferred_tools_available: false,
supports_audio_input: false,
},
);
assert!(description.contains("`setTimeout(callback: () => void, delayMs?: number)`"));
assert!(description.contains("`clearTimeout(timeoutId?: number)`"));
}
/// The `audio(...)` helper entry must be listed only when audio input is
/// supported.
#[test]
fn exec_description_gates_audio_helper_on_audio_input_support() {
    // Render the description with every option fixed except audio support.
    let describe = |supports_audio_input: bool| {
        build_exec_tool_description(
            &[],
            &BTreeMap::new(),
            ExecToolDescriptionOptions {
                code_mode_only: false,
                deferred_tools_available: false,
                supports_audio_input,
            },
        )
    };
    let audio_helper_marker = "`audio(audioItem";

    let unsupported_description = describe(false);
    assert!(!unsupported_description.contains(audio_helper_marker));

    let supported_description = describe(true);
    assert!(supported_description.contains(audio_helper_marker));
}
#[test]
fn code_mode_only_description_groups_namespace_instructions_once() {
let namespace_descriptions = BTreeMap::from([(
@@ -930,8 +990,11 @@ bar"
},
],
&namespace_descriptions,
/*code_mode_only*/ true,
/*deferred_tools_available*/ false,
ExecToolDescriptionOptions {
code_mode_only: true,
deferred_tools_available: false,
supports_audio_input: false,
},
);
assert_eq!(description.matches("## mcp__sample").count(), 1);
assert!(description.contains("## mcp__sample\nShared namespace guidance."));
@@ -970,8 +1033,11 @@ bar"
}))),
}],
&namespace_descriptions,
/*code_mode_only*/ true,
/*deferred_tools_available*/ false,
ExecToolDescriptionOptions {
code_mode_only: true,
deferred_tools_available: false,
supports_audio_input: false,
},
);
assert!(!description.contains("## mcp__sample"));
@@ -1069,8 +1135,11 @@ bar"
},
],
&BTreeMap::new(),
/*code_mode_only*/ true,
/*deferred_tools_available*/ false,
ExecToolDescriptionOptions {
code_mode_only: true,
deferred_tools_available: false,
supports_audio_input: false,
},
);
assert_eq!(
@@ -1082,13 +1151,60 @@ bar"
assert_eq!(description.matches("Shared MCP Types:").count(), 1);
}
/// In code-mode-only descriptions with MCP tools, the `AudioContent` type and
/// its `ContentBlock` variant must appear only when audio input is supported.
#[test]
fn code_mode_only_description_gates_mcp_audio_type_on_audio_input_support() {
    let empty_object_schema = || {
        json!({
            "type": "object",
            "properties": {},
            "additionalProperties": false
        })
    };
    let tools = vec![ToolDefinition {
        name: "mcp__sample__audio".to_string(),
        tool_name: ToolName::namespaced("mcp__sample__", "audio"),
        description: "Audio tool".to_string(),
        kind: CodeModeToolKind::Function,
        input_schema: Some(empty_object_schema()),
        output_schema: Some(mcp_call_tool_result_schema(empty_object_schema())),
    }];
    // Render the code-mode-only description, varying only audio support.
    let describe = |supports_audio_input: bool| {
        build_exec_tool_description(
            &tools,
            &BTreeMap::new(),
            ExecToolDescriptionOptions {
                code_mode_only: true,
                deferred_tools_available: false,
                supports_audio_input,
            },
        )
    };

    let unsupported_description = describe(false);
    assert!(!unsupported_description.contains("type AudioContent"));
    assert!(!unsupported_description.contains("| AudioContent"));

    let supported_description = describe(true);
    assert!(supported_description.contains("type AudioContent"));
    assert!(supported_description.contains("| AudioContent"));
}
#[test]
fn exec_description_mentions_deferred_nested_tools_when_available() {
let description = build_exec_tool_description(
&[],
&BTreeMap::new(),
/*code_mode_only*/ false,
/*deferred_tools_available*/ true,
ExecToolDescriptionOptions {
code_mode_only: false,
deferred_tools_available: true,
supports_audio_input: false,
},
);
assert!(description.contains("Some nested MCP/app tools may be omitted"));

View File

@@ -5,6 +5,7 @@ mod service;
pub use description::CODE_MODE_PRAGMA_PREFIX;
pub use description::CodeModeToolKind;
pub use description::ExecToolDescriptionOptions;
pub use description::ToolDefinition;
pub use description::ToolNamespaceDescription;
pub use description::augment_tool_definition;
@@ -18,6 +19,7 @@ pub use description::render_json_schema_to_typescript;
pub use response::DEFAULT_IMAGE_DETAIL;
pub use response::FunctionCallOutputContentItem;
pub use response::ImageDetail;
pub use response::InputAudio;
pub use runtime::CodeModeNestedToolCall;
pub use runtime::DEFAULT_EXEC_YIELD_TIME_MS;
pub use runtime::DEFAULT_MAX_OUTPUT_TOKENS_PER_EXEC_CALL;

View File

@@ -21,4 +21,13 @@ pub enum FunctionCallOutputContentItem {
#[serde(default, skip_serializing_if = "Option::is_none")]
detail: Option<ImageDetail>,
},
InputAudio {
input_audio: InputAudio,
},
}
/// Audio payload carried by `FunctionCallOutputContentItem::InputAudio`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct InputAudio {
/// Encoded audio content.
/// NOTE(review): presumably base64 text without a `data:` URL prefix — confirm against the producers.
pub data: String,
/// Audio format tag.
/// NOTE(review): presumably a short name such as "wav" or "mp3" — confirm the accepted set.
pub format: String,
}

View File

@@ -5,6 +5,7 @@ use super::RuntimeEvent;
use super::RuntimeState;
use super::timers;
use super::value::json_to_v8;
use super::value::normalize_output_audio;
use super::value::normalize_output_image;
use super::value::serialize_output_text;
use super::value::throw_type_error;
@@ -129,6 +130,26 @@ pub(super) fn image_callback(
retval.set(v8::undefined(scope).into());
}
pub(super) fn audio_callback(
scope: &mut v8::PinScope<'_, '_>,
args: v8::FunctionCallbackArguments,
mut retval: v8::ReturnValue<v8::Value>,
) {
let value = if args.length() == 0 {
v8::undefined(scope).into()
} else {
args.get(0)
};
let audio_item = match normalize_output_audio(scope, value) {
Ok(audio_item) => audio_item,
Err(()) => return,
};
if let Some(state) = scope.get_slot::<RuntimeState>() {
let _ = state.event_tx.send(RuntimeEvent::ContentItem(audio_item));
}
retval.set(v8::undefined(scope).into());
}
pub(super) fn store_callback(
scope: &mut v8::PinScope<'_, '_>,
args: v8::FunctionCallbackArguments,

View File

@@ -1,4 +1,5 @@
use super::RuntimeState;
use super::callbacks::audio_callback;
use super::callbacks::clear_timeout_callback;
use super::callbacks::exit_callback;
use super::callbacks::image_callback;
@@ -23,6 +24,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
let set_timeout = helper_function(scope, "setTimeout", set_timeout_callback)?;
let text = helper_function(scope, "text", text_callback)?;
let image = helper_function(scope, "image", image_callback)?;
let audio = helper_function(scope, "audio", audio_callback)?;
let store = helper_function(scope, "store", store_callback)?;
let load = helper_function(scope, "load", load_callback)?;
let notify = helper_function(scope, "notify", notify_callback)?;
@@ -35,6 +37,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
set_global(scope, global, "setTimeout", set_timeout.into())?;
set_global(scope, global, "text", text.into())?;
set_global(scope, global, "image", image.into())?;
set_global(scope, global, "audio", audio.into())?;
set_global(scope, global, "store", store.into())?;
set_global(scope, global, "load", load.into())?;
set_global(scope, global, "notify", notify.into())?;

View File

@@ -3,8 +3,10 @@ use serde_json::Value as JsonValue;
use crate::response::DEFAULT_IMAGE_DETAIL;
use crate::response::FunctionCallOutputContentItem;
use crate::response::ImageDetail;
use crate::response::InputAudio;
const IMAGE_HELPER_EXPECTS_MESSAGE: &str = "image expects a non-empty image URL string, an object with image_url and optional detail, or a raw MCP image block";
const AUDIO_HELPER_EXPECTS_MESSAGE: &str = "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block";
const CODEX_IMAGE_DETAIL_META_KEY: &str = "codex/imageDetail";
pub(super) fn serialize_output_text(
@@ -93,6 +95,35 @@ pub(super) fn normalize_output_image(
}
}
pub(super) fn normalize_output_audio(
scope: &mut v8::PinScope<'_, '_>,
value: v8::Local<'_, v8::Value>,
) -> Result<FunctionCallOutputContentItem, ()> {
let result = (|| -> Result<FunctionCallOutputContentItem, String> {
if !value.is_object() || value.is_array() {
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
}
let object = v8::Local::<v8::Object>::try_from(value)
.map_err(|_| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?;
let input_audio = if let Some(audio) = parse_non_mcp_output_audio(scope, object)? {
audio
} else {
parse_mcp_output_audio(scope, value)?
};
Ok(FunctionCallOutputContentItem::InputAudio { input_audio })
})();
match result {
Ok(item) => Ok(item),
Err(error_text) => {
throw_type_error(scope, &error_text);
Err(())
}
}
}
fn parse_non_mcp_output_image(
scope: &mut v8::PinScope<'_, '_>,
object: v8::Local<'_, v8::Object>,
@@ -161,6 +192,90 @@ fn parse_mcp_output_image(
Ok((image_url, detail))
}
/// Parses an explicit audio object of the form
/// `{ data, format?, mimeType?, mime_type? }`.
///
/// Returns `Ok(None)` when the object has no `data` property (missing or
/// `undefined`), so the caller can try another interpretation. A non-string
/// `data`, a non-string optional property, or a combination that
/// `input_audio_from_data` rejects yields an error message.
fn parse_non_mcp_output_audio(
    scope: &mut v8::PinScope<'_, '_>,
    object: v8::Local<'_, v8::Object>,
) -> Result<Option<InputAudio>, String> {
    let Some(data_key) = v8::String::new(scope, "data") else {
        return Err("failed to allocate audio helper keys".to_string());
    };
    let data = match object.get(scope, data_key.into()) {
        None => return Ok(None),
        Some(raw) if raw.is_undefined() => return Ok(None),
        Some(raw) if raw.is_string() => raw.to_rust_string_lossy(scope),
        Some(_) => return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()),
    };

    let format = optional_string_property(scope, object, "format")?;
    // Both spellings are always read (and validated); the camelCase one wins.
    let camel_case_mime = optional_string_property(scope, object, "mimeType")?;
    let snake_case_mime = optional_string_property(scope, object, "mime_type")?;
    let mime_type = camel_case_mime.or(snake_case_mime);

    codex_protocol::models::input_audio_from_data(
        &data,
        format.as_deref(),
        mime_type.as_deref(),
    )
    .map(|audio| {
        Some(InputAudio {
            data: audio.data,
            format: audio.format,
        })
    })
    .ok_or_else(|| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())
}
/// Parses a raw MCP audio content block (`{ type: "audio", data, mimeType }`).
///
/// The value is converted to JSON and must be an object whose `type` is
/// `"audio"` and whose `data` is a string. The mime type is read from
/// `mimeType`, falling back to `mime_type`. The final audio item is produced
/// by `input_audio_from_data` with no explicit format.
fn parse_mcp_output_audio(
    scope: &mut v8::PinScope<'_, '_>,
    value: v8::Local<'_, v8::Value>,
) -> Result<InputAudio, String> {
    let expects_error = || AUDIO_HELPER_EXPECTS_MESSAGE.to_string();

    let Some(JsonValue::Object(block)) = v8_value_to_json(scope, value)? else {
        return Err(expects_error());
    };

    match block.get("type").and_then(JsonValue::as_str) {
        Some("audio") => {}
        Some(item_type) => {
            return Err(format!(
                "audio only accepts MCP audio blocks, got \"{item_type}\""
            ));
        }
        None => return Err(expects_error()),
    }

    let Some(data) = block.get("data").and_then(JsonValue::as_str) else {
        return Err("audio expected MCP audio data".to_string());
    };
    let mime_type = block
        .get("mimeType")
        .or_else(|| block.get("mime_type"))
        .and_then(JsonValue::as_str);

    codex_protocol::models::input_audio_from_data(data, /*format*/ None, mime_type)
        .map(|audio| InputAudio {
            data: audio.data,
            format: audio.format,
        })
        .ok_or_else(expects_error)
}
/// Reads the property `name` from `object` as an optional string.
///
/// Returns `Ok(Some(..))` for a string value and `Ok(None)` when the property
/// is absent, `null`, or `undefined`. Any other value type is an error.
fn optional_string_property(
    scope: &mut v8::PinScope<'_, '_>,
    object: v8::Local<'_, v8::Object>,
    name: &str,
) -> Result<Option<String>, String> {
    let Some(key) = v8::String::new(scope, name) else {
        return Err("failed to allocate audio helper keys".to_string());
    };
    let Some(property) = object.get(scope, key.into()) else {
        return Ok(None);
    };
    if property.is_string() {
        return Ok(Some(property.to_rust_string_lossy(scope)));
    }
    if property.is_null() || property.is_undefined() {
        return Ok(None);
    }
    Err(format!("{name} must be a string when provided"))
}
fn parse_image_detail_value<'s>(
scope: &mut v8::PinScope<'s, '_>,
value: Option<v8::Local<'s, v8::Value>>,

View File

@@ -703,6 +703,7 @@ mod tests {
use super::run_session_control;
use crate::CodeModeToolKind;
use crate::FunctionCallOutputContentItem;
use crate::InputAudio;
use crate::ToolDefinition;
use crate::runtime::ExecuteRequest;
use crate::runtime::ExecuteToPendingOutcome;
@@ -1230,6 +1231,7 @@ text(formatter.format(new Date("2025-01-02T03:04:05Z")));
const returnsUndefined = [
text("first"),
image("https://example.com/image.jpg"),
audio({ data: "BASE64", format: "wav" }),
notify("ping"),
].map((value) => value === undefined);
text(JSON.stringify(returnsUndefined));
@@ -1253,8 +1255,14 @@ text(JSON.stringify(returnsUndefined));
image_url: "https://example.com/image.jpg".to_string(),
detail: Some(crate::DEFAULT_IMAGE_DETAIL),
},
FunctionCallOutputContentItem::InputAudio {
input_audio: InputAudio {
data: "BASE64".to_string(),
format: "wav".to_string(),
},
},
FunctionCallOutputContentItem::InputText {
text: "[true,true,true]".to_string(),
text: "[true,true,true,true]".to_string(),
},
],
stored_values: HashMap::new(),
@@ -1441,6 +1449,147 @@ image({
);
}
/// `audio({ data, format })` emits a single audio item with those exact values.
#[tokio::test]
async fn audio_helper_accepts_explicit_object() {
    let expected = RuntimeResponse::Result {
        cell_id: "1".to_string(),
        content_items: vec![FunctionCallOutputContentItem::InputAudio {
            input_audio: InputAudio {
                data: "BASE64".to_string(),
                format: "wav".to_string(),
            },
        }],
        stored_values: HashMap::new(),
        error_text: None,
    };
    let request = ExecuteRequest {
        source: r#"audio({ data: "BASE64", format: "wav" });"#.to_string(),
        yield_time_ms: None,
        ..execute_request("")
    };

    let service = CodeModeService::new();
    let response = service.execute(request).await.unwrap();

    assert_eq!(response, expected);
}
/// A `data:audio/mpeg;base64,...` URL is reduced to its payload and the format
/// is derived from the media type.
#[tokio::test]
async fn audio_helper_strips_data_url_and_derives_format() {
    let expected = RuntimeResponse::Result {
        cell_id: "1".to_string(),
        content_items: vec![FunctionCallOutputContentItem::InputAudio {
            input_audio: InputAudio {
                data: "BASE64".to_string(),
                format: "mp3".to_string(),
            },
        }],
        stored_values: HashMap::new(),
        error_text: None,
    };
    let request = ExecuteRequest {
        source: r#"audio({ data: "data:audio/mpeg;base64,BASE64" });"#.to_string(),
        yield_time_ms: None,
        ..execute_request("")
    };

    let service = CodeModeService::new();
    let response = service.execute(request).await.unwrap();

    assert_eq!(response, expected);
}
/// A raw MCP audio block (`type: "audio"` with `mimeType`) is accepted and its
/// mime type is mapped to a format.
#[tokio::test]
async fn audio_helper_accepts_raw_mcp_audio_block() {
    let expected = RuntimeResponse::Result {
        cell_id: "1".to_string(),
        content_items: vec![FunctionCallOutputContentItem::InputAudio {
            input_audio: InputAudio {
                data: "BASE64".to_string(),
                format: "ogg".to_string(),
            },
        }],
        stored_values: HashMap::new(),
        error_text: None,
    };
    let request = ExecuteRequest {
        source: r#"audio({ type: "audio", data: "BASE64", mimeType: "audio/ogg" });"#
            .to_string(),
        yield_time_ms: None,
        ..execute_request("")
    };

    let service = CodeModeService::new();
    let response = service.execute(request).await.unwrap();

    assert_eq!(response, expected);
}
/// Passing a bare string to `audio(...)` fails with the helper's usage message
/// and emits no content items.
#[tokio::test]
async fn audio_helper_rejects_bare_string() {
    let expected = RuntimeResponse::Result {
        cell_id: "1".to_string(),
        content_items: Vec::new(),
        stored_values: HashMap::new(),
        error_text: Some(
            "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
        ),
    };
    let request = ExecuteRequest {
        source: r#"audio("BASE64");"#.to_string(),
        yield_time_ms: None,
        ..execute_request("")
    };

    let service = CodeModeService::new();
    let response = service.execute(request).await.unwrap();

    assert_eq!(response, expected);
}
/// A non-audio mime type (`application/octet-stream`) is rejected with the
/// helper's usage message and emits no content items.
#[tokio::test]
async fn audio_helper_rejects_non_audio_mime_type() {
    let expected = RuntimeResponse::Result {
        cell_id: "1".to_string(),
        content_items: Vec::new(),
        stored_values: HashMap::new(),
        error_text: Some(
            "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
        ),
    };
    let request = ExecuteRequest {
        source: r#"audio({ data: "BASE64", mimeType: "application/octet-stream" });"#
            .to_string(),
        yield_time_ms: None,
        ..execute_request("")
    };

    let service = CodeModeService::new();
    let response = service.execute(request).await.unwrap();

    assert_eq!(response, expected);
}
#[tokio::test]
async fn wait_reports_missing_cell_separately_from_runtime_results() {
let service = CodeModeService::new();

View File

@@ -113,9 +113,8 @@ impl ContextManager {
}
/// Returns the history prepared for sending to the model. This applies a proper
/// normalization and drops un-suited items. When `input_modalities` does not
/// include `InputModality::Image`, images are stripped from messages and tool
/// outputs.
/// normalization and drops un-suited items. Unsupported media content is
/// stripped from messages and tool outputs according to `input_modalities`.
pub(crate) fn for_prompt(mut self, input_modalities: &[InputModality]) -> Vec<ResponseItem> {
self.normalize_history(input_modalities);
self.items
@@ -365,8 +364,8 @@ impl ContextManager {
// all outputs must have a corresponding function/tool call
normalize::remove_orphan_outputs(&mut self.items);
// strip images when model does not support them
normalize::strip_images_when_unsupported(input_modalities, &mut self.items);
// strip unsupported media content before sending history to the model
normalize::strip_unsupported_media_content(input_modalities, &mut self.items);
}
fn process_item(&self, item: &ResponseItem, policy: TruncationPolicy) -> ResponseItem {

View File

@@ -10,6 +10,7 @@ use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ImageDetail;
use codex_protocol::models::InputAudio;
use codex_protocol::models::LocalShellAction;
use codex_protocol::models::LocalShellExecAction;
use codex_protocol::models::LocalShellStatus;
@@ -513,6 +514,85 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
}
}
/// `for_prompt` replaces audio tool output with a placeholder text item under
/// the default modalities and keeps it intact when `InputModality::Audio` is
/// listed.
#[test]
fn for_prompt_strips_audio_when_model_does_not_support_audio() {
    // Small builders for the items that recur in the fixture and expectations.
    let audio_tool_call = || ResponseItem::FunctionCall {
        id: None,
        name: "audio_tool".to_string(),
        namespace: None,
        arguments: "{}".to_string(),
        call_id: "call-1".to_string(),
    };
    let text_item = |text: &str| FunctionCallOutputContentItem::InputText {
        text: text.to_string(),
    };
    let audio_item = || FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "UklGRg==".to_string(),
            format: "wav".to_string(),
        },
    };
    let tool_output = |content_items: Vec<FunctionCallOutputContentItem>| {
        ResponseItem::FunctionCallOutput {
            call_id: "call-1".to_string(),
            output: FunctionCallOutputPayload::from_content_items(content_items),
        }
    };

    let history = create_history_with_items(vec![
        audio_tool_call(),
        tool_output(vec![text_item("audio result"), audio_item()]),
    ]);

    let default_modalities = default_input_modalities();
    let stripped = history.clone().for_prompt(&default_modalities);
    assert_eq!(
        stripped,
        vec![
            audio_tool_call(),
            tool_output(vec![
                text_item("audio result"),
                text_item("audio content omitted because you do not support audio input"),
            ]),
        ]
    );

    let audio_modalities = [
        InputModality::Text,
        InputModality::Image,
        InputModality::Audio,
    ];
    let with_audio = history.for_prompt(&audio_modalities);
    assert_eq!(
        with_audio[1],
        tool_output(vec![text_item("audio result"), audio_item()])
    );
}
#[test]
fn for_prompt_preserves_image_generation_calls_when_images_are_supported() {
let history = create_history_with_items(vec![
@@ -1048,6 +1128,46 @@ fn record_items_truncates_function_call_output_content() {
}
}
// Recording an audio item that exceeds the byte budget must store only a text notice and
// must not keep the original audio payload anywhere in the recorded output.
#[test]
fn record_items_omits_over_budget_audio_content() {
    let oversized_payload = "A".repeat(1_000);
    let audio_output = ResponseItem::FunctionCallOutput {
        call_id: "call-audio".to_string(),
        output: FunctionCallOutputPayload::from_content_items(vec![
            FunctionCallOutputContentItem::InputAudio {
                input_audio: InputAudio {
                    data: oversized_payload.clone(),
                    format: "wav".to_string(),
                },
            },
        ]),
    };

    let mut history = ContextManager::new();
    history.record_items([&audio_output], TruncationPolicy::Bytes(32));

    assert_eq!(history.items.len(), 1);
    let recorded = &history.items[0];
    let ResponseItem::FunctionCallOutput { output, .. } = recorded else {
        panic!("unexpected history item: {recorded:?}");
    };

    let expected_notice = FunctionCallOutputPayload::from_content_items(vec![
        FunctionCallOutputContentItem::InputText {
            text: "[omitted 1 audio item because its size exceeds the output truncation budget]"
                .to_string(),
        },
    ]);
    assert_eq!(output, &expected_notice);

    let debug_dump = format!("{output:?}");
    assert!(
        !debug_dump.contains(&oversized_payload),
        "over-budget audio data should not be retained"
    );
}
#[test]
fn record_items_truncates_custom_tool_call_output_content() {
let mut history = ContextManager::new();

View File

@@ -10,6 +10,8 @@ use tracing::info;
const IMAGE_CONTENT_OMITTED_PLACEHOLDER: &str =
"image content omitted because you do not support image input";
const AUDIO_CONTENT_OMITTED_PLACEHOLDER: &str =
"audio content omitted because you do not support audio input";
pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
// Collect synthetic outputs to insert immediately after their calls.
@@ -290,14 +292,14 @@ where
}
}
/// Strip image content from messages and tool outputs when the model does not support images.
/// When `input_modalities` contains `InputModality::Image`, no stripping is performed.
pub(crate) fn strip_images_when_unsupported(
/// Strip unsupported media content from messages and tool outputs.
pub(crate) fn strip_unsupported_media_content(
input_modalities: &[InputModality],
items: &mut [ResponseItem],
) {
let supports_images = input_modalities.contains(&InputModality::Image);
if supports_images {
let supports_audio = input_modalities.contains(&InputModality::Audio);
if supports_images && supports_audio {
return;
}
@@ -307,7 +309,7 @@ pub(crate) fn strip_images_when_unsupported(
let mut normalized_content = Vec::with_capacity(content.len());
for content_item in content.iter() {
match content_item {
ContentItem::InputImage { .. } => {
ContentItem::InputImage { .. } if !supports_images => {
normalized_content.push(ContentItem::InputText {
text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(),
});
@@ -323,20 +325,29 @@ pub(crate) fn strip_images_when_unsupported(
let mut normalized_content_items = Vec::with_capacity(content_items.len());
for content_item in content_items.iter() {
match content_item {
FunctionCallOutputContentItem::InputImage { .. } => {
FunctionCallOutputContentItem::InputImage { .. }
if !supports_images =>
{
normalized_content_items.push(
FunctionCallOutputContentItem::InputText {
text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(),
},
);
}
FunctionCallOutputContentItem::InputAudio { .. } if !supports_audio => {
normalized_content_items.push(
FunctionCallOutputContentItem::InputText {
text: AUDIO_CONTENT_OMITTED_PLACEHOLDER.to_string(),
},
);
}
_ => normalized_content_items.push(content_item.clone()),
}
}
*content_items = normalized_content_items;
}
}
ResponseItem::ImageGenerationCall { result, .. } => {
ResponseItem::ImageGenerationCall { result, .. } if !supports_images => {
result.clear();
}
_ => {}

View File

@@ -582,13 +582,8 @@ async fn execute_mcp_tool_call(
)
.await
.map_err(|e| format!("tool call error: {e:?}"))?;
let result = sanitize_mcp_tool_result_for_model(
turn_context
.model_info
.input_modalities
.contains(&InputModality::Image),
Ok(result),
)?;
let result =
sanitize_mcp_tool_result_for_model(&turn_context.model_info.input_modalities, Ok(result))?;
Ok(maybe_request_codex_apps_auth_elicitation(
sess,
turn_context,
@@ -776,36 +771,61 @@ async fn maybe_mark_thread_memory_mode_polluted(
}
fn sanitize_mcp_tool_result_for_model(
supports_image_input: bool,
input_modalities: &[InputModality],
result: Result<CallToolResult, String>,
) -> Result<CallToolResult, String> {
if supports_image_input {
return result;
}
let supports_image_input = input_modalities.contains(&InputModality::Image);
let supports_audio_input = input_modalities.contains(&InputModality::Audio);
result.map(|call_tool_result| CallToolResult {
content: call_tool_result
.content
.iter()
.map(|block| {
if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
&& content_type == "image"
{
return serde_json::json!({
"type": "text",
"text": "<image content omitted because you do not support image input>",
});
}
result.and_then(|call_tool_result| {
if !supports_audio_input
&& !has_non_null_structured_content(&call_tool_result)
&& call_tool_result
.content
.iter()
.any(|block| block.get("type").and_then(serde_json::Value::as_str) == Some("audio"))
{
return Err(
"audio content returned by MCP tool but the selected model does not support audio input"
.to_string(),
);
}
block.clone()
})
.collect::<Vec<_>>(),
structured_content: call_tool_result.structured_content,
is_error: call_tool_result.is_error,
meta: call_tool_result.meta,
if supports_image_input {
return Ok(call_tool_result);
}
Ok(CallToolResult {
content: call_tool_result
.content
.iter()
.map(|block| {
if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
&& content_type == "image"
{
return serde_json::json!({
"type": "text",
"text": "<image content omitted because you do not support image input>",
});
}
block.clone()
})
.collect::<Vec<_>>(),
structured_content: call_tool_result.structured_content,
is_error: call_tool_result.is_error,
meta: call_tool_result.meta,
})
})
}
/// Returns `true` only when `structured_content` is present and is not JSON `null`.
fn has_non_null_structured_content(call_tool_result: &CallToolResult) -> bool {
    match &call_tool_result.structured_content {
        Some(value) => !value.is_null(),
        None => false,
    }
}
fn truncate_mcp_tool_result_for_event(
result: &Result<CallToolResult, String>,
) -> Result<CallToolResult, String> {

View File

@@ -924,7 +924,7 @@ fn sanitize_mcp_tool_result_for_model_rewrites_image_content() {
meta: None,
});
let got = sanitize_mcp_tool_result_for_model(/*supports_image_input*/ false, result)
let got = sanitize_mcp_tool_result_for_model(&[InputModality::Text], result)
.expect("sanitized result");
assert_eq!(
@@ -956,7 +956,7 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
};
let got = sanitize_mcp_tool_result_for_model(
/*supports_image_input*/ true,
&[InputModality::Text, InputModality::Image],
Ok(original.clone()),
)
.expect("unsanitized result");
@@ -964,6 +964,73 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
assert_eq!(got, original);
}
// An audio block with no structured content is an error for a text-only model.
#[test]
fn sanitize_mcp_tool_result_for_model_rejects_audio_when_unsupported() {
    let audio_block = serde_json::json!({
        "type": "audio",
        "data": "UklGRg==",
        "mimeType": "audio/wav",
    });
    let tool_result = CallToolResult {
        content: vec![audio_block],
        structured_content: None,
        is_error: Some(false),
        meta: None,
    };

    let outcome = sanitize_mcp_tool_result_for_model(&[InputModality::Text], Ok(tool_result));

    let message = outcome.expect_err("unsupported audio should fail");
    assert_eq!(
        message,
        "audio content returned by MCP tool but the selected model does not support audio input"
    );
}
// When the model lists audio as an input modality the tool result comes back unchanged.
#[test]
fn sanitize_mcp_tool_result_for_model_preserves_audio_when_supported() {
    let audio_capable = [
        InputModality::Text,
        InputModality::Image,
        InputModality::Audio,
    ];
    let audio_block = serde_json::json!({
        "type": "audio",
        "data": "UklGRg==",
        "mimeType": "audio/wav",
    });
    let tool_result = CallToolResult {
        content: vec![audio_block],
        structured_content: None,
        is_error: Some(false),
        meta: Some(serde_json::json!({"k": "v"})),
    };

    let sanitized = sanitize_mcp_tool_result_for_model(&audio_capable, Ok(tool_result.clone()))
        .expect("supported audio should remain unchanged");

    assert_eq!(sanitized, tool_result);
}
// A result that carries non-null structured content is not rejected for its audio block,
// even when the model is text-only.
#[test]
fn sanitize_mcp_tool_result_for_model_lets_structured_content_take_precedence_over_audio() {
    let audio_block = serde_json::json!({
        "type": "audio",
        "data": "UklGRg==",
        "mimeType": "audio/wav",
    });
    let tool_result = CallToolResult {
        content: vec![audio_block],
        structured_content: Some(serde_json::json!({"answer": "structured"})),
        is_error: Some(false),
        meta: None,
    };

    let sanitized =
        sanitize_mcp_tool_result_for_model(&[InputModality::Text], Ok(tool_result.clone()))
            .expect("structured content should take precedence");

    assert_eq!(sanitized, tool_result);
}
#[test]
fn truncate_mcp_tool_result_for_event_preserves_small_result() {
let original = CallToolResult {

View File

@@ -7,8 +7,7 @@ use std::collections::BTreeMap;
pub(crate) fn create_code_mode_tool(
enabled_tools: &[CodeModeToolDefinition],
namespace_descriptions: &BTreeMap<String, codex_code_mode::ToolNamespaceDescription>,
code_mode_only: bool,
deferred_tools_available: bool,
options: codex_code_mode::ExecToolDescriptionOptions,
) -> ToolSpec {
const CODE_MODE_FREEFORM_GRAMMAR: &str = r#"
start: pragma_source | plain_source
@@ -25,8 +24,7 @@ SOURCE: /[\s\S]+/
description: codex_code_mode::build_exec_tool_description(
enabled_tools,
namespace_descriptions,
code_mode_only,
deferred_tools_available,
options,
),
format: FreeformToolFormat {
r#type: "grammar".to_string(),
@@ -57,16 +55,22 @@ mod tests {
create_code_mode_tool(
&enabled_tools,
&BTreeMap::new(),
/*code_mode_only*/ true,
/*deferred_tools_available*/ false,
codex_code_mode::ExecToolDescriptionOptions {
code_mode_only: true,
deferred_tools_available: false,
supports_audio_input: false,
},
),
ToolSpec::Freeform(FreeformTool {
name: codex_code_mode::PUBLIC_TOOL_NAME.to_string(),
description: codex_code_mode::build_exec_tool_description(
&enabled_tools,
&BTreeMap::new(),
/*code_mode_only*/ true,
/*deferred_tools_available*/ false
codex_code_mode::ExecToolDescriptionOptions {
code_mode_only: true,
deferred_tools_available: false,
supports_audio_input: false,
}
),
format: FreeformToolFormat {
r#type: "grammar".to_string(),

View File

@@ -14,6 +14,7 @@ use codex_code_mode::RuntimeResponse;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::openai_models::InputModality;
use serde_json::Value as JsonValue;
use tokio_util::sync::CancellationToken;
@@ -168,6 +169,9 @@ pub(super) async fn handle_runtime_response(
match response {
RuntimeResponse::Yielded { content_items, .. } => {
let mut content_items = into_function_call_output_content_items(content_items);
if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
return Ok(output);
}
sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
content_items = truncate_code_mode_result(content_items, max_output_tokens);
prepend_script_status(&mut content_items, &script_status, started_at.elapsed());
@@ -175,6 +179,9 @@ pub(super) async fn handle_runtime_response(
}
RuntimeResponse::Terminated { content_items, .. } => {
let mut content_items = into_function_call_output_content_items(content_items);
if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
return Ok(output);
}
sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
content_items = truncate_code_mode_result(content_items, max_output_tokens);
prepend_script_status(&mut content_items, &script_status, started_at.elapsed());
@@ -187,12 +194,15 @@ pub(super) async fn handle_runtime_response(
..
} => {
let mut content_items = into_function_call_output_content_items(content_items);
sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
exec.session
.services
.code_mode_service
.replace_stored_values(stored_values)
.await;
if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
return Ok(output);
}
sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
let success = error_text.is_none();
if let Some(error_text) = error_text {
content_items.push(FunctionCallOutputContentItem::InputText {
@@ -209,6 +219,29 @@ pub(super) async fn handle_runtime_response(
}
}
/// Builds the failure output used when code mode emitted audio for a model without audio input.
///
/// Returns `None` when the turn's model lists `InputModality::Audio`, or when `items`
/// contains no `InputAudio` entry; otherwise returns a text output flagged as unsuccessful.
fn unsupported_audio_output(
    turn: &TurnContext,
    items: &[FunctionCallOutputContentItem],
) -> Option<FunctionToolOutput> {
    let model_accepts_audio = turn
        .model_info
        .input_modalities
        .contains(&InputModality::Audio);
    if model_accepts_audio {
        return None;
    }

    let emitted_audio = items
        .iter()
        .any(|item| matches!(item, FunctionCallOutputContentItem::InputAudio { .. }));
    emitted_audio.then(|| {
        FunctionToolOutput::from_text(
            "audio content emitted by code mode but the selected model does not support audio input"
                .to_string(),
            Some(false),
        )
    })
}
/// Sanitizes image-detail settings on `items` according to whether the turn's model may
/// request original image detail.
fn sanitize_runtime_image_detail(turn: &TurnContext, items: &mut [FunctionCallOutputContentItem]) {
    let allow_original_detail = can_request_original_image_detail(&turn.model_info);
    sanitize_image_detail_items(allow_original_detail, items);
}

View File

@@ -40,6 +40,14 @@ impl IntoProtocol<FunctionCallOutputContentItem>
.or(Some(DEFAULT_IMAGE_DETAIL)),
}
}
codex_code_mode::FunctionCallOutputContentItem::InputAudio { input_audio } => {
FunctionCallOutputContentItem::InputAudio {
input_audio: codex_protocol::models::InputAudio {
data: input_audio.data,
format: input_audio.format,
},
}
}
}
}
}

View File

@@ -253,8 +253,11 @@ fn build_code_mode_executors(
create_code_mode_tool(
&enabled_tools,
&namespace_descriptions,
config.code_mode_only_enabled,
deferred_tools_available,
codex_code_mode::ExecToolDescriptionOptions {
code_mode_only: config.code_mode_only_enabled,
deferred_tools_available,
supports_audio_input: config.supports_audio_input,
},
),
code_mode_nested_tool_specs,
)),

View File

@@ -2352,6 +2352,70 @@ fn code_mode_exec_description_omits_nested_tool_details_when_not_code_mode_only(
assert!(!description.contains("### `view_image`"));
}
// The `exec` tool description documents the `audio(...)` helper only when the model's
// input modalities include audio.
#[test]
fn code_mode_exec_audio_helper_docs_require_audio_input_support() {
    let mut features = Features::with_defaults();
    features.enable(Feature::CodeMode);
    let available_models = Vec::new();

    // Builds the tool specs for `model_info` and returns the `exec` tool's description.
    let exec_description = |model_info: &ModelInfo| -> String {
        let tools_config = ToolsConfig::new(&ToolsConfigParams {
            model_info,
            available_models: &available_models,
            features: &features,
            image_generation_tool_auth_allowed: true,
            web_search_mode: Some(WebSearchMode::Cached),
            session_source: SessionSource::Cli,
            permission_profile: &PermissionProfile::Disabled,
            windows_sandbox_level: WindowsSandboxLevel::Disabled,
        });
        let (tools, _) = build_specs(
            &tools_config,
            /*mcp_tools*/ None,
            /*deferred_mcp_tools*/ None,
            &[],
        );
        let ToolSpec::Freeform(FreeformTool { description, .. }) = find_tool(&tools, "exec")
        else {
            panic!("expected freeform tool");
        };
        description.clone()
    };

    let text_and_image_model = model_info();
    let mut audio_model = text_and_image_model.clone();
    audio_model.input_modalities = vec![
        InputModality::Text,
        InputModality::Image,
        InputModality::Audio,
    ];

    assert!(!exec_description(&text_and_image_model).contains("`audio(audioItem"));
    assert!(exec_description(&audio_model).contains("`audio(audioItem"));
}
fn model_info() -> ModelInfo {
serde_json::from_value(json!({
"slug": "gpt-5-codex",

View File

@@ -12,6 +12,7 @@ use codex_protocol::dynamic_tools::DynamicToolCallOutputContentItem;
use codex_protocol::dynamic_tools::DynamicToolResponse;
use codex_protocol::dynamic_tools::DynamicToolSpec;
use codex_protocol::models::PermissionProfile;
use codex_protocol::openai_models::InputModality;
use codex_protocol::protocol::AskForApproval;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::Op;
@@ -175,6 +176,54 @@ async fn run_code_mode_turn(
Ok((test, second_mock))
}
/// Runs one code-mode turn against a `gpt-5.4` catalog entry patched to accept audio input.
///
/// The first mocked response issues an `exec` custom tool call carrying `code`; the second
/// mocked response finishes the turn. Returns the test fixture together with the mock that
/// captured the follow-up request.
async fn run_code_mode_turn_with_audio_model(
    server: &MockServer,
    prompt: &str,
    code: &str,
) -> Result<(TestCodex, ResponseMock)> {
    const AUDIO_MODEL_SLUG: &str = "gpt-5.4";

    let mut builder = test_codex()
        .with_model(AUDIO_MODEL_SLUG)
        .with_config(move |config| {
            let _ = config.features.enable(Feature::CodeMode);
            let mut model_catalog = bundled_models_response()
                .unwrap_or_else(|err| panic!("bundled models.json should parse: {err}"));
            // Patch the bundled entry so the selected model advertises audio input.
            let audio_model = model_catalog
                .models
                .iter_mut()
                .find(|model| model.slug == AUDIO_MODEL_SLUG)
                .expect("gpt-5.4 exists in bundled models.json");
            audio_model.input_modalities = vec![
                InputModality::Text,
                InputModality::Image,
                InputModality::Audio,
            ];
            config.model_catalog = Some(model_catalog);
        });
    let test = builder.build(server).await?;

    let exec_call_stream = sse(vec![
        ev_response_created("resp-1"),
        ev_custom_tool_call("call-1", "exec", code),
        ev_completed("resp-1"),
    ]);
    responses::mount_sse_once(server, exec_call_stream).await;

    let completion_stream = sse(vec![
        ev_assistant_message("msg-1", "done"),
        ev_completed("resp-2"),
    ]);
    let followup_mock = responses::mount_sse_once(server, completion_stream).await;

    test.submit_turn(prompt).await?;
    Ok((test, followup_mock))
}
async fn run_code_mode_turn_with_rmcp(
server: &MockServer,
prompt: &str,
@@ -1974,6 +2023,77 @@ image("data:image/png;base64,AAA");
Ok(())
}
// Two `audio(...)` calls (explicit format, and a base64 data URL) must surface as two
// `input_audio` items after the script-status text item.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_output_audio_via_global_helper_for_audio_model() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
    let script = r#"
audio({ data: "BASE64", format: "wav" });
audio({ data: "data:audio/mpeg;base64,MP3BASE64" });
"#;
    let (_test, followup_mock) =
        run_code_mode_turn_with_audio_model(&server, "use exec to return audio", script).await?;

    let request = followup_mock.single_request();
    let items = custom_tool_output_items(&request, "call-1");
    let (_, success) = custom_tool_output_body_and_success(&request, "call-1");
    assert_ne!(
        success,
        Some(false),
        "code_mode audio output failed unexpectedly"
    );

    assert_eq!(items.len(), 3);
    assert_regex_match(
        r"(?s)\AScript completed\nWall time \d+\.\d seconds\nOutput:\n\z",
        text_item(&items, /*index*/ 0),
    );

    // Items 1 and 2 carry the audio payloads in emission order.
    let expected_audio = [("BASE64", "wav"), ("MP3BASE64", "mp3")];
    for (offset, (data, format)) in expected_audio.into_iter().enumerate() {
        assert_eq!(
            items[offset + 1],
            serde_json::json!({
                "type": "input_audio",
                "input_audio": { "data": data, "format": format }
            }),
        );
    }

    Ok(())
}
// Emitting audio from code mode on the default (non-audio) test model replaces the tool
// output with the fixed explanatory error text.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_audio_output_fails_for_non_audio_model() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = responses::start_mock_server().await;
    let script = r#"audio({ data: "BASE64", format: "wav" });"#;
    let (_test, followup_mock) =
        run_code_mode_turn(&server, "use exec to return audio", script).await?;

    let request = followup_mock.single_request();
    let (body, _success) = custom_tool_output_body_and_success(&request, "call-1");
    assert_eq!(
        body,
        "audio content emitted by code mode but the selected model does not support audio input"
    );

    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> {
skip_if_no_network!(Ok(()));
@@ -2404,6 +2524,7 @@ text(JSON.stringify(Object.getOwnPropertyNames(globalThis).sort()));
"WeakSet",
"__codexContentItems",
"add_content",
"audio",
"decodeURI",
"decodeURIComponent",
"encodeURI",

View File

@@ -93,6 +93,50 @@ fn assert_wall_time_header(output: &str) {
assert_eq!(marker, "Output:");
}
/// Builds a `ModelInfo` test fixture whose only caller-controlled fields are the slug,
/// the description, and the advertised input modalities.
///
/// `slug` is used for both `slug` and `display_name`; every other field is a fixed value
/// chosen by this helper.
fn test_model_info_with_modalities(
slug: &str,
description: &str,
input_modalities: Vec<InputModality>,
) -> ModelInfo {
ModelInfo {
slug: slug.to_string(),
display_name: slug.to_string(),
description: Some(description.to_string()),
default_reasoning_level: None,
supported_reasoning_levels: vec![ReasoningEffortPreset {
effort: codex_protocol::openai_models::ReasoningEffort::Medium,
description: "Medium".to_string(),
}],
shell_type: ConfigShellToolType::Default,
visibility: ModelVisibility::List,
supported_in_api: true,
priority: 1,
additional_speed_tiers: Vec::new(),
service_tiers: Vec::new(),
upgrade: None,
base_instructions: "base instructions".to_string(),
model_messages: None,
supports_reasoning_summaries: false,
default_reasoning_summary: ReasoningSummary::Auto,
support_verbosity: false,
default_verbosity: None,
availability_nux: None,
apply_patch_tool_type: None,
web_search_tool_type: Default::default(),
truncation_policy: TruncationPolicyConfig::bytes(/*limit*/ 10_000),
supports_parallel_tool_calls: false,
supports_image_detail_original: false,
context_window: Some(272_000),
max_context_window: None,
auto_compact_token_limit: None,
effective_context_window_percent: 95,
experimental_supported_tools: Vec::new(),
// The one field the audio tests vary: which input modalities the model advertises.
input_modalities,
used_fallback_model_metadata: false,
supports_search_tool: false,
}
}
fn read_only_user_turn(fixture: &TestCodex, text: impl Into<String>) -> Op {
read_only_user_turn_with_model(fixture, text, fixture.session_configured.model.clone())
}
@@ -154,7 +198,7 @@ fn remote_aware_stdio_server_bin() -> anyhow::Result<String> {
return Ok(bin);
};
// Keep the Docker path rewrite scoped to tests that use `build_remote_aware`.
// Keep the Docker path rewrite scoped to tests that use `build_with_remote_env`.
// Other MCP tests still start their stdio server from the orchestrator test
// process, even when the full-ci remote env is present.
//
@@ -1386,6 +1430,257 @@ async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Re
Ok(())
}
// End-to-end check: an MCP stdio tool returning audio, called under a model that advertises
// audio input, reaches the follow-up request as an `input_audio` content item.
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_audio_responses_are_forwarded_for_audio_model() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let call_id = "audio-supported-1";
let server_name = "rmcp";
let namespace = format!("mcp__{server_name}__");
let audio_model_slug = "rmcp-audio-model";
// Serve a model list containing a single model that accepts text, image, and audio input.
let models_mock = mount_models_once(
&server,
ModelsResponse {
models: vec![test_model_info_with_modalities(
audio_model_slug,
"Test model with audio input support",
vec![
InputModality::Text,
InputModality::Image,
InputModality::Audio,
],
)],
},
)
.await;
// First response: a function call to the namespaced `audio` tool.
mount_sse_once(
&server,
responses::sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"),
responses::ev_completed("resp-1"),
]),
)
.await;
// Second response: a final assistant message; this mock captures the follow-up request.
let final_mock = mount_sse_once(
&server,
responses::sse(vec![
responses::ev_assistant_message("msg-1", "rmcp audio tool completed successfully."),
responses::ev_completed("resp-2"),
]),
)
.await;
let rmcp_test_server_bin = remote_aware_stdio_server_bin()?;
let fixture = test_codex()
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
.with_config(move |config| {
insert_mcp_server(
config,
server_name,
stdio_transport(
rmcp_test_server_bin,
// Environment handed to the test MCP server; presumably it returns this data and
// MIME type from its `audio` tool — confirm against the test server binary.
Some(HashMap::from([
("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()),
(
"MCP_TEST_AUDIO_MIME_TYPE".to_string(),
"audio/mpeg".to_string(),
),
])),
Vec::new(),
),
TestMcpServerOptions {
experimental_environment: remote_aware_experimental_environment(),
..Default::default()
},
);
})
.build_with_remote_env(&server)
.await?;
// Refresh the model list online and confirm the mocked models endpoint was hit exactly once.
fixture
.thread_manager
.get_models_manager()
.list_models(RefreshStrategy::Online)
.await;
assert_eq!(models_mock.requests().len(), 1);
fixture
.codex
.submit(read_only_user_turn_with_model(
&fixture,
"call the rmcp audio tool",
audio_model_slug.to_string(),
))
.await?;
// Wait for the tool call to begin and end, then for the turn to complete.
wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallBegin(_))
})
.await;
wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallEnd(_))
})
.await;
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
// The follow-up request must carry the output as an array: wall-time text, then the audio.
let output_item = final_mock.single_request().function_call_output(call_id);
let output = output_item["output"]
.as_array()
.expect("audio MCP output should be content items");
assert_eq!(output.len(), 2);
assert_wall_time_header(
output[0]["text"]
.as_str()
.expect("first MCP audio output item should be wall-time text"),
);
// The `audio/mpeg` MIME type is expected to surface as format `mp3`.
assert_eq!(
output[1],
json!({
"type": "input_audio",
"input_audio": {
"data": "UklGRg==",
"format": "mp3",
},
})
);
server.verify().await;
Ok(())
}
// End-to-end check: an MCP stdio tool returning audio, called under a model without audio
// input, ends the tool call with an error and forwards that error text to the model.
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_audio_responses_fail_for_text_only_model() -> anyhow::Result<()> {
skip_if_no_network!(Ok(()));
let server = responses::start_mock_server().await;
let call_id = "audio-text-only-1";
let server_name = "rmcp";
let namespace = format!("mcp__{server_name}__");
let text_only_model_slug = "rmcp-audio-text-only-model";
// Serve a model list whose single model accepts text and image input but not audio.
let models_mock = mount_models_once(
&server,
ModelsResponse {
models: vec![test_model_info_with_modalities(
text_only_model_slug,
"Test model without audio input support",
vec![InputModality::Text, InputModality::Image],
)],
},
)
.await;
// First response: a function call to the namespaced `audio` tool.
mount_sse_once(
&server,
responses::sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"),
responses::ev_completed("resp-1"),
]),
)
.await;
// Second response: a final assistant message; this mock captures the follow-up request.
let final_mock = mount_sse_once(
&server,
responses::sse(vec![
responses::ev_assistant_message("msg-1", "rmcp audio tool failed."),
responses::ev_completed("resp-2"),
]),
)
.await;
let rmcp_test_server_bin = remote_aware_stdio_server_bin()?;
let fixture = test_codex()
.with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
.with_config(move |config| {
insert_mcp_server(
config,
server_name,
stdio_transport(
rmcp_test_server_bin,
// Environment handed to the test MCP server; presumably it returns this data and
// MIME type from its `audio` tool — confirm against the test server binary.
Some(HashMap::from([
("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()),
(
"MCP_TEST_AUDIO_MIME_TYPE".to_string(),
"audio/wav".to_string(),
),
])),
Vec::new(),
),
TestMcpServerOptions {
experimental_environment: remote_aware_experimental_environment(),
..Default::default()
},
);
})
.build_with_remote_env(&server)
.await?;
// Refresh the model list online and confirm the mocked models endpoint was hit exactly once.
fixture
.thread_manager
.get_models_manager()
.list_models(RefreshStrategy::Online)
.await;
assert_eq!(models_mock.requests().len(), 1);
fixture
.codex
.submit(read_only_user_turn_with_model(
&fixture,
"call the rmcp audio tool",
text_only_model_slug.to_string(),
))
.await?;
wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallBegin(_))
})
.await;
// Capture the end event so its result can be inspected.
let end_event = wait_for_event(&fixture.codex, |ev| {
matches!(ev, EventMsg::McpToolCallEnd(_))
})
.await;
let EventMsg::McpToolCallEnd(end) = end_event else {
unreachable!("event guard guarantees McpToolCallEnd");
};
// The tool call itself must be reported as failed with the unsupported-audio message.
assert_eq!(
end.result,
Err(
"audio content returned by MCP tool but the selected model does not support audio input"
.to_string()
)
);
wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
// The follow-up request carries the output as a string: a wall-time wrapper around a JSON
// payload holding the same error text.
let output_item = final_mock.single_request().function_call_output(call_id);
let output_text = output_item
.get("output")
.and_then(Value::as_str)
.expect("function_call_output output should be a JSON string");
let wrapped_payload = split_wall_time_wrapped_output(output_text);
let output_json: Value = serde_json::from_str(wrapped_payload)
.expect("function_call_output output should be valid JSON");
assert_eq!(
output_json,
json!([{
"type": "text",
"text": "audio content returned by MCP tool but the selected model does not support audio input"
}])
);
server.verify().await;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
#[serial(mcp_test_value)]
async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {

View File

@@ -1314,6 +1314,98 @@ pub enum FunctionCallOutputContentItem {
#[ts(optional)]
detail: Option<ImageDetail>,
},
// Do not rename, these are serialized and used directly in the responses API.
InputAudio {
input_audio: InputAudio,
},
}
// Audio payload attached to a function-call output content item.
// NOTE: plain `//` comments are used on purpose; `///` doc comments would presumably be
// picked up by the `JsonSchema` derive as schema descriptions — confirm before converting.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)]
pub struct InputAudio {
// Audio bytes as text; `input_audio_from_data` stores the payload with any `data:` URL
// header removed (presumably base64 — confirm against the API contract).
pub data: String,
// Short format tag such as "wav" or "mp3", as produced by `normalize_audio_format`.
pub format: String,
}
pub fn input_audio_from_data(
data: &str,
format: Option<&str>,
mime_type: Option<&str>,
) -> Option<InputAudio> {
if data.is_empty() {
return None;
}
let (data, data_url_format) = if let Some((data, format)) = parse_audio_data_url(data) {
(data, Some(format))
} else if data
.get(.."data:".len())
.is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:"))
{
return None;
} else {
(data.to_string(), None)
};
if data.is_empty() {
return None;
}
let mime_type_format = match mime_type {
Some(mime_type) => Some(audio_format_from_mime_type(mime_type)?),
None => None,
};
let format = format
.and_then(normalize_audio_format)
.or(data_url_format)
.or(mime_type_format)?;
Some(InputAudio { data, format })
}
/// Splits a base64 audio `data:` URL into `(payload, format)`.
///
/// Returns `None` unless the input starts with `data:` (case-insensitive), contains a
/// comma, has `base64` among its `;`-separated metadata parts, and its leading metadata
/// part is an `audio/*` media type that `audio_format_from_mime_type` accepts.
fn parse_audio_data_url(data_url: &str) -> Option<(String, String)> {
    const SCHEME: &str = "data:";

    // `get` yields `None` for inputs shorter than the scheme or cut mid-character.
    let scheme = data_url.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let remainder = &data_url[SCHEME.len()..];
    let (metadata, payload) = remainder.split_once(',')?;

    let is_base64 = metadata
        .split(';')
        .any(|part| part.eq_ignore_ascii_case("base64"));
    if !is_base64 {
        return None;
    }

    let media_type = metadata.split(';').next()?;
    audio_format_from_mime_type(media_type).map(|format| (payload.to_string(), format))
}
/// Derives a short audio format tag from a MIME type such as `audio/mpeg; rate=44100`.
///
/// Parameters after the first `;` are ignored and matching is case-insensitive.
/// Returns `None` when the media type does not start with `audio/` or when its subtype
/// does not normalize to a non-empty format tag.
fn audio_format_from_mime_type(mime_type: &str) -> Option<String> {
    let media_type = mime_type.split(';').next()?.trim().to_ascii_lowercase();
    let subtype = media_type.strip_prefix("audio/")?;
    normalize_audio_format(subtype)
}

/// Normalizes a user- or server-supplied audio format into a lowercase tag.
///
/// - Surrounding whitespace is trimmed and the value is lowercased.
/// - A value containing `/` is treated as a MIME type and delegated to
///   [`audio_format_from_mime_type`].
/// - A leading `x-` is dropped (`x-wav` becomes `wav`).
/// - `mpeg` maps to `mp3` and `wave` maps to `wav`; anything else is kept as-is.
///
/// Returns `None` when nothing usable remains, including a bare `x-` prefix, so callers
/// never receive an empty format tag.
fn normalize_audio_format(format: &str) -> Option<String> {
    let lowered = format.trim().to_ascii_lowercase();
    if lowered.is_empty() {
        return None;
    }
    if lowered.contains('/') {
        return audio_format_from_mime_type(&lowered);
    }

    let bare = lowered.strip_prefix("x-").unwrap_or(&lowered);
    match bare {
        // "x-" alone would otherwise yield an empty, meaningless format tag.
        "" => None,
        "mpeg" => Some("mp3".to_string()),
        "wave" => Some("wav".to_string()),
        other => Some(other.to_string()),
    }
}
/// Converts structured function-call output content into plain text for
@@ -1321,7 +1413,7 @@ pub enum FunctionCallOutputContentItem {
///
/// This conversion is intentionally lossy:
/// - only `input_text` items are included
/// - image items are ignored
/// - image and audio items are ignored
///
/// We use this helper where callers still need a string representation (for
/// example telemetry previews or legacy string-only output paths) while keeping
@@ -1337,7 +1429,8 @@ pub fn function_call_output_content_items_to_text(
Some(text.as_str())
}
FunctionCallOutputContentItem::InputText { .. }
| FunctionCallOutputContentItem::InputImage { .. } => None,
| FunctionCallOutputContentItem::InputImage { .. }
| FunctionCallOutputContentItem::InputAudio { .. } => None,
})
.collect::<Vec<_>>();
@@ -1388,7 +1481,7 @@ impl FunctionCallOutputBody {
/// human-readable surfaces.
///
/// This conversion is intentionally lossy when the body contains content
/// items: image entries are dropped and text entries are joined with
/// items: image and audio entries are dropped and text entries are joined with
/// newlines.
pub fn to_text(&self) -> Option<String> {
match self {
@@ -1566,11 +1659,18 @@ fn convert_mcp_content_to_items(
#[serde(rename = "_meta", default)]
meta: Option<serde_json::Value>,
},
#[serde(rename = "audio")]
Audio {
data: String,
#[serde(rename = "mimeType", alias = "mime_type")]
mime_type: Option<String>,
},
#[serde(other)]
Unknown,
}
let mut saw_image = false;
let mut saw_audio = false;
let mut items = Vec::with_capacity(contents.len());
for content in contents {
@@ -1603,6 +1703,19 @@ fn convert_mcp_content_to_items(
.or(Some(DEFAULT_IMAGE_DETAIL)),
}
}
Ok(McpContent::Audio { data, mime_type }) => {
if let Some(input_audio) =
input_audio_from_data(&data, /*format*/ None, mime_type.as_deref())
{
saw_audio = true;
FunctionCallOutputContentItem::InputAudio { input_audio }
} else {
FunctionCallOutputContentItem::InputText {
text: serde_json::to_string(content)
.unwrap_or_else(|_| "<content>".to_string()),
}
}
}
Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText {
text: serde_json::to_string(content).unwrap_or_else(|_| "<content>".to_string()),
},
@@ -1610,7 +1723,11 @@ fn convert_mcp_content_to_items(
items.push(item);
}
if saw_image { Some(items) } else { None }
if saw_image || saw_audio {
Some(items)
} else {
None
}
}
// Implement Display so callers can treat the payload like a plain string when logging or doing
@@ -2248,6 +2365,198 @@ mod tests {
Ok(())
}
/// A text block followed by an MCP audio block must surface as structured
/// content items and serialize as a JSON array under `output`.
#[test]
fn serializes_audio_outputs_as_array() -> Result<()> {
    let content = vec![
        serde_json::json!({"type":"text","text":"caption"}),
        serde_json::json!({"type":"audio","data":"BASE64","mimeType":"audio/mpeg"}),
    ];
    let payload = CallToolResult {
        content,
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();
    assert_eq!(payload.success, Some(true));

    // The `audio/mpeg` MIME type is expected to come out as the `mp3` format.
    let expected_items = vec![
        FunctionCallOutputContentItem::InputText {
            text: "caption".into(),
        },
        FunctionCallOutputContentItem::InputAudio {
            input_audio: InputAudio {
                data: "BASE64".into(),
                format: "mp3".into(),
            },
        },
    ];
    let Some(items) = payload.content_items() else {
        panic!("expected content items");
    };
    assert_eq!(items.to_vec(), expected_items);

    // Round-trip through the wire representation and inspect `output`.
    let wire = serde_json::to_string(&ResponseInputItem::FunctionCallOutput {
        call_id: "call1".into(),
        output: payload,
    })?;
    let parsed: serde_json::Value = serde_json::from_str(&wire)?;
    let expected_output = serde_json::json!([
        { "type": "input_text", "text": "caption" },
        { "type": "input_audio", "input_audio": { "data": "BASE64", "format": "mp3" } }
    ]);
    assert_eq!(parsed.get("output").expect("output field"), &expected_output);
    Ok(())
}
/// Image and audio blocks in the same tool result are both preserved, in
/// order, as structured content items.
#[test]
fn serializes_mixed_image_and_audio_outputs_as_array() {
    let content = vec![
        serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}),
        serde_json::json!({"type":"audio","data":"AUDIO","mimeType":"audio/wav"}),
    ];
    let payload = CallToolResult {
        content,
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    let expected_image = FunctionCallOutputContentItem::InputImage {
        image_url: "data:image/png;base64,IMAGE".into(),
        detail: Some(DEFAULT_IMAGE_DETAIL),
    };
    let expected_audio = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "AUDIO".into(),
            format: "wav".into(),
        },
    };
    let Some(items) = payload.content_items() else {
        panic!("expected content items");
    };
    assert_eq!(items, [expected_image, expected_audio]);
}
/// When the audio `data` is a `data:` URL and no MIME type is given, the
/// base64 payload is extracted and the format is taken from the URL.
#[test]
fn strips_audio_data_urls_and_derives_format() {
    let audio_block = serde_json::json!({
        "type": "audio",
        "data": "data:audio/ogg;base64,T2dnUw",
    });
    let payload = CallToolResult {
        content: vec![audio_block],
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    let expected = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "T2dnUw".into(),
            format: "ogg".into(),
        },
    };
    let Some(items) = payload.content_items() else {
        panic!("expected content items");
    };
    assert_eq!(items, [expected]);
}
/// An audio block with neither a MIME type nor a data URL has no derivable
/// format, so the whole result degrades to the JSON-serialized text body.
#[test]
fn audio_without_derivable_format_falls_back_to_text_payload() {
    let content = vec![serde_json::json!({
        "type": "audio",
        "data": "BASE64",
    })];
    // Capture the expected text before the content is moved into the result.
    let expected = FunctionCallOutputPayload {
        body: FunctionCallOutputBody::Text(serde_json::to_string(&content).unwrap()),
        success: Some(true),
    };

    let payload = CallToolResult {
        content,
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    assert_eq!(payload, expected);
}
/// An audio block whose data URL is not audio is kept as a text item (its
/// JSON form) while a valid sibling image keeps the payload structured.
#[test]
fn malformed_audio_block_falls_back_to_text_inside_structured_payload() {
    let image_block = serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"});
    let malformed_audio = serde_json::json!({
        "type": "audio",
        "data": "data:image/png;base64,NOT_AUDIO",
    });
    let expected_fallback_text = serde_json::to_string(&malformed_audio).unwrap();

    let payload = CallToolResult {
        content: vec![image_block, malformed_audio],
        structured_content: None,
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    let Some(items) = payload.content_items() else {
        panic!("expected content items");
    };
    assert_eq!(
        items,
        [
            FunctionCallOutputContentItem::InputImage {
                image_url: "data:image/png;base64,IMAGE".into(),
                detail: Some(DEFAULT_IMAGE_DETAIL),
            },
            FunctionCallOutputContentItem::InputText {
                text: expected_fallback_text,
            },
        ]
    );
}
/// When `structured_content` is present it becomes the text body, and the
/// audio block in `content` is not surfaced.
#[test]
fn structured_content_precedence_ignores_audio_content() {
    let audio_block = serde_json::json!({
        "type": "audio",
        "data": "BASE64",
        "mimeType": "audio/wav",
    });
    let payload = CallToolResult {
        content: vec![audio_block],
        structured_content: Some(serde_json::json!({ "ok": true })),
        is_error: Some(false),
        meta: None,
    }
    .into_function_call_output_payload();

    let expected = FunctionCallOutputPayload {
        body: FunctionCallOutputBody::Text("{\"ok\":true}".to_string()),
        success: Some(true),
    };
    assert_eq!(payload, expected);
}
#[test]
fn serializes_custom_tool_image_outputs_as_array() -> Result<()> {
let item = ResponseInputItem::CustomToolCallOutput {

View File

@@ -82,6 +82,8 @@ pub enum InputModality {
Text,
/// Image attachments included in user turns.
Image,
/// Audio content included in tool payloads.
Audio,
}
/// Backward-compatible default when `input_modalities` is omitted on the wire.

View File

@@ -71,6 +71,7 @@ impl TestToolServer {
Self::cwd_tool(),
Self::sync_tool(),
Self::image_tool(),
Self::audio_tool(),
Self::image_scenario_tool(),
sandbox_meta_tool,
];
@@ -227,6 +228,24 @@ impl TestToolServer {
tool
}
/// Describes the `audio` test tool: it accepts no arguments and is annotated
/// as read-only.
fn audio_tool() -> Tool {
    // Empty, closed object schema: the tool takes no parameters.
    let schema_json = serde_json::json!({
        "type": "object",
        "properties": {},
        "additionalProperties": false
    });
    #[expect(clippy::expect_used)]
    let input_schema: JsonObject =
        serde_json::from_value(schema_json).expect("audio tool schema should deserialize");

    let read_only = ToolAnnotations::new().read_only(true);
    let mut audio = Tool::new(
        Cow::Borrowed("audio"),
        Cow::Borrowed("Return a single audio content block."),
        Arc::new(input_schema),
    );
    audio.annotations = Some(read_only);
    audio
}
/// Tool intended for manual testing of Codex TUI rendering for MCP image tool results.
///
/// This exists to exercise edge cases where a `CallToolResult.content` includes image blocks
@@ -543,6 +562,20 @@ impl ServerHandler for TestToolServer {
data_b64, mime_type,
)]))
}
"audio" => {
let data =
std::env::var("MCP_TEST_AUDIO_DATA").unwrap_or_else(|_| "QkFTRTY0".to_string());
let mime_type = std::env::var("MCP_TEST_AUDIO_MIME_TYPE")
.unwrap_or_else(|_| "audio/wav".to_string());
Ok(CallToolResult::success(vec![rmcp::model::Annotated::new(
rmcp::model::RawContent::Audio(rmcp::model::RawAudioContent {
data,
mime_type,
}),
None,
)]))
}
"image_scenario" => {
let args = Self::parse_call_args::<ImageScenarioArgs>(&request, "image_scenario")?;
Self::image_scenario_result(args)

View File

@@ -113,6 +113,7 @@ pub struct ToolsConfig {
pub request_permissions_tool_enabled: bool,
pub code_mode_enabled: bool,
pub code_mode_only_enabled: bool,
pub supports_audio_input: bool,
pub can_request_original_image_detail: bool,
pub collab_tools: bool,
pub goal_tools: bool,
@@ -187,6 +188,7 @@ impl ToolsConfig {
&& features.enabled(Feature::Apps)
&& features.enabled(Feature::Plugins);
let include_original_image_detail = can_request_original_image_detail(model_info);
let supports_audio_input = model_info.input_modalities.contains(&InputModality::Audio);
// API-key auth bypasses Codex backend entitlement/tool normalization, so
// callers must confirm ChatGPT auth before exposing the built-in tool.
let include_image_gen_tool = *image_generation_tool_auth_allowed
@@ -252,6 +254,7 @@ impl ToolsConfig {
request_permissions_tool_enabled,
code_mode_enabled: include_code_mode,
code_mode_only_enabled: include_code_mode_only,
supports_audio_input,
can_request_original_image_detail: include_original_image_detail,
collab_tools: include_collab_tools,
goal_tools: include_goal_tools,

View File

@@ -261,6 +261,48 @@ fn image_generation_requires_feature_and_supported_model() {
assert!(!unsupported_tools_config.image_gen_tool);
}
/// `ToolsConfig::supports_audio_input` is set exactly when the model lists
/// `InputModality::Audio` among its input modalities.
#[test]
fn audio_input_support_tracks_model_modalities() {
    let features = Features::with_defaults();
    let available_models = Vec::new();

    // Builds a config for a model with the given modalities; every other
    // parameter is held constant so only the modalities vary.
    let supports_audio_input = |input_modalities: Vec<InputModality>| {
        let info = ModelInfo {
            input_modalities,
            ..model_info()
        };
        let config = ToolsConfig::new(&ToolsConfigParams {
            model_info: &info,
            available_models: &available_models,
            features: &features,
            image_generation_tool_auth_allowed: true,
            web_search_mode: Some(WebSearchMode::Cached),
            session_source: SessionSource::Cli,
            permission_profile: &PermissionProfile::Disabled,
            windows_sandbox_level: WindowsSandboxLevel::Disabled,
        });
        config.supports_audio_input
    };

    assert!(supports_audio_input(vec![
        InputModality::Text,
        InputModality::Image,
        InputModality::Audio,
    ]));
    assert!(!supports_audio_input(vec![
        InputModality::Text,
        InputModality::Image,
    ]));
}
#[test]
fn provider_capability_methods_disable_provider_bound_tool_surfaces() {
let model_info = model_info();

View File

@@ -209,7 +209,8 @@ fn content_items_to_code_mode_result(items: &[FunctionCallOutputContentItem]) ->
Some(image_url.clone())
}
FunctionCallOutputContentItem::InputText { .. }
| FunctionCallOutputContentItem::InputImage { .. } => None,
| FunctionCallOutputContentItem::InputImage { .. }
| FunctionCallOutputContentItem::InputAudio { .. } => None,
})
.collect::<Vec<_>>()
.join("\n"),

View File

@@ -1,6 +1,7 @@
//! Helpers for truncating tool and exec output using [`TruncationPolicy`](codex_protocol::protocol::TruncationPolicy).
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::InputAudio;
pub use codex_utils_string::approx_bytes_for_tokens;
pub use codex_utils_string::approx_token_count;
pub use codex_utils_string::approx_tokens_from_byte_count;
@@ -9,6 +10,9 @@ use codex_utils_string::truncate_middle_with_token_budget;
pub use codex_protocol::protocol::TruncationPolicy;
/// Byte length of a serialized `input_audio` content item whose `data` and
/// `format` strings are empty, i.e. the fixed JSON framing around the payload.
/// Added to the payload lengths to estimate an audio item's serialized size.
const INPUT_AUDIO_JSON_OVERHEAD_BYTES: usize =
    r#"{"type":"input_audio","input_audio":{"data":"","format":""}}"#.len();
pub fn formatted_truncate_text(content: &str, policy: TruncationPolicy) -> String {
if content.len() <= policy.byte_budget() {
return content.to_string();
@@ -34,12 +38,16 @@ pub fn formatted_truncate_text_content_items_with_policy(
.iter()
.filter_map(|item| match item {
FunctionCallOutputContentItem::InputText { text } => Some(text.as_str()),
FunctionCallOutputContentItem::InputImage { .. } => None,
FunctionCallOutputContentItem::InputImage { .. }
| FunctionCallOutputContentItem::InputAudio { .. } => None,
})
.collect::<Vec<_>>();
if text_segments.is_empty() {
return (items.to_vec(), None);
return (
truncate_function_output_items_with_policy(items, policy),
None,
);
}
let mut combined = String::new();
@@ -50,22 +58,59 @@ pub fn formatted_truncate_text_content_items_with_policy(
combined.push_str(text);
}
if combined.len() <= policy.byte_budget() {
return (items.to_vec(), None);
let combined_cost = serialized_byte_cost_for_policy(combined.len(), policy);
let budget = budget_for_policy(policy);
if combined_cost <= budget {
let mut remaining_budget = budget.saturating_sub(combined_cost);
let mut out: Vec<FunctionCallOutputContentItem> = Vec::with_capacity(items.len());
let mut omitted_audio_items = 0usize;
for item in items {
match item {
FunctionCallOutputContentItem::InputText { text } => {
out.push(FunctionCallOutputContentItem::InputText { text: text.clone() });
}
FunctionCallOutputContentItem::InputImage { image_url, detail } => {
out.push(FunctionCallOutputContentItem::InputImage {
image_url: image_url.clone(),
detail: *detail,
});
}
FunctionCallOutputContentItem::InputAudio { input_audio } => {
push_audio_item_with_budget(
&mut out,
input_audio,
policy,
&mut remaining_budget,
&mut omitted_audio_items,
);
}
}
}
push_omitted_audio_summary(&mut out, omitted_audio_items);
return (out, None);
}
let mut out = vec![FunctionCallOutputContentItem::InputText {
text: formatted_truncate_text(&combined, policy),
}];
out.extend(items.iter().filter_map(|item| match item {
FunctionCallOutputContentItem::InputImage { image_url, detail } => {
Some(FunctionCallOutputContentItem::InputImage {
image_url: image_url.clone(),
detail: *detail,
})
let mut omitted_audio_items = 0usize;
for item in items {
match item {
FunctionCallOutputContentItem::InputImage { image_url, detail } => {
out.push(FunctionCallOutputContentItem::InputImage {
image_url: image_url.clone(),
detail: *detail,
});
}
FunctionCallOutputContentItem::InputAudio { .. } => {
omitted_audio_items += 1;
}
FunctionCallOutputContentItem::InputText { .. } => {}
}
FunctionCallOutputContentItem::InputText { .. } => None,
}));
}
push_omitted_audio_summary(&mut out, omitted_audio_items);
(out, Some(approx_token_count(&combined)))
}
@@ -75,11 +120,9 @@ pub fn truncate_function_output_items_with_policy(
policy: TruncationPolicy,
) -> Vec<FunctionCallOutputContentItem> {
let mut out: Vec<FunctionCallOutputContentItem> = Vec::with_capacity(items.len());
let mut remaining_budget = match policy {
TruncationPolicy::Bytes(_) => policy.byte_budget(),
TruncationPolicy::Tokens(_) => policy.token_budget(),
};
let mut remaining_budget = budget_for_policy(policy);
let mut omitted_text_items = 0usize;
let mut omitted_audio_items = 0usize;
for item in items {
match item {
@@ -89,10 +132,7 @@ pub fn truncate_function_output_items_with_policy(
continue;
}
let cost = match policy {
TruncationPolicy::Bytes(_) => text.len(),
TruncationPolicy::Tokens(_) => approx_token_count(text),
};
let cost = serialized_byte_cost_for_policy(text.len(), policy);
if cost <= remaining_budget {
out.push(FunctionCallOutputContentItem::InputText { text: text.clone() });
@@ -117,6 +157,15 @@ pub fn truncate_function_output_items_with_policy(
detail: *detail,
});
}
FunctionCallOutputContentItem::InputAudio { input_audio } => {
push_audio_item_with_budget(
&mut out,
input_audio,
policy,
&mut remaining_budget,
&mut omitted_audio_items,
);
}
}
}
@@ -125,10 +174,72 @@ pub fn truncate_function_output_items_with_policy(
text: format!("[omitted {omitted_text_items} text items ...]"),
});
}
push_omitted_audio_summary(&mut out, omitted_audio_items);
out
}
/// Returns the budget for `policy` in the policy's own unit: bytes for a
/// byte policy, tokens for a token policy.
fn budget_for_policy(policy: TruncationPolicy) -> usize {
    let TruncationPolicy::Bytes(_) = policy else {
        return policy.token_budget();
    };
    policy.byte_budget()
}
/// Converts a serialized byte count into the unit `policy` budgets in.
///
/// Byte policies are charged the raw byte count. Token policies are charged
/// the approximate token count for that many bytes, saturating at
/// `usize::MAX` if the estimate does not fit in a `usize`.
fn serialized_byte_cost_for_policy(byte_count: usize, policy: TruncationPolicy) -> usize {
    let TruncationPolicy::Tokens(_) = policy else {
        return byte_count;
    };
    let approx_tokens = approx_tokens_from_byte_count(byte_count);
    usize::try_from(approx_tokens).unwrap_or(usize::MAX)
}
/// Appends a copy of `input_audio` to `out` when its estimated serialized
/// size fits in `remaining_budget`; otherwise counts it as omitted.
///
/// The estimate is the fixed JSON framing plus the lengths of the `data` and
/// `format` strings, converted to the unit `policy` budgets in. On success
/// the cost is deducted from `remaining_budget`; on failure
/// `omitted_audio_items` is incremented and `out` is left untouched.
fn push_audio_item_with_budget(
    out: &mut Vec<FunctionCallOutputContentItem>,
    input_audio: &InputAudio,
    policy: TruncationPolicy,
    remaining_budget: &mut usize,
    omitted_audio_items: &mut usize,
) {
    let payload_bytes = input_audio
        .data
        .len()
        .saturating_add(input_audio.format.len());
    let serialized_bytes = INPUT_AUDIO_JSON_OVERHEAD_BYTES.saturating_add(payload_bytes);
    let cost = serialized_byte_cost_for_policy(serialized_bytes, policy);

    // Audio is all-or-nothing: it cannot be partially truncated like text.
    if cost > *remaining_budget {
        *omitted_audio_items += 1;
        return;
    }

    *remaining_budget = remaining_budget.saturating_sub(cost);
    out.push(FunctionCallOutputContentItem::InputAudio {
        input_audio: input_audio.clone(),
    });
}
/// Appends a text item noting how many audio items were dropped for
/// exceeding the truncation budget. Does nothing when none were omitted.
fn push_omitted_audio_summary(
    out: &mut Vec<FunctionCallOutputContentItem>,
    omitted_audio_items: usize,
) {
    if omitted_audio_items == 0 {
        return;
    }

    // Keep the noun and possessive grammatically in agreement with the count.
    let (item_word, owner) = match omitted_audio_items {
        1 => ("item", "its"),
        _ => ("items", "their"),
    };
    let text = format!(
        "[omitted {omitted_audio_items} audio {item_word} because {owner} size exceeds the output truncation budget]"
    );
    out.push(FunctionCallOutputContentItem::InputText { text });
}
pub fn approx_tokens_from_byte_count_i64(bytes: i64) -> i64 {
if bytes <= 0 {
return 0;

View File

@@ -7,8 +7,11 @@ use crate::truncate_function_output_items_with_policy;
use crate::truncate_text;
use codex_protocol::models::DEFAULT_IMAGE_DETAIL;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::InputAudio;
use pretty_assertions::assert_eq;
/// Estimated serialized size, in bytes, of the small audio fixture used in
/// these tests (`data: "UklGRg=="`, `format: "wav"`): 60 bytes of
/// `input_audio` JSON framing + 8 bytes of data + 3 bytes of format.
const SMALL_AUDIO_SERIALIZED_BYTES: usize = 71;
#[test]
fn truncate_bytes_less_than_placeholder_returns_placeholder() {
let content = "example output";
@@ -251,6 +254,141 @@ fn formatted_truncate_text_content_items_with_policy_merges_text_and_appends_ima
assert_eq!(original_token_count, Some(4));
}
/// With a byte budget that exactly covers the joined text plus the audio
/// item, the items are returned unchanged and no token count is reported.
#[test]
fn formatted_truncate_text_content_items_with_policy_preserves_audio_when_budget_allows() {
    let text_item = |text: &str| FunctionCallOutputContentItem::InputText {
        text: text.to_string(),
    };
    let audio_item = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "UklGRg==".to_string(),
            format: "wav".to_string(),
        },
    };
    let items = vec![text_item("abcd"), audio_item, text_item("efgh")];

    // Text segments are joined with a newline before being measured.
    let byte_budget = SMALL_AUDIO_SERIALIZED_BYTES + "abcd\nefgh".len();
    let (output, original_token_count) = formatted_truncate_text_content_items_with_policy(
        &items,
        TruncationPolicy::Bytes(byte_budget),
    );

    assert_eq!(output, items);
    assert_eq!(original_token_count, None);
}
/// When the text alone overflows the budget, the text is truncated into one
/// item and the audio item is replaced by an omission notice.
#[test]
fn formatted_truncate_text_content_items_with_policy_omits_audio_when_budget_is_spent() {
    let text_item = |text: &str| FunctionCallOutputContentItem::InputText {
        text: text.to_string(),
    };
    let audio_item = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "UklGRg==".to_string(),
            format: "wav".to_string(),
        },
    };
    let items = vec![text_item("abcd"), audio_item, text_item("efgh")];

    let (output, original_token_count) =
        formatted_truncate_text_content_items_with_policy(&items, TruncationPolicy::Bytes(4));

    let expected = vec![
        text_item("Total output lines: 2\n\nab…5 chars truncated…gh"),
        text_item("[omitted 1 audio item because its size exceeds the output truncation budget]"),
    ];
    assert_eq!(output, expected);
    assert_eq!(original_token_count, Some(3));
}
/// An audio-only payload larger than the budget is replaced by the omission
/// notice, and no original token count is reported.
#[test]
fn formatted_truncate_text_content_items_with_policy_omits_audio_only_over_budget() {
    let oversized_audio = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "A".repeat(200),
            format: "wav".to_string(),
        },
    };
    let items = vec![oversized_audio];

    let (output, original_token_count) =
        formatted_truncate_text_content_items_with_policy(&items, TruncationPolicy::Bytes(32));

    let expected_notice = FunctionCallOutputContentItem::InputText {
        text: "[omitted 1 audio item because its size exceeds the output truncation budget]"
            .to_string(),
    };
    assert_eq!(output, vec![expected_notice]);
    assert_eq!(original_token_count, None);
}
/// An audio item larger than the byte budget is dropped and replaced by the
/// omission notice.
#[test]
fn truncate_function_output_items_with_policy_omits_audio_over_budget() {
    let oversized_audio = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "A".repeat(200),
            format: "wav".to_string(),
        },
    };
    let items = vec![oversized_audio];

    let output = truncate_function_output_items_with_policy(&items, TruncationPolicy::Bytes(32));

    let expected_notice = FunctionCallOutputContentItem::InputText {
        text: "[omitted 1 audio item because its size exceeds the output truncation budget]"
            .to_string(),
    };
    assert_eq!(output, vec![expected_notice]);
}
/// A preserved audio item consumes budget: with a budget that only fits the
/// audio, the text item that follows it is omitted.
#[test]
fn truncate_function_output_items_with_policy_charges_preserved_audio_to_budget() {
    let audio = FunctionCallOutputContentItem::InputAudio {
        input_audio: InputAudio {
            data: "UklGRg==".to_string(),
            format: "wav".to_string(),
        },
    };
    let trailing_text = FunctionCallOutputContentItem::InputText {
        text: "tail".to_string(),
    };
    let items = vec![audio.clone(), trailing_text];

    // The budget is exactly the audio item's estimated size, leaving nothing
    // for the text that follows.
    let policy = TruncationPolicy::Bytes(SMALL_AUDIO_SERIALIZED_BYTES);
    let output = truncate_function_output_items_with_policy(&items, policy);

    let omitted_text_notice = FunctionCallOutputContentItem::InputText {
        text: "[omitted 1 text items ...]".to_string(),
    };
    assert_eq!(output, vec![audio, omitted_text_notice]);
}
#[test]
fn formatted_truncate_text_content_items_with_policy_merges_all_text_for_token_budget() {
let items = vec![