mirror of
https://github.com/openai/codex.git
synced 2026-05-28 15:00:16 +00:00
Support audio input
This commit is contained in:
@@ -9,7 +9,7 @@ use crate::PUBLIC_TOOL_NAME;
|
||||
const MAX_JS_SAFE_INTEGER: u64 = (1_u64 << 53) - 1;
|
||||
const DEFERRED_NESTED_TOOLS_GUIDANCE: &str = r#"Some nested MCP/app tools may be omitted from this description. They are still available on the global `tools` object and listed in `ALL_TOOLS`.
|
||||
To find one, filter `ALL_TOOLS` by `name` and `description`."#;
|
||||
const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/compose tool calls
|
||||
const EXEC_DESCRIPTION_TEMPLATE_PREFIX: &str = r#"Run JavaScript code to orchestrate/compose tool calls
|
||||
- Evaluates the provided JavaScript code in a fresh V8 isolate as an async module.
|
||||
- All nested tools are available on the global `tools` object, for example `await tools.exec_command(...)`. Tool names are exposed as normalized JavaScript identifiers, for example `await tools.mcp__ologs__get_profile(...)`.
|
||||
- Nested tool methods take either a string or an object as their input argument.
|
||||
@@ -24,8 +24,9 @@ const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/co
|
||||
- Global helpers:
|
||||
- `exit()`: Immediately ends the current script successfully (like an early return from the top level).
|
||||
- `text(value: string | number | boolean | undefined | null)`: Appends a text item. Non-string values are stringified with `JSON.stringify(...)` when possible.
|
||||
- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null } | ImageContent, detail?: "auto" | "low" | "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument.
|
||||
- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
|
||||
- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null } | ImageContent, detail?: "auto" | "low" | "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument."#;
|
||||
const AUDIO_HELPER_DESCRIPTION: &str = r#"- `audio(audioItem: { data: string; format?: string | null; mimeType?: string | null; mime_type?: string | null } | AudioContent)`: Appends an audio item. `data` can be raw base64 audio or a base64-encoded `data:audio/...` URL. To forward an MCP tool audio block, pass an individual `AudioContent` block from `result.content`, for example `audio(result.content[0])`."#;
|
||||
const EXEC_DESCRIPTION_TEMPLATE_SUFFIX: &str = r#"- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
|
||||
- `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing.
|
||||
- `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`.
|
||||
- `setTimeout(callback: () => void, delayMs?: number)`: schedules a callback to run later and returns a timeout id. Pending timeouts do not keep `exec` alive by themselves; await an explicit promise if you need to wait for one.
|
||||
@@ -41,7 +42,7 @@ const WAIT_DESCRIPTION_TEMPLATE: &str = r#"- Use `wait` only after `exec` return
|
||||
- If the cell is still running, `wait` may yield again with the same `cell_id`.
|
||||
- If the cell has already finished, `wait` returns the completed result and closes the cell."#;
|
||||
// Based off of https://modelcontextprotocol.io/specification/draft/schema#calltoolresult
|
||||
const MCP_TYPESCRIPT_PREAMBLE: &str = r#"type Role = "user" | "assistant";
|
||||
const MCP_TYPESCRIPT_PREAMBLE_PREFIX: &str = r#"type Role = "user" | "assistant";
|
||||
type MetaObject = Record<string, unknown>;
|
||||
type Annotations = {
|
||||
audience?: Role[];
|
||||
@@ -79,14 +80,16 @@ type ImageContent = {
|
||||
annotations?: Annotations;
|
||||
_meta?: MetaObject;
|
||||
};
|
||||
type AudioContent = {
|
||||
"#;
|
||||
const MCP_AUDIO_CONTENT_TYPE: &str = r#"type AudioContent = {
|
||||
type: "audio";
|
||||
data: string;
|
||||
mimeType: string;
|
||||
annotations?: Annotations;
|
||||
_meta?: MetaObject;
|
||||
};
|
||||
type ResourceLink = {
|
||||
"#;
|
||||
const MCP_TYPESCRIPT_PREAMBLE_SUFFIX: &str = r#"type ResourceLink = {
|
||||
icons?: Icon[];
|
||||
name: string;
|
||||
title?: string;
|
||||
@@ -106,8 +109,10 @@ type EmbeddedResource = {
|
||||
};
|
||||
type ContentBlock =
|
||||
| TextContent
|
||||
| ImageContent
|
||||
| AudioContent
|
||||
| ImageContent"#;
|
||||
const MCP_AUDIO_CONTENT_BLOCK_VARIANT: &str = r#"
|
||||
| AudioContent"#;
|
||||
const MCP_TYPESCRIPT_PREAMBLE_END: &str = r#"
|
||||
| ResourceLink
|
||||
| EmbeddedResource;
|
||||
type CallToolResult<TStructured = { [key: string]: unknown }> = {
|
||||
@@ -143,6 +148,13 @@ pub struct ToolNamespaceDescription {
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
/// Switches that control what `build_exec_tool_description` renders.
///
/// Every flag defaults to `false`, so callers can set only the flags they care
/// about with struct-update syntax (`..Default::default()`).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct ExecToolDescriptionOptions {
    /// When `false`, the description is returned right after the general
    /// helper sections; when `true`, rendering continues past that point.
    pub code_mode_only: bool,
    /// When `true`, the deferred nested-tools guidance paragraph is appended.
    pub deferred_tools_available: bool,
    /// When `true`, the `audio(...)` helper and the MCP `AudioContent` type
    /// are included in the rendered description.
    pub supports_audio_input: bool,
}
|
||||
|
||||
#[derive(Debug, Default, Deserialize, PartialEq, Eq)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
struct CodeModeExecPragma {
|
||||
@@ -250,15 +262,21 @@ pub fn is_code_mode_nested_tool(tool_name: &str) -> bool {
|
||||
pub fn build_exec_tool_description(
|
||||
enabled_tools: &[ToolDefinition],
|
||||
namespace_descriptions: &BTreeMap<String, ToolNamespaceDescription>,
|
||||
code_mode_only: bool,
|
||||
deferred_tools_available: bool,
|
||||
options: ExecToolDescriptionOptions,
|
||||
) -> String {
|
||||
let mut sections = Vec::new();
|
||||
sections.push(EXEC_DESCRIPTION_TEMPLATE.to_string());
|
||||
if deferred_tools_available {
|
||||
let mut exec_description = String::from(EXEC_DESCRIPTION_TEMPLATE_PREFIX);
|
||||
if options.supports_audio_input {
|
||||
exec_description.push('\n');
|
||||
exec_description.push_str(AUDIO_HELPER_DESCRIPTION);
|
||||
}
|
||||
exec_description.push('\n');
|
||||
exec_description.push_str(EXEC_DESCRIPTION_TEMPLATE_SUFFIX);
|
||||
sections.push(exec_description);
|
||||
if options.deferred_tools_available {
|
||||
sections.push(DEFERRED_NESTED_TOOLS_GUIDANCE.to_string());
|
||||
}
|
||||
if !code_mode_only {
|
||||
if !options.code_mode_only {
|
||||
return sections.join("\n\n");
|
||||
}
|
||||
|
||||
@@ -305,8 +323,18 @@ pub fn build_exec_tool_description(
|
||||
}
|
||||
|
||||
if has_mcp_tools {
|
||||
let mut mcp_typescript_preamble = String::from(MCP_TYPESCRIPT_PREAMBLE_PREFIX);
|
||||
if options.supports_audio_input {
|
||||
mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_TYPE);
|
||||
}
|
||||
mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_SUFFIX);
|
||||
if options.supports_audio_input {
|
||||
mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_BLOCK_VARIANT);
|
||||
}
|
||||
mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_END);
|
||||
|
||||
sections.push(format!(
|
||||
"Shared MCP Types:\n```ts\n{MCP_TYPESCRIPT_PREAMBLE}\n```"
|
||||
"Shared MCP Types:\n```ts\n{mcp_typescript_preamble}\n```"
|
||||
));
|
||||
}
|
||||
let nested_tool_reference = nested_tool_sections.join("\n\n");
|
||||
@@ -706,6 +734,7 @@ fn render_json_schema_literal(value: &JsonValue) -> String {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::CodeModeToolKind;
|
||||
use super::ExecToolDescriptionOptions;
|
||||
use super::ParsedExecSource;
|
||||
use super::ToolDefinition;
|
||||
use super::ToolNamespaceDescription;
|
||||
@@ -863,8 +892,11 @@ mod tests {
|
||||
output_schema: None,
|
||||
}],
|
||||
&BTreeMap::new(),
|
||||
/*code_mode_only*/ true,
|
||||
/*deferred_tools_available*/ false,
|
||||
ExecToolDescriptionOptions {
|
||||
code_mode_only: true,
|
||||
deferred_tools_available: false,
|
||||
supports_audio_input: false,
|
||||
},
|
||||
);
|
||||
assert!(description.contains(
|
||||
"### `foo`
|
||||
@@ -878,13 +910,41 @@ bar"
|
||||
let description = build_exec_tool_description(
|
||||
&[],
|
||||
&BTreeMap::new(),
|
||||
/*code_mode_only*/ false,
|
||||
/*deferred_tools_available*/ false,
|
||||
ExecToolDescriptionOptions {
|
||||
code_mode_only: false,
|
||||
deferred_tools_available: false,
|
||||
supports_audio_input: false,
|
||||
},
|
||||
);
|
||||
assert!(description.contains("`setTimeout(callback: () => void, delayMs?: number)`"));
|
||||
assert!(description.contains("`clearTimeout(timeoutId?: number)`"));
|
||||
}
|
||||
|
||||
/// The `audio(...)` helper line appears in the exec description only when
/// audio input is reported as supported.
#[test]
fn exec_description_gates_audio_helper_on_audio_input_support() {
    // Render with no tools and no namespaces; only the audio flag varies.
    let describe = |supports_audio_input: bool| {
        build_exec_tool_description(
            &[],
            &BTreeMap::new(),
            ExecToolDescriptionOptions {
                code_mode_only: false,
                deferred_tools_available: false,
                supports_audio_input,
            },
        )
    };

    let without_audio = describe(false);
    assert!(!without_audio.contains("`audio(audioItem"));

    let with_audio = describe(true);
    assert!(with_audio.contains("`audio(audioItem"));
}
|
||||
|
||||
#[test]
|
||||
fn code_mode_only_description_groups_namespace_instructions_once() {
|
||||
let namespace_descriptions = BTreeMap::from([(
|
||||
@@ -930,8 +990,11 @@ bar"
|
||||
},
|
||||
],
|
||||
&namespace_descriptions,
|
||||
/*code_mode_only*/ true,
|
||||
/*deferred_tools_available*/ false,
|
||||
ExecToolDescriptionOptions {
|
||||
code_mode_only: true,
|
||||
deferred_tools_available: false,
|
||||
supports_audio_input: false,
|
||||
},
|
||||
);
|
||||
assert_eq!(description.matches("## mcp__sample").count(), 1);
|
||||
assert!(description.contains("## mcp__sample\nShared namespace guidance."));
|
||||
@@ -970,8 +1033,11 @@ bar"
|
||||
}))),
|
||||
}],
|
||||
&namespace_descriptions,
|
||||
/*code_mode_only*/ true,
|
||||
/*deferred_tools_available*/ false,
|
||||
ExecToolDescriptionOptions {
|
||||
code_mode_only: true,
|
||||
deferred_tools_available: false,
|
||||
supports_audio_input: false,
|
||||
},
|
||||
);
|
||||
|
||||
assert!(!description.contains("## mcp__sample"));
|
||||
@@ -1069,8 +1135,11 @@ bar"
|
||||
},
|
||||
],
|
||||
&BTreeMap::new(),
|
||||
/*code_mode_only*/ true,
|
||||
/*deferred_tools_available*/ false,
|
||||
ExecToolDescriptionOptions {
|
||||
code_mode_only: true,
|
||||
deferred_tools_available: false,
|
||||
supports_audio_input: false,
|
||||
},
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
@@ -1082,13 +1151,60 @@ bar"
|
||||
assert_eq!(description.matches("Shared MCP Types:").count(), 1);
|
||||
}
|
||||
|
||||
/// The shared MCP TypeScript preamble declares `AudioContent` (and lists it as
/// a `ContentBlock` variant) only when audio input is supported.
#[test]
fn code_mode_only_description_gates_mcp_audio_type_on_audio_input_support() {
    // Both the input and the structured output use the same empty object schema.
    let empty_object_schema = || {
        json!({
            "type": "object",
            "properties": {},
            "additionalProperties": false
        })
    };
    let tools = vec![ToolDefinition {
        name: "mcp__sample__audio".to_string(),
        tool_name: ToolName::namespaced("mcp__sample__", "audio"),
        description: "Audio tool".to_string(),
        kind: CodeModeToolKind::Function,
        input_schema: Some(empty_object_schema()),
        output_schema: Some(mcp_call_tool_result_schema(empty_object_schema())),
    }];

    // Code-mode-only rendering of the same tool list; only the audio flag varies.
    let describe = |supports_audio_input: bool| {
        build_exec_tool_description(
            &tools,
            &BTreeMap::new(),
            ExecToolDescriptionOptions {
                code_mode_only: true,
                deferred_tools_available: false,
                supports_audio_input,
            },
        )
    };

    let without_audio = describe(false);
    assert!(!without_audio.contains("type AudioContent"));
    assert!(!without_audio.contains("| AudioContent"));

    let with_audio = describe(true);
    assert!(with_audio.contains("type AudioContent"));
    assert!(with_audio.contains("| AudioContent"));
}
|
||||
|
||||
#[test]
|
||||
fn exec_description_mentions_deferred_nested_tools_when_available() {
|
||||
let description = build_exec_tool_description(
|
||||
&[],
|
||||
&BTreeMap::new(),
|
||||
/*code_mode_only*/ false,
|
||||
/*deferred_tools_available*/ true,
|
||||
ExecToolDescriptionOptions {
|
||||
code_mode_only: false,
|
||||
deferred_tools_available: true,
|
||||
supports_audio_input: false,
|
||||
},
|
||||
);
|
||||
|
||||
assert!(description.contains("Some nested MCP/app tools may be omitted"));
|
||||
|
||||
@@ -5,6 +5,7 @@ mod service;
|
||||
|
||||
pub use description::CODE_MODE_PRAGMA_PREFIX;
|
||||
pub use description::CodeModeToolKind;
|
||||
pub use description::ExecToolDescriptionOptions;
|
||||
pub use description::ToolDefinition;
|
||||
pub use description::ToolNamespaceDescription;
|
||||
pub use description::augment_tool_definition;
|
||||
@@ -18,6 +19,7 @@ pub use description::render_json_schema_to_typescript;
|
||||
pub use response::DEFAULT_IMAGE_DETAIL;
|
||||
pub use response::FunctionCallOutputContentItem;
|
||||
pub use response::ImageDetail;
|
||||
pub use response::InputAudio;
|
||||
pub use runtime::CodeModeNestedToolCall;
|
||||
pub use runtime::DEFAULT_EXEC_YIELD_TIME_MS;
|
||||
pub use runtime::DEFAULT_MAX_OUTPUT_TOKENS_PER_EXEC_CALL;
|
||||
|
||||
@@ -23,4 +23,13 @@ pub enum FunctionCallOutputContentItem {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
detail: Option<ImageDetail>,
|
||||
},
|
||||
InputAudio {
|
||||
input_audio: InputAudio,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct InputAudio {
|
||||
pub data: String,
|
||||
pub format: String,
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use super::RuntimeEvent;
|
||||
use super::RuntimeState;
|
||||
use super::timers;
|
||||
use super::value::json_to_v8;
|
||||
use super::value::normalize_output_audio;
|
||||
use super::value::normalize_output_image;
|
||||
use super::value::serialize_output_text;
|
||||
use super::value::throw_type_error;
|
||||
@@ -129,6 +130,26 @@ pub(super) fn image_callback(
|
||||
retval.set(v8::undefined(scope).into());
|
||||
}
|
||||
|
||||
pub(super) fn audio_callback(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
args: v8::FunctionCallbackArguments,
|
||||
mut retval: v8::ReturnValue<v8::Value>,
|
||||
) {
|
||||
let value = if args.length() == 0 {
|
||||
v8::undefined(scope).into()
|
||||
} else {
|
||||
args.get(0)
|
||||
};
|
||||
let audio_item = match normalize_output_audio(scope, value) {
|
||||
Ok(audio_item) => audio_item,
|
||||
Err(()) => return,
|
||||
};
|
||||
if let Some(state) = scope.get_slot::<RuntimeState>() {
|
||||
let _ = state.event_tx.send(RuntimeEvent::ContentItem(audio_item));
|
||||
}
|
||||
retval.set(v8::undefined(scope).into());
|
||||
}
|
||||
|
||||
pub(super) fn store_callback(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
args: v8::FunctionCallbackArguments,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::RuntimeState;
|
||||
use super::callbacks::audio_callback;
|
||||
use super::callbacks::clear_timeout_callback;
|
||||
use super::callbacks::exit_callback;
|
||||
use super::callbacks::image_callback;
|
||||
@@ -23,6 +24,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
|
||||
let set_timeout = helper_function(scope, "setTimeout", set_timeout_callback)?;
|
||||
let text = helper_function(scope, "text", text_callback)?;
|
||||
let image = helper_function(scope, "image", image_callback)?;
|
||||
let audio = helper_function(scope, "audio", audio_callback)?;
|
||||
let store = helper_function(scope, "store", store_callback)?;
|
||||
let load = helper_function(scope, "load", load_callback)?;
|
||||
let notify = helper_function(scope, "notify", notify_callback)?;
|
||||
@@ -35,6 +37,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
|
||||
set_global(scope, global, "setTimeout", set_timeout.into())?;
|
||||
set_global(scope, global, "text", text.into())?;
|
||||
set_global(scope, global, "image", image.into())?;
|
||||
set_global(scope, global, "audio", audio.into())?;
|
||||
set_global(scope, global, "store", store.into())?;
|
||||
set_global(scope, global, "load", load.into())?;
|
||||
set_global(scope, global, "notify", notify.into())?;
|
||||
|
||||
@@ -3,8 +3,10 @@ use serde_json::Value as JsonValue;
|
||||
use crate::response::DEFAULT_IMAGE_DETAIL;
|
||||
use crate::response::FunctionCallOutputContentItem;
|
||||
use crate::response::ImageDetail;
|
||||
use crate::response::InputAudio;
|
||||
|
||||
const IMAGE_HELPER_EXPECTS_MESSAGE: &str = "image expects a non-empty image URL string, an object with image_url and optional detail, or a raw MCP image block";
|
||||
const AUDIO_HELPER_EXPECTS_MESSAGE: &str = "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block";
|
||||
const CODEX_IMAGE_DETAIL_META_KEY: &str = "codex/imageDetail";
|
||||
|
||||
pub(super) fn serialize_output_text(
|
||||
@@ -97,6 +99,35 @@ pub(super) fn normalize_output_image(
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn normalize_output_audio(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
value: v8::Local<'_, v8::Value>,
|
||||
) -> Result<FunctionCallOutputContentItem, ()> {
|
||||
let result = (|| -> Result<FunctionCallOutputContentItem, String> {
|
||||
if !value.is_object() || value.is_array() {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
}
|
||||
|
||||
let object = v8::Local::<v8::Object>::try_from(value)
|
||||
.map_err(|_| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?;
|
||||
let input_audio = if let Some(audio) = parse_non_mcp_output_audio(scope, object)? {
|
||||
audio
|
||||
} else {
|
||||
parse_mcp_output_audio(scope, value)?
|
||||
};
|
||||
|
||||
Ok(FunctionCallOutputContentItem::InputAudio { input_audio })
|
||||
})();
|
||||
|
||||
match result {
|
||||
Ok(item) => Ok(item),
|
||||
Err(error_text) => {
|
||||
throw_type_error(scope, &error_text);
|
||||
Err(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_non_mcp_output_image(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
object: v8::Local<'_, v8::Object>,
|
||||
@@ -165,6 +196,90 @@ fn parse_mcp_output_image(
|
||||
Ok((image_url, detail))
|
||||
}
|
||||
|
||||
fn parse_non_mcp_output_audio(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
object: v8::Local<'_, v8::Object>,
|
||||
) -> Result<Option<InputAudio>, String> {
|
||||
let data_key = v8::String::new(scope, "data")
|
||||
.ok_or_else(|| "failed to allocate audio helper keys".to_string())?;
|
||||
let Some(data) = object.get(scope, data_key.into()) else {
|
||||
return Ok(None);
|
||||
};
|
||||
if data.is_undefined() {
|
||||
return Ok(None);
|
||||
}
|
||||
if !data.is_string() {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
}
|
||||
let data = data.to_rust_string_lossy(scope);
|
||||
let format = optional_string_property(scope, object, "format")?;
|
||||
let mime_type = optional_string_property(scope, object, "mimeType")?
|
||||
.or(optional_string_property(scope, object, "mime_type")?);
|
||||
let Some(input_audio) = codex_protocol::models::input_audio_from_data(
|
||||
&data,
|
||||
format.as_deref(),
|
||||
mime_type.as_deref(),
|
||||
) else {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
};
|
||||
Ok(Some(InputAudio {
|
||||
data: input_audio.data,
|
||||
format: input_audio.format,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_mcp_output_audio(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
value: v8::Local<'_, v8::Value>,
|
||||
) -> Result<InputAudio, String> {
|
||||
let Some(result) = v8_value_to_json(scope, value)? else {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
};
|
||||
let JsonValue::Object(result) = result else {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
};
|
||||
let Some(item_type) = result.get("type").and_then(JsonValue::as_str) else {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
};
|
||||
if item_type != "audio" {
|
||||
return Err(format!(
|
||||
"audio only accepts MCP audio blocks, got \"{item_type}\""
|
||||
));
|
||||
}
|
||||
let data = result
|
||||
.get("data")
|
||||
.and_then(JsonValue::as_str)
|
||||
.ok_or_else(|| "audio expected MCP audio data".to_string())?;
|
||||
let mime_type = result
|
||||
.get("mimeType")
|
||||
.or_else(|| result.get("mime_type"))
|
||||
.and_then(JsonValue::as_str);
|
||||
let Some(input_audio) =
|
||||
codex_protocol::models::input_audio_from_data(data, /*format*/ None, mime_type)
|
||||
else {
|
||||
return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
|
||||
};
|
||||
Ok(InputAudio {
|
||||
data: input_audio.data,
|
||||
format: input_audio.format,
|
||||
})
|
||||
}
|
||||
|
||||
fn optional_string_property(
|
||||
scope: &mut v8::PinScope<'_, '_>,
|
||||
object: v8::Local<'_, v8::Object>,
|
||||
name: &str,
|
||||
) -> Result<Option<String>, String> {
|
||||
let key = v8::String::new(scope, name)
|
||||
.ok_or_else(|| "failed to allocate audio helper keys".to_string())?;
|
||||
match object.get(scope, key.into()) {
|
||||
Some(value) if value.is_string() => Ok(Some(value.to_rust_string_lossy(scope))),
|
||||
Some(value) if value.is_null() || value.is_undefined() => Ok(None),
|
||||
Some(_) => Err(format!("{name} must be a string when provided")),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_image_detail_value<'s>(
|
||||
scope: &mut v8::PinScope<'s, '_>,
|
||||
value: Option<v8::Local<'s, v8::Value>>,
|
||||
|
||||
@@ -703,6 +703,7 @@ mod tests {
|
||||
use super::run_session_control;
|
||||
use crate::CodeModeToolKind;
|
||||
use crate::FunctionCallOutputContentItem;
|
||||
use crate::InputAudio;
|
||||
use crate::ToolDefinition;
|
||||
use crate::runtime::ExecuteRequest;
|
||||
use crate::runtime::ExecuteToPendingOutcome;
|
||||
@@ -1230,6 +1231,7 @@ text(formatter.format(new Date("2025-01-02T03:04:05Z")));
|
||||
const returnsUndefined = [
|
||||
text("first"),
|
||||
image("https://example.com/image.jpg"),
|
||||
audio({ data: "BASE64", format: "wav" }),
|
||||
notify("ping"),
|
||||
].map((value) => value === undefined);
|
||||
text(JSON.stringify(returnsUndefined));
|
||||
@@ -1253,8 +1255,14 @@ text(JSON.stringify(returnsUndefined));
|
||||
image_url: "https://example.com/image.jpg".to_string(),
|
||||
detail: Some(crate::DEFAULT_IMAGE_DETAIL),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputAudio {
|
||||
input_audio: InputAudio {
|
||||
data: "BASE64".to_string(),
|
||||
format: "wav".to_string(),
|
||||
},
|
||||
},
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "[true,true,true]".to_string(),
|
||||
text: "[true,true,true,true]".to_string(),
|
||||
},
|
||||
],
|
||||
stored_values: HashMap::new(),
|
||||
@@ -1411,6 +1419,147 @@ image({
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn audio_helper_accepts_explicit_object() {
|
||||
let service = CodeModeService::new();
|
||||
|
||||
let response = service
|
||||
.execute(ExecuteRequest {
|
||||
source: r#"audio({ data: "BASE64", format: "wav" });"#.to_string(),
|
||||
yield_time_ms: None,
|
||||
..execute_request("")
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
response,
|
||||
RuntimeResponse::Result {
|
||||
cell_id: "1".to_string(),
|
||||
content_items: vec![FunctionCallOutputContentItem::InputAudio {
|
||||
input_audio: InputAudio {
|
||||
data: "BASE64".to_string(),
|
||||
format: "wav".to_string(),
|
||||
},
|
||||
}],
|
||||
stored_values: HashMap::new(),
|
||||
error_text: None,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn audio_helper_strips_data_url_and_derives_format() {
|
||||
let service = CodeModeService::new();
|
||||
|
||||
let response = service
|
||||
.execute(ExecuteRequest {
|
||||
source: r#"audio({ data: "data:audio/mpeg;base64,BASE64" });"#.to_string(),
|
||||
yield_time_ms: None,
|
||||
..execute_request("")
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
response,
|
||||
RuntimeResponse::Result {
|
||||
cell_id: "1".to_string(),
|
||||
content_items: vec![FunctionCallOutputContentItem::InputAudio {
|
||||
input_audio: InputAudio {
|
||||
data: "BASE64".to_string(),
|
||||
format: "mp3".to_string(),
|
||||
},
|
||||
}],
|
||||
stored_values: HashMap::new(),
|
||||
error_text: None,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn audio_helper_accepts_raw_mcp_audio_block() {
|
||||
let service = CodeModeService::new();
|
||||
|
||||
let response = service
|
||||
.execute(ExecuteRequest {
|
||||
source: r#"audio({ type: "audio", data: "BASE64", mimeType: "audio/ogg" });"#
|
||||
.to_string(),
|
||||
yield_time_ms: None,
|
||||
..execute_request("")
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
response,
|
||||
RuntimeResponse::Result {
|
||||
cell_id: "1".to_string(),
|
||||
content_items: vec![FunctionCallOutputContentItem::InputAudio {
|
||||
input_audio: InputAudio {
|
||||
data: "BASE64".to_string(),
|
||||
format: "ogg".to_string(),
|
||||
},
|
||||
}],
|
||||
stored_values: HashMap::new(),
|
||||
error_text: None,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn audio_helper_rejects_bare_string() {
|
||||
let service = CodeModeService::new();
|
||||
|
||||
let response = service
|
||||
.execute(ExecuteRequest {
|
||||
source: r#"audio("BASE64");"#.to_string(),
|
||||
yield_time_ms: None,
|
||||
..execute_request("")
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
response,
|
||||
RuntimeResponse::Result {
|
||||
cell_id: "1".to_string(),
|
||||
content_items: Vec::new(),
|
||||
stored_values: HashMap::new(),
|
||||
error_text: Some(
|
||||
"audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn audio_helper_rejects_non_audio_mime_type() {
|
||||
let service = CodeModeService::new();
|
||||
|
||||
let response = service
|
||||
.execute(ExecuteRequest {
|
||||
source: r#"audio({ data: "BASE64", mimeType: "application/octet-stream" });"#
|
||||
.to_string(),
|
||||
yield_time_ms: None,
|
||||
..execute_request("")
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
response,
|
||||
RuntimeResponse::Result {
|
||||
cell_id: "1".to_string(),
|
||||
content_items: Vec::new(),
|
||||
stored_values: HashMap::new(),
|
||||
error_text: Some(
|
||||
"audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn wait_reports_missing_cell_separately_from_runtime_results() {
|
||||
let service = CodeModeService::new();
|
||||
|
||||
Reference in New Issue
Block a user