Support audio input

2026-05-23 04:24:21 +00:00 · 2026-05-15 22:11:50 -07:00
37 changed files with 2112 additions and 110 deletions
--- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json
+++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json
@@ -997,6 +997,26 @@
          ],
          "title": "InputImageFunctionCallOutputContentItem",
          "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
        }
      ]
    },
@@ -1111,6 +1131,21 @@
      ],
      "type": "object"
    },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
    "ListMcpServerStatusParams": {
      "properties": {
        "cursor": {
--- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
+++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
@@ -9137,6 +9137,26 @@
            ],
            "title": "InputImageFunctionCallOutputContentItem",
            "type": "object"
+          },
+          {
+            "properties": {
+              "input_audio": {
+                "$ref": "#/definitions/v2/InputAudio"
+              },
+              "type": {
+                "enum": [
+                  "input_audio"
+                ],
+                "title": "InputAudioFunctionCallOutputContentItemType",
+                "type": "string"
+              }
+            },
+            "required": [
+              "input_audio",
+              "type"
+            ],
+            "title": "InputAudioFunctionCallOutputContentItem",
+            "type": "object"
          }
        ]
      },
@@ -9925,6 +9945,21 @@
        ],
        "type": "string"
      },
+      "InputAudio": {
+        "properties": {
+          "data": {
+            "type": "string"
+          },
+          "format": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "data",
+          "format"
+        ],
+        "type": "object"
+      },
      "InputModality": {
        "description": "Canonical user-input modality tags advertised by a model.",
        "oneOf": [
@@ -9941,6 +9976,13 @@
              "image"
            ],
            "type": "string"
+          },
+          {
+            "description": "Audio content included in tool payloads.",
+            "enum": [
+              "audio"
+            ],
+            "type": "string"
          }
        ]
      },
--- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
+++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
@@ -5526,6 +5526,26 @@
          ],
          "title": "InputImageFunctionCallOutputContentItem",
          "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
        }
      ]
    },
@@ -6474,6 +6494,21 @@
      "title": "InitializeParams",
      "type": "object"
    },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
    "InputModality": {
      "description": "Canonical user-input modality tags advertised by a model.",
      "oneOf": [
@@ -6490,6 +6525,13 @@
            "image"
          ],
          "type": "string"
+        },
+        {
+          "description": "Audio content included in tool payloads.",
+          "enum": [
+            "audio"
+          ],
+          "type": "string"
        }
      ]
    },
--- a/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json
+++ b/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json
@@ -17,6 +17,13 @@
            "image"
          ],
          "type": "string"
+        },
+        {
+          "description": "Audio content included in tool payloads.",
+          "enum": [
+            "audio"
+          ],
+          "type": "string"
        }
      ]
    },
--- a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json
+++ b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json
@@ -140,6 +140,26 @@
          ],
          "title": "InputImageFunctionCallOutputContentItem",
          "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
        }
      ]
    },
@@ -150,6 +170,21 @@
      ],
      "type": "string"
    },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
    "LocalShellAction": {
      "oneOf": [
        {
--- a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json
+++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json
@@ -199,6 +199,26 @@
          ],
          "title": "InputImageFunctionCallOutputContentItem",
          "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
        }
      ]
    },
@@ -209,6 +229,21 @@
      ],
      "type": "string"
    },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
    "LocalShellAction": {
      "oneOf": [
        {
--- a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts
@@ -2,9 +2,10 @@

 // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
 import type { ImageDetail } from "./ImageDetail";
+import type { InputAudio } from "./InputAudio";

 /**
 * Responses API compatible content items that can be returned by a tool call.
 * This is a subset of ContentItem with the types we support as function call outputs.
 */
-export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, };
+export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, } | { "type": "input_audio", input_audio: InputAudio, };
--- a/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts
@@ -0,0 +1,5 @@
+// GENERATED CODE! DO NOT MODIFY BY HAND!
+
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+export type InputAudio = { data: string, format: string, };
--- a/codex-rs/app-server-protocol/schema/typescript/InputModality.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/InputModality.ts
@@ -5,4 +5,4 @@
 /**
 * Canonical user-input modality tags advertised by a model.
 */
-export type InputModality = "text" | "image";
+export type InputModality = "text" | "image" | "audio";
--- a/codex-rs/app-server-protocol/schema/typescript/index.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/index.ts
@@ -36,6 +36,7 @@ export type { ImageDetail } from "./ImageDetail";
 export type { InitializeCapabilities } from "./InitializeCapabilities";
 export type { InitializeParams } from "./InitializeParams";
 export type { InitializeResponse } from "./InitializeResponse";
+export type { InputAudio } from "./InputAudio";
 export type { InputModality } from "./InputModality";
 export type { InternalSessionSource } from "./InternalSessionSource";
 export type { LocalShellAction } from "./LocalShellAction";
--- a/codex-rs/code-mode/src/description.rs
+++ b/codex-rs/code-mode/src/description.rs
@@ -9,7 +9,7 @@ use crate::PUBLIC_TOOL_NAME;
 const MAX_JS_SAFE_INTEGER: u64 = (1_u64 << 53) - 1;
 const DEFERRED_NESTED_TOOLS_GUIDANCE: &str = r#"Some nested MCP/app tools may be omitted from this description. They are still available on the global `tools` object and listed in `ALL_TOOLS`.
 To find one, filter `ALL_TOOLS` by `name` and `description`."#;
-const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/compose tool calls
+const EXEC_DESCRIPTION_TEMPLATE_PREFIX: &str = r#"Run JavaScript code to orchestrate/compose tool calls
 - Evaluates the provided JavaScript code in a fresh V8 isolate as an async module.
 - All nested tools are available on the global `tools` object, for example `await tools.exec_command(...)`. Tool names are exposed as normalized JavaScript identifiers, for example `await tools.mcp__ologs__get_profile(...)`.
 - Nested tool methods take either a string or an object as their input argument.
@@ -24,8 +24,9 @@ const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/co
 - Global helpers:
 - `exit()`: Immediately ends the current script successfully (like an early return from the top level).
 - `text(value: string | number | boolean | undefined | null)`: Appends a text item. Non-string values are stringified with `JSON.stringify(...)` when possible.
- `image(imageUrlOrItem: string | { image_url: string; detail?: "high" | "original" | null } | ImageContent, detail?: "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument.
- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
+- `image(imageUrlOrItem: string | { image_url: string; detail?: "high" | "original" | null } | ImageContent, detail?: "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument."#;
+const AUDIO_HELPER_DESCRIPTION: &str = r#"- `audio(audioItem: { data: string; format?: string | null; mimeType?: string | null; mime_type?: string | null } | AudioContent)`: Appends an audio item. `data` can be raw base64 audio or a base64-encoded `data:audio/...` URL. To forward an MCP tool audio block, pass an individual `AudioContent` block from `result.content`, for example `audio(result.content[0])`."#;
+const EXEC_DESCRIPTION_TEMPLATE_SUFFIX: &str = r#"- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
 - `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing.
 - `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`.
 - `setTimeout(callback: () => void, delayMs?: number)`: schedules a callback to run later and returns a timeout id. Pending timeouts do not keep `exec` alive by themselves; await an explicit promise if you need to wait for one.
@@ -41,7 +42,7 @@ const WAIT_DESCRIPTION_TEMPLATE: &str = r#"- Use `wait` only after `exec` return
 - If the cell is still running, `wait` may yield again with the same `cell_id`.
 - If the cell has already finished, `wait` returns the completed result and closes the cell."#;
 // Based off of https://modelcontextprotocol.io/specification/draft/schema#calltoolresult
-const MCP_TYPESCRIPT_PREAMBLE: &str = r#"type Role = "user" | "assistant";
+const MCP_TYPESCRIPT_PREAMBLE_PREFIX: &str = r#"type Role = "user" | "assistant";
 type MetaObject = Record<string, unknown>;
 type Annotations = {
  audience?: Role[];
@@ -79,14 +80,16 @@ type ImageContent = {
  annotations?: Annotations;
  _meta?: MetaObject;
 };
-type AudioContent = {
+"#;
+const MCP_AUDIO_CONTENT_TYPE: &str = r#"type AudioContent = {
  type: "audio";
  data: string;
  mimeType: string;
  annotations?: Annotations;
  _meta?: MetaObject;
 };
-type ResourceLink = {
+"#;
+const MCP_TYPESCRIPT_PREAMBLE_SUFFIX: &str = r#"type ResourceLink = {
  icons?: Icon[];
  name: string;
  title?: string;
@@ -106,8 +109,10 @@ type EmbeddedResource = {
 };
 type ContentBlock =
  | TextContent
-  | ImageContent
-  | AudioContent
+  | ImageContent"#;
+const MCP_AUDIO_CONTENT_BLOCK_VARIANT: &str = r#"
+  | AudioContent"#;
+const MCP_TYPESCRIPT_PREAMBLE_END: &str = r#"
  | ResourceLink
  | EmbeddedResource;
 type CallToolResult<TStructured = { [key: string]: unknown }> = {
@@ -143,6 +148,13 @@ pub struct ToolNamespaceDescription {
    pub description: String,
 }

+/// Flags selecting which optional sections `build_exec_tool_description` renders.
+///
+/// `Default` yields all flags set to `false`, so callers can use struct-update
+/// syntax and name only the flags they enable.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ExecToolDescriptionOptions {
+    /// When `false`, the description is returned right after the `exec` helper
+    /// text (plus the deferred-tools guidance, if enabled) without the nested
+    /// tool reference sections.
+    pub code_mode_only: bool,
+    /// When `true`, appends the deferred nested tools guidance paragraph.
+    pub deferred_tools_available: bool,
+    /// When `true`, includes the `audio(...)` helper entry and, where the shared
+    /// MCP types are rendered, the `AudioContent` type and union member.
+    pub supports_audio_input: bool,
+}
+
 #[derive(Debug, Default, Deserialize, PartialEq, Eq)]
 #[serde(deny_unknown_fields)]
 struct CodeModeExecPragma {
@@ -250,15 +262,21 @@ pub fn is_code_mode_nested_tool(tool_name: &str) -> bool {
 pub fn build_exec_tool_description(
    enabled_tools: &[ToolDefinition],
    namespace_descriptions: &BTreeMap<String, ToolNamespaceDescription>,
-    code_mode_only: bool,
-    deferred_tools_available: bool,
+    options: ExecToolDescriptionOptions,
 ) -> String {
    let mut sections = Vec::new();
-    sections.push(EXEC_DESCRIPTION_TEMPLATE.to_string());
-    if deferred_tools_available {
+    let mut exec_description = String::from(EXEC_DESCRIPTION_TEMPLATE_PREFIX);
+    if options.supports_audio_input {
+        exec_description.push('\n');
+        exec_description.push_str(AUDIO_HELPER_DESCRIPTION);
+    }
+    exec_description.push('\n');
+    exec_description.push_str(EXEC_DESCRIPTION_TEMPLATE_SUFFIX);
+    sections.push(exec_description);
+    if options.deferred_tools_available {
        sections.push(DEFERRED_NESTED_TOOLS_GUIDANCE.to_string());
    }
-    if !code_mode_only {
+    if !options.code_mode_only {
        return sections.join("\n\n");
    }

@@ -305,8 +323,18 @@ pub fn build_exec_tool_description(
        }

        if has_mcp_tools {
+            let mut mcp_typescript_preamble = String::from(MCP_TYPESCRIPT_PREAMBLE_PREFIX);
+            if options.supports_audio_input {
+                mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_TYPE);
+            }
+            mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_SUFFIX);
+            if options.supports_audio_input {
+                mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_BLOCK_VARIANT);
+            }
+            mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_END);
+
            sections.push(format!(
-                "Shared MCP Types:\n```ts\n{MCP_TYPESCRIPT_PREAMBLE}\n```"
+                "Shared MCP Types:\n```ts\n{mcp_typescript_preamble}\n```"
            ));
        }
        let nested_tool_reference = nested_tool_sections.join("\n\n");
@@ -706,6 +734,7 @@ fn render_json_schema_literal(value: &JsonValue) -> String {
 #[cfg(test)]
 mod tests {
    use super::CodeModeToolKind;
+    use super::ExecToolDescriptionOptions;
    use super::ParsedExecSource;
    use super::ToolDefinition;
    use super::ToolNamespaceDescription;
@@ -863,8 +892,11 @@ mod tests {
                output_schema: None,
            }],
            &BTreeMap::new(),
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
        );
        assert!(description.contains(
            "### `foo`
@@ -878,13 +910,41 @@ bar"
        let description = build_exec_tool_description(
            &[],
            &BTreeMap::new(),
-            /*code_mode_only*/ false,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: false,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
        );
        assert!(description.contains("`setTimeout(callback: () => void, delayMs?: number)`"));
        assert!(description.contains("`clearTimeout(timeoutId?: number)`"));
    }

+    #[test]
+    fn exec_description_gates_audio_helper_on_audio_input_support() {
+        // Renders the description without nested tools, toggling only the audio flag.
+        let describe = |supports_audio_input: bool| {
+            build_exec_tool_description(
+                &[],
+                &BTreeMap::new(),
+                ExecToolDescriptionOptions {
+                    code_mode_only: false,
+                    deferred_tools_available: false,
+                    supports_audio_input,
+                },
+            )
+        };
+
+        // The `audio(...)` helper entry must only be advertised when audio is supported.
+        assert!(!describe(/*supports_audio_input*/ false).contains("`audio(audioItem"));
+        assert!(describe(/*supports_audio_input*/ true).contains("`audio(audioItem"));
+    }
+
    #[test]
    fn code_mode_only_description_groups_namespace_instructions_once() {
        let namespace_descriptions = BTreeMap::from([(
@@ -930,8 +990,11 @@ bar"
                },
            ],
            &namespace_descriptions,
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
        );
        assert_eq!(description.matches("## mcp__sample").count(), 1);
        assert!(description.contains("## mcp__sample\nShared namespace guidance."));
@@ -970,8 +1033,11 @@ bar"
                }))),
            }],
            &namespace_descriptions,
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
        );

        assert!(!description.contains("## mcp__sample"));
@@ -1069,8 +1135,11 @@ bar"
                },
            ],
            &BTreeMap::new(),
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
        );

        assert_eq!(
@@ -1082,13 +1151,60 @@ bar"
        assert_eq!(description.matches("Shared MCP Types:").count(), 1);
    }

+    #[test]
+    fn code_mode_only_description_gates_mcp_audio_type_on_audio_input_support() {
+        // A single MCP tool is enough to make the shared MCP types section render.
+        let empty_object_schema = || {
+            json!({
+                "type": "object",
+                "properties": {},
+                "additionalProperties": false
+            })
+        };
+        let tools = vec![ToolDefinition {
+            name: "mcp__sample__audio".to_string(),
+            tool_name: ToolName::namespaced("mcp__sample__", "audio"),
+            description: "Audio tool".to_string(),
+            kind: CodeModeToolKind::Function,
+            input_schema: Some(empty_object_schema()),
+            output_schema: Some(mcp_call_tool_result_schema(empty_object_schema())),
+        }];
+
+        // Renders the code-mode-only description, toggling only the audio flag.
+        let describe = |supports_audio_input: bool| {
+            build_exec_tool_description(
+                &tools,
+                &BTreeMap::new(),
+                ExecToolDescriptionOptions {
+                    code_mode_only: true,
+                    deferred_tools_available: false,
+                    supports_audio_input,
+                },
+            )
+        };
+
+        let without_audio = describe(/*supports_audio_input*/ false);
+        assert!(!without_audio.contains("type AudioContent"));
+        assert!(!without_audio.contains("| AudioContent"));
+
+        let with_audio = describe(/*supports_audio_input*/ true);
+        assert!(with_audio.contains("type AudioContent"));
+        assert!(with_audio.contains("| AudioContent"));
+    }
+
    #[test]
    fn exec_description_mentions_deferred_nested_tools_when_available() {
        let description = build_exec_tool_description(
            &[],
            &BTreeMap::new(),
-            /*code_mode_only*/ false,
-            /*deferred_tools_available*/ true,
+            ExecToolDescriptionOptions {
+                code_mode_only: false,
+                deferred_tools_available: true,
+                supports_audio_input: false,
+            },
        );

        assert!(description.contains("Some nested MCP/app tools may be omitted"));
--- a/codex-rs/code-mode/src/lib.rs
+++ b/codex-rs/code-mode/src/lib.rs
@@ -5,6 +5,7 @@ mod service;

 pub use description::CODE_MODE_PRAGMA_PREFIX;
 pub use description::CodeModeToolKind;
+pub use description::ExecToolDescriptionOptions;
 pub use description::ToolDefinition;
 pub use description::ToolNamespaceDescription;
 pub use description::augment_tool_definition;
@@ -18,6 +19,7 @@ pub use description::render_json_schema_to_typescript;
 pub use response::DEFAULT_IMAGE_DETAIL;
 pub use response::FunctionCallOutputContentItem;
 pub use response::ImageDetail;
+pub use response::InputAudio;
 pub use runtime::CodeModeNestedToolCall;
 pub use runtime::DEFAULT_EXEC_YIELD_TIME_MS;
 pub use runtime::DEFAULT_MAX_OUTPUT_TOKENS_PER_EXEC_CALL;
--- a/codex-rs/code-mode/src/response.rs
+++ b/codex-rs/code-mode/src/response.rs
@@ -21,4 +21,13 @@ pub enum FunctionCallOutputContentItem {
        #[serde(default, skip_serializing_if = "Option::is_none")]
        detail: Option<ImageDetail>,
    },
+    InputAudio {
+        input_audio: InputAudio,
+    },
+}
+
+/// Audio payload carried by `FunctionCallOutputContentItem::InputAudio`.
+///
+/// Both fields are plain strings, so the type is totally comparable and
+/// derives `Eq` alongside `PartialEq`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct InputAudio {
+    /// Audio content encoded as text (presumably base64; confirm against the producer).
+    pub data: String,
+    /// Audio format label, for example `"wav"`.
+    pub format: String,
 }
--- a/codex-rs/code-mode/src/runtime/callbacks.rs
+++ b/codex-rs/code-mode/src/runtime/callbacks.rs
@@ -5,6 +5,7 @@ use super::RuntimeEvent;
 use super::RuntimeState;
 use super::timers;
 use super::value::json_to_v8;
+use super::value::normalize_output_audio;
 use super::value::normalize_output_image;
 use super::value::serialize_output_text;
 use super::value::throw_type_error;
@@ -129,6 +130,26 @@ pub(super) fn image_callback(
    retval.set(v8::undefined(scope).into());
 }

+/// Native implementation of the global `audio(...)` helper.
+///
+/// Normalizes the first argument (or `undefined` when none was passed) into an
+/// audio content item, forwards it as a `RuntimeEvent::ContentItem` when a
+/// `RuntimeState` slot is present on the scope, and returns `undefined` to the
+/// script. If normalization fails, `normalize_output_audio` has already raised
+/// the error and this function returns without setting a return value.
+pub(super) fn audio_callback(
+    scope: &mut v8::PinScope<'_, '_>,
+    args: v8::FunctionCallbackArguments,
+    mut retval: v8::ReturnValue<v8::Value>,
+) {
+    let first_arg = if args.length() != 0 {
+        args.get(0)
+    } else {
+        v8::undefined(scope).into()
+    };
+    let Ok(content_item) = normalize_output_audio(scope, first_arg) else {
+        return;
+    };
+    if let Some(runtime) = scope.get_slot::<RuntimeState>() {
+        // The send result is discarded.
+        let _ = runtime
+            .event_tx
+            .send(RuntimeEvent::ContentItem(content_item));
+    }
+    retval.set(v8::undefined(scope).into());
+}
+
 pub(super) fn store_callback(
    scope: &mut v8::PinScope<'_, '_>,
    args: v8::FunctionCallbackArguments,
--- a/codex-rs/code-mode/src/runtime/globals.rs
+++ b/codex-rs/code-mode/src/runtime/globals.rs
@@ -1,4 +1,5 @@
 use super::RuntimeState;
+use super::callbacks::audio_callback;
 use super::callbacks::clear_timeout_callback;
 use super::callbacks::exit_callback;
 use super::callbacks::image_callback;
@@ -23,6 +24,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
    let set_timeout = helper_function(scope, "setTimeout", set_timeout_callback)?;
    let text = helper_function(scope, "text", text_callback)?;
    let image = helper_function(scope, "image", image_callback)?;
+    let audio = helper_function(scope, "audio", audio_callback)?;
    let store = helper_function(scope, "store", store_callback)?;
    let load = helper_function(scope, "load", load_callback)?;
    let notify = helper_function(scope, "notify", notify_callback)?;
@@ -35,6 +37,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
    set_global(scope, global, "setTimeout", set_timeout.into())?;
    set_global(scope, global, "text", text.into())?;
    set_global(scope, global, "image", image.into())?;
+    set_global(scope, global, "audio", audio.into())?;
    set_global(scope, global, "store", store.into())?;
    set_global(scope, global, "load", load.into())?;
    set_global(scope, global, "notify", notify.into())?;
--- a/codex-rs/code-mode/src/runtime/value.rs
+++ b/codex-rs/code-mode/src/runtime/value.rs
@@ -3,8 +3,10 @@ use serde_json::Value as JsonValue;
 use crate::response::DEFAULT_IMAGE_DETAIL;
 use crate::response::FunctionCallOutputContentItem;
 use crate::response::ImageDetail;
+use crate::response::InputAudio;

 const IMAGE_HELPER_EXPECTS_MESSAGE: &str = "image expects a non-empty image URL string, an object with image_url and optional detail, or a raw MCP image block";
+const AUDIO_HELPER_EXPECTS_MESSAGE: &str = "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block";
 const CODEX_IMAGE_DETAIL_META_KEY: &str = "codex/imageDetail";

 pub(super) fn serialize_output_text(
@@ -93,6 +95,35 @@ pub(super) fn normalize_output_image(
    }
 }

+/// Converts the argument of the `audio(...)` helper into an `InputAudio` content item.
+///
+/// Only non-array objects are accepted. The explicit `{ data, ... }` shape is
+/// tried first; when the object has no `data` property the value is parsed as
+/// an MCP audio block instead. On failure the error text is raised through
+/// `throw_type_error` and `Err(())` is returned.
+pub(super) fn normalize_output_audio(
+    scope: &mut v8::PinScope<'_, '_>,
+    value: v8::Local<'_, v8::Value>,
+) -> Result<FunctionCallOutputContentItem, ()> {
+    let normalized = (|| -> Result<FunctionCallOutputContentItem, String> {
+        let is_non_array_object = value.is_object() && !value.is_array();
+        if !is_non_array_object {
+            return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+        }
+
+        let Ok(object) = v8::Local::<v8::Object>::try_from(value) else {
+            return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+        };
+        let input_audio = match parse_non_mcp_output_audio(scope, object)? {
+            Some(audio) => audio,
+            None => parse_mcp_output_audio(scope, value)?,
+        };
+
+        Ok(FunctionCallOutputContentItem::InputAudio { input_audio })
+    })();
+
+    // Surface the failure to the script before reporting it to the caller.
+    normalized.map_err(|error_text| {
+        throw_type_error(scope, &error_text);
+    })
+}
+
 fn parse_non_mcp_output_image(
    scope: &mut v8::PinScope<'_, '_>,
    object: v8::Local<'_, v8::Object>,
@@ -161,6 +192,90 @@ fn parse_mcp_output_image(
    Ok((image_url, detail))
 }

+/// Parses the explicit `{ data, format?, mimeType?, mime_type? }` audio shape.
+///
+/// Returns `Ok(None)` when the object has no `data` property (or it is
+/// `undefined`) so the caller can try another shape. A non-string `data`, a
+/// non-string `format`/MIME property, or input rejected by
+/// `input_audio_from_data` is reported as an error message.
+fn parse_non_mcp_output_audio(
+    scope: &mut v8::PinScope<'_, '_>,
+    object: v8::Local<'_, v8::Object>,
+) -> Result<Option<InputAudio>, String> {
+    let data_key = v8::String::new(scope, "data")
+        .ok_or_else(|| "failed to allocate audio helper keys".to_string())?;
+    let Some(data) = object.get(scope, data_key.into()) else {
+        return Ok(None);
+    };
+    if data.is_undefined() {
+        return Ok(None);
+    }
+    if !data.is_string() {
+        return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+    }
+    let data = data.to_rust_string_lossy(scope);
+    let format = optional_string_property(scope, object, "format")?;
+    // Read the snake_case alias only when `mimeType` is absent, so a valid
+    // `mimeType` is never rejected because of an unused `mime_type` value and
+    // no extra property lookup runs on the script object.
+    let mut mime_type = optional_string_property(scope, object, "mimeType")?;
+    if mime_type.is_none() {
+        mime_type = optional_string_property(scope, object, "mime_type")?;
+    }
+    let Some(input_audio) = codex_protocol::models::input_audio_from_data(
+        &data,
+        format.as_deref(),
+        mime_type.as_deref(),
+    ) else {
+        return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+    };
+    Ok(Some(InputAudio {
+        data: input_audio.data,
+        format: input_audio.format,
+    }))
+}
+
+/// Parses an MCP audio content block (`{ type: "audio", data, mimeType }`).
+///
+/// The value is converted to JSON first. It must be an object whose `type` is
+/// the string `"audio"` and whose `data` is a string. The MIME type is read
+/// from `mimeType`, falling back to `mime_type` only when `mimeType` is absent.
+fn parse_mcp_output_audio(
+    scope: &mut v8::PinScope<'_, '_>,
+    value: v8::Local<'_, v8::Value>,
+) -> Result<InputAudio, String> {
+    let Some(JsonValue::Object(block)) = v8_value_to_json(scope, value)? else {
+        return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+    };
+    let item_type = block
+        .get("type")
+        .and_then(JsonValue::as_str)
+        .ok_or_else(|| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?;
+    if item_type != "audio" {
+        return Err(format!(
+            "audio only accepts MCP audio blocks, got \"{item_type}\""
+        ));
+    }
+    let Some(data) = block.get("data").and_then(JsonValue::as_str) else {
+        return Err("audio expected MCP audio data".to_string());
+    };
+    // A present but non-string `mimeType` yields no MIME type; the alias is
+    // consulted only when `mimeType` is missing altogether.
+    let mime_type = match block.get("mimeType") {
+        Some(mime_type) => mime_type.as_str(),
+        None => block.get("mime_type").and_then(JsonValue::as_str),
+    };
+    codex_protocol::models::input_audio_from_data(data, /*format*/ None, mime_type)
+        .map(|input_audio| InputAudio {
+            data: input_audio.data,
+            format: input_audio.format,
+        })
+        .ok_or_else(|| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())
+}
+
+/// Reads property `name` from `object` as an optional string.
+///
+/// Returns `Ok(Some(_))` for string values, `Ok(None)` when the lookup yields
+/// nothing or the value is `null`/`undefined`, and an error message for any
+/// other value.
+fn optional_string_property(
+    scope: &mut v8::PinScope<'_, '_>,
+    object: v8::Local<'_, v8::Object>,
+    name: &str,
+) -> Result<Option<String>, String> {
+    let key = v8::String::new(scope, name)
+        .ok_or_else(|| "failed to allocate audio helper keys".to_string())?;
+    let Some(property) = object.get(scope, key.into()) else {
+        return Ok(None);
+    };
+    if property.is_string() {
+        Ok(Some(property.to_rust_string_lossy(scope)))
+    } else if property.is_null() || property.is_undefined() {
+        Ok(None)
+    } else {
+        Err(format!("{name} must be a string when provided"))
+    }
+}
+
 fn parse_image_detail_value<'s>(
    scope: &mut v8::PinScope<'s, '_>,
    value: Option<v8::Local<'s, v8::Value>>,
--- a/codex-rs/code-mode/src/service.rs
+++ b/codex-rs/code-mode/src/service.rs
@@ -703,6 +703,7 @@ mod tests {
    use super::run_session_control;
    use crate::CodeModeToolKind;
    use crate::FunctionCallOutputContentItem;
+    use crate::InputAudio;
    use crate::ToolDefinition;
    use crate::runtime::ExecuteRequest;
    use crate::runtime::ExecuteToPendingOutcome;
@@ -1230,6 +1231,7 @@ text(formatter.format(new Date("2025-01-02T03:04:05Z")));
 const returnsUndefined = [
  text("first"),
  image("https://example.com/image.jpg"),
+  audio({ data: "BASE64", format: "wav" }),
  notify("ping"),
 ].map((value) => value === undefined);
 text(JSON.stringify(returnsUndefined));
@@ -1253,8 +1255,14 @@ text(JSON.stringify(returnsUndefined));
                        image_url: "https://example.com/image.jpg".to_string(),
                        detail: Some(crate::DEFAULT_IMAGE_DETAIL),
                    },
+                    FunctionCallOutputContentItem::InputAudio {
+                        input_audio: InputAudio {
+                            data: "BASE64".to_string(),
+                            format: "wav".to_string(),
+                        },
+                    },
                    FunctionCallOutputContentItem::InputText {
-                        text: "[true,true,true]".to_string(),
+                        text: "[true,true,true,true]".to_string(),
                    },
                ],
                stored_values: HashMap::new(),
@@ -1441,6 +1449,147 @@ image({
        );
    }

+    /// `audio` called with explicit `data` and `format` emits a single audio
+    /// item carrying both values unchanged.
+    #[tokio::test]
+    async fn audio_helper_accepts_explicit_object() {
+        let service = CodeModeService::new();
+        let request = ExecuteRequest {
+            source: r#"audio({ data: "BASE64", format: "wav" });"#.to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: vec![FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "BASE64".to_string(),
+                    format: "wav".to_string(),
+                },
+            }],
+            stored_values: HashMap::new(),
+            error_text: None,
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A data URL passed as `data` is reduced to its base64 payload, and the
+    /// format is taken from the URL's MIME type (`audio/mpeg` becomes `mp3`).
+    #[tokio::test]
+    async fn audio_helper_strips_data_url_and_derives_format() {
+        let service = CodeModeService::new();
+        let request = ExecuteRequest {
+            source: r#"audio({ data: "data:audio/mpeg;base64,BASE64" });"#.to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: vec![FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "BASE64".to_string(),
+                    format: "mp3".to_string(),
+                },
+            }],
+            stored_values: HashMap::new(),
+            error_text: None,
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A raw MCP audio block (`type: "audio"` with `mimeType`) is accepted and
+    /// its MIME type is mapped to the emitted format (`audio/ogg` becomes `ogg`).
+    #[tokio::test]
+    async fn audio_helper_accepts_raw_mcp_audio_block() {
+        let service = CodeModeService::new();
+        let request = ExecuteRequest {
+            source: r#"audio({ type: "audio", data: "BASE64", mimeType: "audio/ogg" });"#
+                .to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: vec![FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "BASE64".to_string(),
+                    format: "ogg".to_string(),
+                },
+            }],
+            stored_values: HashMap::new(),
+            error_text: None,
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A bare string argument produces the helper's usage error and no
+    /// content items.
+    #[tokio::test]
+    async fn audio_helper_rejects_bare_string() {
+        let service = CodeModeService::new();
+        let request = ExecuteRequest {
+            source: r#"audio("BASE64");"#.to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: Vec::new(),
+            stored_values: HashMap::new(),
+            error_text: Some(
+                "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
+            ),
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A `mimeType` that is not an audio type (`application/octet-stream`)
+    /// produces the helper's usage error and no content items.
+    #[tokio::test]
+    async fn audio_helper_rejects_non_audio_mime_type() {
+        let service = CodeModeService::new();
+        let request = ExecuteRequest {
+            source: r#"audio({ data: "BASE64", mimeType: "application/octet-stream" });"#
+                .to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: Vec::new(),
+            stored_values: HashMap::new(),
+            error_text: Some(
+                "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
+            ),
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
    #[tokio::test]
    async fn wait_reports_missing_cell_separately_from_runtime_results() {
        let service = CodeModeService::new();
--- a/codex-rs/core/src/context_manager/history.rs
+++ b/codex-rs/core/src/context_manager/history.rs
@@ -113,9 +113,8 @@ impl ContextManager {
    }

    /// Returns the history prepared for sending to the model. This applies a proper
-    /// normalization and drops un-suited items. When `input_modalities` does not
-    /// include `InputModality::Image`, images are stripped from messages and tool
-    /// outputs.
+    /// normalization and drops un-suited items. Unsupported media content is
+    /// stripped from messages and tool outputs according to `input_modalities`.
    pub(crate) fn for_prompt(mut self, input_modalities: &[InputModality]) -> Vec<ResponseItem> {
        self.normalize_history(input_modalities);
        self.items
@@ -365,8 +364,8 @@ impl ContextManager {
        // all outputs must have a corresponding function/tool call
        normalize::remove_orphan_outputs(&mut self.items);

-        // strip images when model does not support them
-        normalize::strip_images_when_unsupported(input_modalities, &mut self.items);
+        // strip unsupported media content before sending history to the model
+        normalize::strip_unsupported_media_content(input_modalities, &mut self.items);
    }

    fn process_item(&self, item: &ResponseItem, policy: TruncationPolicy) -> ResponseItem {
--- a/codex-rs/core/src/context_manager/history_tests.rs
+++ b/codex-rs/core/src/context_manager/history_tests.rs
@@ -10,6 +10,7 @@ use codex_protocol::models::FunctionCallOutputBody;
 use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ImageDetail;
+use codex_protocol::models::InputAudio;
 use codex_protocol::models::LocalShellAction;
 use codex_protocol::models::LocalShellExecAction;
 use codex_protocol::models::LocalShellStatus;
@@ -513,6 +514,85 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
    }
 }

+/// Audio in a tool output is replaced by a text placeholder when the prompt
+/// modalities lack `InputModality::Audio`, and is passed through unchanged
+/// when they include it.
+#[test]
+fn for_prompt_strips_audio_when_model_does_not_support_audio() {
+    let call = ResponseItem::FunctionCall {
+        id: None,
+        name: "audio_tool".to_string(),
+        namespace: None,
+        arguments: "{}".to_string(),
+        call_id: "call-1".to_string(),
+    };
+    let text_item = FunctionCallOutputContentItem::InputText {
+        text: "audio result".to_string(),
+    };
+    let audio_item = FunctionCallOutputContentItem::InputAudio {
+        input_audio: InputAudio {
+            data: "UklGRg==".to_string(),
+            format: "wav".to_string(),
+        },
+    };
+    // Builds the tool output for "call-1": the shared text item followed by `second`.
+    let output_with = |second: FunctionCallOutputContentItem| ResponseItem::FunctionCallOutput {
+        call_id: "call-1".to_string(),
+        output: FunctionCallOutputPayload::from_content_items(vec![text_item.clone(), second]),
+    };
+    let audio_output = output_with(audio_item);
+    let history = create_history_with_items(vec![call.clone(), audio_output.clone()]);
+
+    let default_modalities = default_input_modalities();
+    let stripped = history.clone().for_prompt(&default_modalities);
+    let placeholder = FunctionCallOutputContentItem::InputText {
+        text: "audio content omitted because you do not support audio input".to_string(),
+    };
+    assert_eq!(stripped, vec![call, output_with(placeholder)]);
+
+    let audio_modalities = [
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+    let with_audio = history.for_prompt(&audio_modalities);
+    assert_eq!(with_audio[1], audio_output);
+}
+
 #[test]
 fn for_prompt_preserves_image_generation_calls_when_images_are_supported() {
    let history = create_history_with_items(vec![
@@ -1048,6 +1128,46 @@ fn record_items_truncates_function_call_output_content() {
    }
 }

+/// Audio whose payload exceeds the truncation budget is recorded as an
+/// omission notice, and the raw audio data does not appear in the stored item.
+#[test]
+fn record_items_omits_over_budget_audio_content() {
+    let audio_data = "A".repeat(1_000);
+    let oversized_audio = FunctionCallOutputContentItem::InputAudio {
+        input_audio: InputAudio {
+            data: audio_data.clone(),
+            format: "wav".to_string(),
+        },
+    };
+    let item = ResponseItem::FunctionCallOutput {
+        call_id: "call-audio".to_string(),
+        output: FunctionCallOutputPayload::from_content_items(vec![oversized_audio]),
+    };
+    let expected = FunctionCallOutputPayload::from_content_items(vec![
+        FunctionCallOutputContentItem::InputText {
+            text: "[omitted 1 audio item because its size exceeds the output truncation budget]"
+                .to_string(),
+        },
+    ]);
+
+    let mut history = ContextManager::new();
+    history.record_items([&item], TruncationPolicy::Bytes(32));
+
+    assert_eq!(history.items.len(), 1);
+    let ResponseItem::FunctionCallOutput { output, .. } = &history.items[0] else {
+        panic!("unexpected history item: {:?}", &history.items[0]);
+    };
+    assert_eq!(output, &expected);
+    assert!(
+        !format!("{output:?}").contains(&audio_data),
+        "over-budget audio data should not be retained"
+    );
+}
+
 #[test]
 fn record_items_truncates_custom_tool_call_output_content() {
    let mut history = ContextManager::new();
--- a/codex-rs/core/src/context_manager/normalize.rs
+++ b/codex-rs/core/src/context_manager/normalize.rs
@@ -10,6 +10,8 @@ use tracing::info;

 const IMAGE_CONTENT_OMITTED_PLACEHOLDER: &str =
    "image content omitted because you do not support image input";
+const AUDIO_CONTENT_OMITTED_PLACEHOLDER: &str =
+    "audio content omitted because you do not support audio input";

 pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
    // Collect synthetic outputs to insert immediately after their calls.
@@ -290,14 +292,14 @@ where
    }
 }

-/// Strip image content from messages and tool outputs when the model does not support images.
-/// When `input_modalities` contains `InputModality::Image`, no stripping is performed.
-pub(crate) fn strip_images_when_unsupported(
+/// Strip unsupported media content from messages and tool outputs.
+pub(crate) fn strip_unsupported_media_content(
    input_modalities: &[InputModality],
    items: &mut [ResponseItem],
 ) {
    let supports_images = input_modalities.contains(&InputModality::Image);
-    if supports_images {
+    let supports_audio = input_modalities.contains(&InputModality::Audio);
+    if supports_images && supports_audio {
        return;
    }

@@ -307,7 +309,7 @@ pub(crate) fn strip_images_when_unsupported(
                let mut normalized_content = Vec::with_capacity(content.len());
                for content_item in content.iter() {
                    match content_item {
-                        ContentItem::InputImage { .. } => {
+                        ContentItem::InputImage { .. } if !supports_images => {
                            normalized_content.push(ContentItem::InputText {
                                text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(),
                            });
@@ -323,20 +325,29 @@ pub(crate) fn strip_images_when_unsupported(
                    let mut normalized_content_items = Vec::with_capacity(content_items.len());
                    for content_item in content_items.iter() {
                        match content_item {
-                            FunctionCallOutputContentItem::InputImage { .. } => {
+                            FunctionCallOutputContentItem::InputImage { .. }
+                                if !supports_images =>
+                            {
                                normalized_content_items.push(
                                    FunctionCallOutputContentItem::InputText {
                                        text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(),
                                    },
                                );
                            }
+                            FunctionCallOutputContentItem::InputAudio { .. } if !supports_audio => {
+                                normalized_content_items.push(
+                                    FunctionCallOutputContentItem::InputText {
+                                        text: AUDIO_CONTENT_OMITTED_PLACEHOLDER.to_string(),
+                                    },
+                                );
+                            }
                            _ => normalized_content_items.push(content_item.clone()),
                        }
                    }
                    *content_items = normalized_content_items;
                }
            }
-            ResponseItem::ImageGenerationCall { result, .. } => {
+            ResponseItem::ImageGenerationCall { result, .. } if !supports_images => {
                result.clear();
            }
            _ => {}
--- a/codex-rs/core/src/mcp_tool_call.rs
+++ b/codex-rs/core/src/mcp_tool_call.rs
@@ -582,13 +582,8 @@ async fn execute_mcp_tool_call(
        )
        .await
        .map_err(|e| format!("tool call error: {e:?}"))?;
-    let result = sanitize_mcp_tool_result_for_model(
-        turn_context
-            .model_info
-            .input_modalities
-            .contains(&InputModality::Image),
-        Ok(result),
-    )?;
+    let result =
+        sanitize_mcp_tool_result_for_model(&turn_context.model_info.input_modalities, Ok(result))?;
    Ok(maybe_request_codex_apps_auth_elicitation(
        sess,
        turn_context,
@@ -776,36 +771,61 @@ async fn maybe_mark_thread_memory_mode_polluted(
 }

+/// Adjusts an MCP tool result to the input modalities the model accepts.
+///
+/// Returns an error when `input_modalities` lacks audio and the result holds
+/// an `audio` content block without non-null structured content. When image
+/// input is unsupported, `image` content blocks are replaced by a text
+/// placeholder. An incoming `Err` is returned unchanged.
 fn sanitize_mcp_tool_result_for_model(
-    supports_image_input: bool,
+    input_modalities: &[InputModality],
    result: Result<CallToolResult, String>,
 ) -> Result<CallToolResult, String> {
-    if supports_image_input {
-        return result;
-    }
+    let supports_image_input = input_modalities.contains(&InputModality::Image);
+    let supports_audio_input = input_modalities.contains(&InputModality::Audio);

-    result.map(|call_tool_result| CallToolResult {
-        content: call_tool_result
-            .content
-            .iter()
-            .map(|block| {
-                if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
-                    && content_type == "image"
-                {
-                    return serde_json::json!({
-                        "type": "text",
-                        "text": "<image content omitted because you do not support image input>",
-                    });
-                }
+    result.and_then(|call_tool_result| {
+        // Unsupported audio fails the call instead of being rewritten, unless
+        // non-null structured content is present.
+        // NOTE(review): presumably structured content is what reaches the model
+        // in that case, so the audio block is harmless — confirm against the caller.
+        if !supports_audio_input
+            && !has_non_null_structured_content(&call_tool_result)
+            && call_tool_result
+                .content
+                .iter()
+                .any(|block| block.get("type").and_then(serde_json::Value::as_str) == Some("audio"))
+        {
+            return Err(
+                "audio content returned by MCP tool but the selected model does not support audio input"
+                    .to_string(),
+            );
+        }

-                block.clone()
-            })
-            .collect::<Vec<_>>(),
-        structured_content: call_tool_result.structured_content,
-        is_error: call_tool_result.is_error,
-        meta: call_tool_result.meta,
+        // No image rewriting is needed when the model accepts images.
+        if supports_image_input {
+            return Ok(call_tool_result);
+        }
+
+        Ok(CallToolResult {
+            content: call_tool_result
+                .content
+                .iter()
+                .map(|block| {
+                    if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
+                        && content_type == "image"
+                    {
+                        return serde_json::json!({
+                            "type": "text",
+                            "text": "<image content omitted because you do not support image input>",
+                        });
+                    }
+
+                    block.clone()
+                })
+                .collect::<Vec<_>>(),
+            structured_content: call_tool_result.structured_content,
+            is_error: call_tool_result.is_error,
+            meta: call_tool_result.meta,
+        })
    })
 }

+/// Reports whether `call_tool_result` has structured content that is present
+/// and is not JSON `null`.
+fn has_non_null_structured_content(call_tool_result: &CallToolResult) -> bool {
+    match &call_tool_result.structured_content {
+        Some(structured_content) => !structured_content.is_null(),
+        None => false,
+    }
+}
+
 fn truncate_mcp_tool_result_for_event(
    result: &Result<CallToolResult, String>,
 ) -> Result<CallToolResult, String> {
--- a/codex-rs/core/src/mcp_tool_call_tests.rs
+++ b/codex-rs/core/src/mcp_tool_call_tests.rs
@@ -924,7 +924,7 @@ fn sanitize_mcp_tool_result_for_model_rewrites_image_content() {
        meta: None,
    });

-    let got = sanitize_mcp_tool_result_for_model(/*supports_image_input*/ false, result)
+    let got = sanitize_mcp_tool_result_for_model(&[InputModality::Text], result)
        .expect("sanitized result");

    assert_eq!(
@@ -956,7 +956,7 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
    };

    let got = sanitize_mcp_tool_result_for_model(
-        /*supports_image_input*/ true,
+        &[InputModality::Text, InputModality::Image],
        Ok(original.clone()),
    )
    .expect("unsanitized result");
@@ -964,6 +964,73 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
    assert_eq!(got, original);
 }

+/// An `audio` content block without structured content is rejected with an
+/// error when the modalities contain only text.
+#[test]
+fn sanitize_mcp_tool_result_for_model_rejects_audio_when_unsupported() {
+    let audio_block = serde_json::json!({
+        "type": "audio",
+        "data": "UklGRg==",
+        "mimeType": "audio/wav",
+    });
+    let tool_result = CallToolResult {
+        content: vec![audio_block],
+        structured_content: None,
+        is_error: Some(false),
+        meta: None,
+    };
+
+    let err = sanitize_mcp_tool_result_for_model(&[InputModality::Text], Ok(tool_result))
+        .expect_err("unsupported audio should fail");
+
+    assert_eq!(
+        err,
+        "audio content returned by MCP tool but the selected model does not support audio input"
+    );
+}
+
+/// A result holding an `audio` content block is returned unchanged when the
+/// modalities include text, image and audio.
+#[test]
+fn sanitize_mcp_tool_result_for_model_preserves_audio_when_supported() {
+    let audio_block = serde_json::json!({
+        "type": "audio",
+        "data": "UklGRg==",
+        "mimeType": "audio/wav",
+    });
+    let original = CallToolResult {
+        content: vec![audio_block],
+        structured_content: None,
+        is_error: Some(false),
+        meta: Some(serde_json::json!({"k": "v"})),
+    };
+    let all_modalities = [
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+
+    let got = sanitize_mcp_tool_result_for_model(&all_modalities, Ok(original.clone()))
+        .expect("supported audio should remain unchanged");
+
+    assert_eq!(got, original);
+}
+
+/// With non-null structured content present, an `audio` content block does
+/// not cause an error even though the modalities contain only text; the
+/// result is returned as given.
+#[test]
+fn sanitize_mcp_tool_result_for_model_lets_structured_content_take_precedence_over_audio() {
+    let audio_block = serde_json::json!({
+        "type": "audio",
+        "data": "UklGRg==",
+        "mimeType": "audio/wav",
+    });
+    let original = CallToolResult {
+        content: vec![audio_block],
+        structured_content: Some(serde_json::json!({"answer": "structured"})),
+        is_error: Some(false),
+        meta: None,
+    };
+    let text_only = [InputModality::Text];
+
+    let got = sanitize_mcp_tool_result_for_model(&text_only, Ok(original.clone()))
+        .expect("structured content should take precedence");
+
+    assert_eq!(got, original);
+}
+
 #[test]
 fn truncate_mcp_tool_result_for_event_preserves_small_result() {
    let original = CallToolResult {
--- a/codex-rs/core/src/tools/code_mode/execute_spec.rs
+++ b/codex-rs/core/src/tools/code_mode/execute_spec.rs
@@ -7,8 +7,7 @@ use std::collections::BTreeMap;
 pub(crate) fn create_code_mode_tool(
    enabled_tools: &[CodeModeToolDefinition],
    namespace_descriptions: &BTreeMap<String, codex_code_mode::ToolNamespaceDescription>,
-    code_mode_only: bool,
-    deferred_tools_available: bool,
+    options: codex_code_mode::ExecToolDescriptionOptions,
 ) -> ToolSpec {
    const CODE_MODE_FREEFORM_GRAMMAR: &str = r#"
 start: pragma_source | plain_source
@@ -25,8 +24,7 @@ SOURCE: /[\s\S]+/
        description: codex_code_mode::build_exec_tool_description(
            enabled_tools,
            namespace_descriptions,
-            code_mode_only,
-            deferred_tools_available,
+            options,
        ),
        format: FreeformToolFormat {
            r#type: "grammar".to_string(),
@@ -57,16 +55,22 @@ mod tests {
            create_code_mode_tool(
                &enabled_tools,
                &BTreeMap::new(),
-                /*code_mode_only*/ true,
-                /*deferred_tools_available*/ false,
+                codex_code_mode::ExecToolDescriptionOptions {
+                    code_mode_only: true,
+                    deferred_tools_available: false,
+                    supports_audio_input: false,
+                },
            ),
            ToolSpec::Freeform(FreeformTool {
                name: codex_code_mode::PUBLIC_TOOL_NAME.to_string(),
                description: codex_code_mode::build_exec_tool_description(
                    &enabled_tools,
                    &BTreeMap::new(),
-                    /*code_mode_only*/ true,
-                    /*deferred_tools_available*/ false
+                    codex_code_mode::ExecToolDescriptionOptions {
+                        code_mode_only: true,
+                        deferred_tools_available: false,
+                        supports_audio_input: false,
+                    }
                ),
                format: FreeformToolFormat {
                    r#type: "grammar".to_string(),
--- a/codex-rs/core/src/tools/code_mode/mod.rs
+++ b/codex-rs/core/src/tools/code_mode/mod.rs
@@ -14,6 +14,7 @@ use codex_code_mode::RuntimeResponse;
 use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ResponseInputItem;
+use codex_protocol::openai_models::InputModality;
 use serde_json::Value as JsonValue;
 use tokio_util::sync::CancellationToken;

@@ -168,6 +169,9 @@ pub(super) async fn handle_runtime_response(
    match response {
        RuntimeResponse::Yielded { content_items, .. } => {
            let mut content_items = into_function_call_output_content_items(content_items);
+            if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
+                return Ok(output);
+            }
            sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
            content_items = truncate_code_mode_result(content_items, max_output_tokens);
            prepend_script_status(&mut content_items, &script_status, started_at.elapsed());
@@ -175,6 +179,9 @@ pub(super) async fn handle_runtime_response(
        }
        RuntimeResponse::Terminated { content_items, .. } => {
            let mut content_items = into_function_call_output_content_items(content_items);
+            if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
+                return Ok(output);
+            }
            sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
            content_items = truncate_code_mode_result(content_items, max_output_tokens);
            prepend_script_status(&mut content_items, &script_status, started_at.elapsed());
@@ -187,12 +194,15 @@ pub(super) async fn handle_runtime_response(
            ..
        } => {
            let mut content_items = into_function_call_output_content_items(content_items);
-            sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
            exec.session
                .services
                .code_mode_service
                .replace_stored_values(stored_values)
                .await;
+            if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
+                return Ok(output);
+            }
+            sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
            let success = error_text.is_none();
            if let Some(error_text) = error_text {
                content_items.push(FunctionCallOutputContentItem::InputText {
@@ -209,6 +219,29 @@ pub(super) async fn handle_runtime_response(
    }
 }

+/// Builds the tool output returned in place of code-mode content that
+/// contains audio when the turn's model does not list `InputModality::Audio`.
+///
+/// Returns `None` when the model supports audio input or when `items` holds
+/// no audio item.
+fn unsupported_audio_output(
+    turn: &TurnContext,
+    items: &[FunctionCallOutputContentItem],
+) -> Option<FunctionToolOutput> {
+    let modalities = &turn.model_info.input_modalities;
+    if modalities.contains(&InputModality::Audio) {
+        return None;
+    }
+
+    let emits_audio = items
+        .iter()
+        .any(|item| matches!(item, FunctionCallOutputContentItem::InputAudio { .. }));
+    if !emits_audio {
+        return None;
+    }
+
+    Some(FunctionToolOutput::from_text(
+        "audio content emitted by code mode but the selected model does not support audio input"
+            .to_string(),
+        Some(false),
+    ))
+}
+
 fn sanitize_runtime_image_detail(turn: &TurnContext, items: &mut [FunctionCallOutputContentItem]) {
    sanitize_image_detail_items(can_request_original_image_detail(&turn.model_info), items);
 }
--- a/codex-rs/core/src/tools/code_mode/response_adapter.rs
+++ b/codex-rs/core/src/tools/code_mode/response_adapter.rs
@@ -40,6 +40,14 @@ impl IntoProtocol<FunctionCallOutputContentItem>
                        .or(Some(DEFAULT_IMAGE_DETAIL)),
                }
            }
+            codex_code_mode::FunctionCallOutputContentItem::InputAudio { input_audio } => {
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: codex_protocol::models::InputAudio {
+                        data: input_audio.data,
+                        format: input_audio.format,
+                    },
+                }
+            }
        }
    }
 }
--- a/codex-rs/core/src/tools/spec_plan.rs
+++ b/codex-rs/core/src/tools/spec_plan.rs
@@ -253,8 +253,11 @@ fn build_code_mode_executors(
            create_code_mode_tool(
                &enabled_tools,
                &namespace_descriptions,
-                config.code_mode_only_enabled,
-                deferred_tools_available,
+                codex_code_mode::ExecToolDescriptionOptions {
+                    code_mode_only: config.code_mode_only_enabled,
+                    deferred_tools_available,
+                    supports_audio_input: config.supports_audio_input,
+                },
            ),
            code_mode_nested_tool_specs,
        )),
--- a/codex-rs/core/src/tools/spec_plan_tests.rs
+++ b/codex-rs/core/src/tools/spec_plan_tests.rs
@@ -2352,6 +2352,70 @@ fn code_mode_exec_description_omits_nested_tool_details_when_not_code_mode_only(
    assert!(!description.contains("### `view_image`"));
 }

+/// The `exec` tool description mentions the `audio(audioItem` helper only
+/// when the model's input modalities include audio.
+#[test]
+fn code_mode_exec_audio_helper_docs_require_audio_input_support() {
+    let unsupported_model_info = model_info();
+    let mut supported_model_info = unsupported_model_info.clone();
+    supported_model_info.input_modalities = vec![
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+    let mut features = Features::with_defaults();
+    features.enable(Feature::CodeMode);
+    let available_models = Vec::new();
+
+    // Builds the tool specs for `model_info` and returns the `exec` tool's description.
+    let exec_description = |model_info: &ModelInfo| -> String {
+        let tools_config = ToolsConfig::new(&ToolsConfigParams {
+            model_info,
+            available_models: &available_models,
+            features: &features,
+            image_generation_tool_auth_allowed: true,
+            web_search_mode: Some(WebSearchMode::Cached),
+            session_source: SessionSource::Cli,
+            permission_profile: &PermissionProfile::Disabled,
+            windows_sandbox_level: WindowsSandboxLevel::Disabled,
+        });
+        let (tools, _) = build_specs(
+            &tools_config,
+            /*mcp_tools*/ None,
+            /*deferred_mcp_tools*/ None,
+            &[],
+        );
+        let ToolSpec::Freeform(FreeformTool { description, .. }) = find_tool(&tools, "exec")
+        else {
+            panic!("expected freeform tool");
+        };
+        description.clone()
+    };
+
+    let unsupported_description = exec_description(&unsupported_model_info);
+    assert!(!unsupported_description.contains("`audio(audioItem"));
+
+    let supported_description = exec_description(&supported_model_info);
+    assert!(supported_description.contains("`audio(audioItem"));
+}
+
 fn model_info() -> ModelInfo {
    serde_json::from_value(json!({
        "slug": "gpt-5-codex",
--- a/codex-rs/core/tests/suite/code_mode.rs
+++ b/codex-rs/core/tests/suite/code_mode.rs
@@ -12,6 +12,7 @@ use codex_protocol::dynamic_tools::DynamicToolCallOutputContentItem;
 use codex_protocol::dynamic_tools::DynamicToolResponse;
 use codex_protocol::dynamic_tools::DynamicToolSpec;
 use codex_protocol::models::PermissionProfile;
+use codex_protocol::openai_models::InputModality;
 use codex_protocol::protocol::AskForApproval;
 use codex_protocol::protocol::EventMsg;
 use codex_protocol::protocol::Op;
@@ -175,6 +176,54 @@ async fn run_code_mode_turn(
    Ok((test, second_mock))
 }

+async fn run_code_mode_turn_with_audio_model(
+    server: &MockServer,
+    prompt: &str,
+    code: &str,
+) -> Result<(TestCodex, ResponseMock)> {
+    let mut builder = test_codex()
+        .with_model("gpt-5.4")
+        .with_config(move |config| {
+            let _ = config.features.enable(Feature::CodeMode);
+            let mut model_catalog = bundled_models_response()
+                .unwrap_or_else(|err| panic!("bundled models.json should parse: {err}"));
+            let model = model_catalog
+                .models
+                .iter_mut()
+                .find(|model| model.slug == "gpt-5.4")
+                .expect("gpt-5.4 exists in bundled models.json");
+            model.input_modalities = vec![
+                InputModality::Text,
+                InputModality::Image,
+                InputModality::Audio,
+            ];
+            config.model_catalog = Some(model_catalog);
+        });
+    let test = builder.build(server).await?;
+
+    responses::mount_sse_once(
+        server,
+        sse(vec![
+            ev_response_created("resp-1"),
+            ev_custom_tool_call("call-1", "exec", code),
+            ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+
+    let second_mock = responses::mount_sse_once(
+        server,
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    test.submit_turn(prompt).await?;
+    Ok((test, second_mock))
+}
+
 async fn run_code_mode_turn_with_rmcp(
    server: &MockServer,
    prompt: &str,
@@ -1974,6 +2023,77 @@ image("data:image/png;base64,AAA");
    Ok(())
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_can_output_audio_via_global_helper_for_audio_model() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn_with_audio_model(
+        &server,
+        "use exec to return audio",
+        r#"
+audio({ data: "BASE64", format: "wav" });
+audio({ data: "data:audio/mpeg;base64,MP3BASE64" });
+"#,
+    )
+    .await?;
+
+    let req = second_mock.single_request();
+    let items = custom_tool_output_items(&req, "call-1");
+    let (_, success) = custom_tool_output_body_and_success(&req, "call-1");
+    assert_ne!(
+        success,
+        Some(false),
+        "code_mode audio output failed unexpectedly"
+    );
+    assert_eq!(items.len(), 3);
+    assert_regex_match(
+        concat!(
+            r"(?s)\A",
+            r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
+        ),
+        text_item(&items, /*index*/ 0),
+    );
+    assert_eq!(
+        items[1],
+        serde_json::json!({
+            "type": "input_audio",
+            "input_audio": { "data": "BASE64", "format": "wav" }
+        }),
+    );
+    assert_eq!(
+        items[2],
+        serde_json::json!({
+            "type": "input_audio",
+            "input_audio": { "data": "MP3BASE64", "format": "mp3" }
+        }),
+    );
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_audio_output_fails_for_non_audio_model() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use exec to return audio",
+        r#"audio({ data: "BASE64", format: "wav" });"#,
+    )
+    .await?;
+
+    let req = second_mock.single_request();
+    let (output, _success) = custom_tool_output_body_and_success(&req, "call-1");
+    assert_eq!(
+        output,
+        "audio content emitted by code mode but the selected model does not support audio input"
+    );
+
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> {
    skip_if_no_network!(Ok(()));
@@ -2404,6 +2524,7 @@ text(JSON.stringify(Object.getOwnPropertyNames(globalThis).sort()));
        "WeakSet",
        "__codexContentItems",
        "add_content",
+        "audio",
        "decodeURI",
        "decodeURIComponent",
        "encodeURI",
--- a/codex-rs/core/tests/suite/rmcp_client.rs
+++ b/codex-rs/core/tests/suite/rmcp_client.rs
@@ -93,6 +93,50 @@ fn assert_wall_time_header(output: &str) {
    assert_eq!(marker, "Output:");
 }

+fn test_model_info_with_modalities(
+    slug: &str,
+    description: &str,
+    input_modalities: Vec<InputModality>,
+) -> ModelInfo {
+    ModelInfo {
+        slug: slug.to_string(),
+        display_name: slug.to_string(),
+        description: Some(description.to_string()),
+        default_reasoning_level: None,
+        supported_reasoning_levels: vec![ReasoningEffortPreset {
+            effort: codex_protocol::openai_models::ReasoningEffort::Medium,
+            description: "Medium".to_string(),
+        }],
+        shell_type: ConfigShellToolType::Default,
+        visibility: ModelVisibility::List,
+        supported_in_api: true,
+        priority: 1,
+        additional_speed_tiers: Vec::new(),
+        service_tiers: Vec::new(),
+        upgrade: None,
+        base_instructions: "base instructions".to_string(),
+        model_messages: None,
+        supports_reasoning_summaries: false,
+        default_reasoning_summary: ReasoningSummary::Auto,
+        support_verbosity: false,
+        default_verbosity: None,
+        availability_nux: None,
+        apply_patch_tool_type: None,
+        web_search_tool_type: Default::default(),
+        truncation_policy: TruncationPolicyConfig::bytes(/*limit*/ 10_000),
+        supports_parallel_tool_calls: false,
+        supports_image_detail_original: false,
+        context_window: Some(272_000),
+        max_context_window: None,
+        auto_compact_token_limit: None,
+        effective_context_window_percent: 95,
+        experimental_supported_tools: Vec::new(),
+        input_modalities,
+        used_fallback_model_metadata: false,
+        supports_search_tool: false,
+    }
+}
+
 fn read_only_user_turn(fixture: &TestCodex, text: impl Into<String>) -> Op {
    read_only_user_turn_with_model(fixture, text, fixture.session_configured.model.clone())
 }
@@ -154,7 +198,7 @@ fn remote_aware_stdio_server_bin() -> anyhow::Result<String> {
        return Ok(bin);
    };

-    // Keep the Docker path rewrite scoped to tests that use `build_remote_aware`.
+    // Keep the Docker path rewrite scoped to tests that use `build_with_remote_env`.
    // Other MCP tests still start their stdio server from the orchestrator test
    // process, even when the full-ci remote env is present.
    //
@@ -1386,6 +1430,257 @@ async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Re
    Ok(())
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+#[serial(mcp_test_value)]
+async fn stdio_audio_responses_are_forwarded_for_audio_model() -> anyhow::Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+
+    let call_id = "audio-supported-1";
+    let server_name = "rmcp";
+    let namespace = format!("mcp__{server_name}__");
+    let audio_model_slug = "rmcp-audio-model";
+
+    let models_mock = mount_models_once(
+        &server,
+        ModelsResponse {
+            models: vec![test_model_info_with_modalities(
+                audio_model_slug,
+                "Test model with audio input support",
+                vec![
+                    InputModality::Text,
+                    InputModality::Image,
+                    InputModality::Audio,
+                ],
+            )],
+        },
+    )
+    .await;
+
+    mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let final_mock = mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_assistant_message("msg-1", "rmcp audio tool completed successfully."),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    let rmcp_test_server_bin = remote_aware_stdio_server_bin()?;
+
+    let fixture = test_codex()
+        .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
+        .with_config(move |config| {
+            insert_mcp_server(
+                config,
+                server_name,
+                stdio_transport(
+                    rmcp_test_server_bin,
+                    Some(HashMap::from([
+                        ("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()),
+                        (
+                            "MCP_TEST_AUDIO_MIME_TYPE".to_string(),
+                            "audio/mpeg".to_string(),
+                        ),
+                    ])),
+                    Vec::new(),
+                ),
+                TestMcpServerOptions {
+                    experimental_environment: remote_aware_experimental_environment(),
+                    ..Default::default()
+                },
+            );
+        })
+        .build_with_remote_env(&server)
+        .await?;
+
+    fixture
+        .thread_manager
+        .get_models_manager()
+        .list_models(RefreshStrategy::Online)
+        .await;
+    assert_eq!(models_mock.requests().len(), 1);
+
+    fixture
+        .codex
+        .submit(read_only_user_turn_with_model(
+            &fixture,
+            "call the rmcp audio tool",
+            audio_model_slug.to_string(),
+        ))
+        .await?;
+
+    wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallBegin(_))
+    })
+    .await;
+    wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallEnd(_))
+    })
+    .await;
+    wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
+
+    let output_item = final_mock.single_request().function_call_output(call_id);
+    let output = output_item["output"]
+        .as_array()
+        .expect("audio MCP output should be content items");
+    assert_eq!(output.len(), 2);
+    assert_wall_time_header(
+        output[0]["text"]
+            .as_str()
+            .expect("first MCP audio output item should be wall-time text"),
+    );
+    assert_eq!(
+        output[1],
+        json!({
+            "type": "input_audio",
+            "input_audio": {
+                "data": "UklGRg==",
+                "format": "mp3",
+            },
+        })
+    );
+
+    server.verify().await;
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+#[serial(mcp_test_value)]
+async fn stdio_audio_responses_fail_for_text_only_model() -> anyhow::Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = responses::start_mock_server().await;
+
+    let call_id = "audio-text-only-1";
+    let server_name = "rmcp";
+    let namespace = format!("mcp__{server_name}__");
+    let text_only_model_slug = "rmcp-audio-text-only-model";
+
+    let models_mock = mount_models_once(
+        &server,
+        ModelsResponse {
+            models: vec![test_model_info_with_modalities(
+                text_only_model_slug,
+                "Test model without audio input support",
+                vec![InputModality::Text, InputModality::Image],
+            )],
+        },
+    )
+    .await;
+
+    mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let final_mock = mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_assistant_message("msg-1", "rmcp audio tool failed."),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    let rmcp_test_server_bin = remote_aware_stdio_server_bin()?;
+
+    let fixture = test_codex()
+        .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
+        .with_config(move |config| {
+            insert_mcp_server(
+                config,
+                server_name,
+                stdio_transport(
+                    rmcp_test_server_bin,
+                    Some(HashMap::from([
+                        ("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()),
+                        (
+                            "MCP_TEST_AUDIO_MIME_TYPE".to_string(),
+                            "audio/wav".to_string(),
+                        ),
+                    ])),
+                    Vec::new(),
+                ),
+                TestMcpServerOptions {
+                    experimental_environment: remote_aware_experimental_environment(),
+                    ..Default::default()
+                },
+            );
+        })
+        .build_with_remote_env(&server)
+        .await?;
+
+    fixture
+        .thread_manager
+        .get_models_manager()
+        .list_models(RefreshStrategy::Online)
+        .await;
+    assert_eq!(models_mock.requests().len(), 1);
+
+    fixture
+        .codex
+        .submit(read_only_user_turn_with_model(
+            &fixture,
+            "call the rmcp audio tool",
+            text_only_model_slug.to_string(),
+        ))
+        .await?;
+
+    wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallBegin(_))
+    })
+    .await;
+    let end_event = wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallEnd(_))
+    })
+    .await;
+    let EventMsg::McpToolCallEnd(end) = end_event else {
+        unreachable!("event guard guarantees McpToolCallEnd");
+    };
+    assert_eq!(
+        end.result,
+        Err(
+            "audio content returned by MCP tool but the selected model does not support audio input"
+                .to_string()
+        )
+    );
+    wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
+
+    let output_item = final_mock.single_request().function_call_output(call_id);
+    let output_text = output_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("function_call_output output should be a JSON string");
+    let wrapped_payload = split_wall_time_wrapped_output(output_text);
+    let output_json: Value = serde_json::from_str(wrapped_payload)
+        .expect("function_call_output output should be valid JSON");
+    assert_eq!(
+        output_json,
+        json!([{
+            "type": "text",
+            "text": "audio content returned by MCP tool but the selected model does not support audio input"
+        }])
+    );
+
+    server.verify().await;
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
 #[serial(mcp_test_value)]
 async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {
--- a/codex-rs/protocol/src/models.rs
+++ b/codex-rs/protocol/src/models.rs
@@ -1314,6 +1314,98 @@ pub enum FunctionCallOutputContentItem {
        #[ts(optional)]
        detail: Option<ImageDetail>,
    },
+    // Do not rename, these are serialized and used directly in the responses API.
+    InputAudio {
+        input_audio: InputAudio,
+    },
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)]
+pub struct InputAudio {
+    pub data: String,
+    pub format: String,
+}
+
+/// Builds an [`InputAudio`] from raw base64 or from a base64 `data:` audio URL.
+///
+/// The format is taken from `format`, else the data URL, else `mime_type`.
+/// Returns `None` when the payload is empty, a `data:` URL cannot be parsed,
+/// a given `mime_type` yields no audio format, or no format is found at all.
+pub fn input_audio_from_data(
+    data: &str,
+    format: Option<&str>,
+    mime_type: Option<&str>,
+) -> Option<InputAudio> {
+    let has_data_scheme = data
+        .get(.."data:".len())
+        .is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:"));
+    let (data, data_url_format) = match parse_audio_data_url(data) {
+        Some((payload, url_format)) => (payload, Some(url_format)),
+        // A `data:` URL that failed to parse is rejected, never passed through raw.
+        None if has_data_scheme => return None,
+        None => (data.to_string(), None),
+    };
+    // Catches empty input as well as a data URL with nothing after the comma.
+    if data.is_empty() {
+        return None;
+    }
+
+    // An unusable `mime_type` aborts the conversion even if another source has a format.
+    let mime_type_format = match mime_type {
+        None => None,
+        Some(mime_type) => Some(audio_format_from_mime_type(mime_type)?),
+    };
+
+    let explicit_format = format.and_then(normalize_audio_format);
+    let format = explicit_format.or(data_url_format).or(mime_type_format)?;
+
+    Some(InputAudio { data, format })
+}
+
+/// Splits a base64 `data:` audio URL into `(payload, format)`, or `None` if it
+/// lacks the `data:` scheme, a `,`, a `base64` marker, or an audio media type.
+fn parse_audio_data_url(data_url: &str) -> Option<(String, String)> {
+    const SCHEME: &str = "data:";
+
+    // `get` is `None` for short input or a cut inside a character, so the
+    // slice below cannot panic.
+    let scheme = data_url.get(..SCHEME.len())?;
+    if !scheme.eq_ignore_ascii_case(SCHEME) {
+        return None;
+    }
+    let (metadata, payload) = data_url[SCHEME.len()..].split_once(',')?;
+
+    // The `base64` marker may appear in any `;`-separated position.
+    let is_base64_marker = |part: &str| part.eq_ignore_ascii_case("base64");
+    if !metadata.split(';').any(is_base64_marker) {
+        return None;
+    }
+    let format = audio_format_from_mime_type(metadata.split(';').next()?)?;
+    Some((payload.to_string(), format))
+}
+
+/// Extracts the normalized format from an `audio/*` MIME type, ignoring parameters.
+fn audio_format_from_mime_type(mime_type: &str) -> Option<String> {
+    let essence = mime_type.split(';').next()?.trim().to_ascii_lowercase();
+    normalize_audio_format(essence.strip_prefix("audio/")?)
+}
+
+/// Canonicalizes an audio format name or MIME type; `None` if nothing usable remains.
+fn normalize_audio_format(format: &str) -> Option<String> {
+    let format = format.trim().to_ascii_lowercase();
+    if format.contains('/') {
+        return audio_format_from_mime_type(&format);
+    }
+    // Drop the `x-` prefix, then reject an empty name: a bare "x-" would
+    // otherwise come back as `Some("")` and yield audio with a blank format.
+    let format = format.strip_prefix("x-").unwrap_or(&format);
+    let format = match format {
+        "" => return None,
+        "mpeg" => "mp3",
+        "wave" => "wav",
+        _ => format,
+    };
+    Some(format.to_string())
 }

 /// Converts structured function-call output content into plain text for
@@ -1321,7 +1413,7 @@ pub enum FunctionCallOutputContentItem {
 ///
 /// This conversion is intentionally lossy:
 /// - only `input_text` items are included
-/// - image items are ignored
+/// - image and audio items are ignored
 ///
 /// We use this helper where callers still need a string representation (for
 /// example telemetry previews or legacy string-only output paths) while keeping
@@ -1337,7 +1429,8 @@ pub fn function_call_output_content_items_to_text(
                Some(text.as_str())
            }
            FunctionCallOutputContentItem::InputText { .. }
-            | FunctionCallOutputContentItem::InputImage { .. } => None,
+            | FunctionCallOutputContentItem::InputImage { .. }
+            | FunctionCallOutputContentItem::InputAudio { .. } => None,
        })
        .collect::<Vec<_>>();

@@ -1388,7 +1481,7 @@ impl FunctionCallOutputBody {
    /// human-readable surfaces.
    ///
    /// This conversion is intentionally lossy when the body contains content
-    /// items: image entries are dropped and text entries are joined with
+    /// items: image and audio entries are dropped and text entries are joined with
    /// newlines.
    pub fn to_text(&self) -> Option<String> {
        match self {
@@ -1566,11 +1659,18 @@ fn convert_mcp_content_to_items(
            #[serde(rename = "_meta", default)]
            meta: Option<serde_json::Value>,
        },
+        #[serde(rename = "audio")]
+        Audio {
+            data: String,
+            #[serde(rename = "mimeType", alias = "mime_type")]
+            mime_type: Option<String>,
+        },
        #[serde(other)]
        Unknown,
    }

    let mut saw_image = false;
+    let mut saw_audio = false;
    let mut items = Vec::with_capacity(contents.len());

    for content in contents {
@@ -1603,6 +1703,19 @@ fn convert_mcp_content_to_items(
                        .or(Some(DEFAULT_IMAGE_DETAIL)),
                }
            }
+            Ok(McpContent::Audio { data, mime_type }) => {
+                if let Some(input_audio) =
+                    input_audio_from_data(&data, /*format*/ None, mime_type.as_deref())
+                {
+                    saw_audio = true;
+                    FunctionCallOutputContentItem::InputAudio { input_audio }
+                } else {
+                    FunctionCallOutputContentItem::InputText {
+                        text: serde_json::to_string(content)
+                            .unwrap_or_else(|_| "<content>".to_string()),
+                    }
+                }
+            }
            Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText {
                text: serde_json::to_string(content).unwrap_or_else(|_| "<content>".to_string()),
            },
@@ -1610,7 +1723,11 @@ fn convert_mcp_content_to_items(
        items.push(item);
    }

-    if saw_image { Some(items) } else { None }
+    if saw_image || saw_audio {
+        Some(items)
+    } else {
+        None
+    }
 }

 // Implement Display so callers can treat the payload like a plain string when logging or doing
@@ -2248,6 +2365,198 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn serializes_audio_outputs_as_array() -> Result<()> {
+        let call_tool_result = CallToolResult {
+            content: vec![
+                serde_json::json!({"type":"text","text":"caption"}),
+                serde_json::json!({"type":"audio","data":"BASE64","mimeType":"audio/mpeg"}),
+            ],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+
+        let payload = call_tool_result.into_function_call_output_payload();
+        assert_eq!(payload.success, Some(true));
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        let items = items.to_vec();
+        assert_eq!(
+            items,
+            vec![
+                FunctionCallOutputContentItem::InputText {
+                    text: "caption".into(),
+                },
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: InputAudio {
+                        data: "BASE64".into(),
+                        format: "mp3".into(),
+                    },
+                },
+            ]
+        );
+
+        let item = ResponseInputItem::FunctionCallOutput {
+            call_id: "call1".into(),
+            output: payload,
+        };
+
+        let json = serde_json::to_string(&item)?;
+        let v: serde_json::Value = serde_json::from_str(&json)?;
+
+        assert_eq!(
+            v.get("output").expect("output field"),
+            &serde_json::json!([
+                { "type": "input_text", "text": "caption" },
+                { "type": "input_audio", "input_audio": { "data": "BASE64", "format": "mp3" } }
+            ])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn serializes_mixed_image_and_audio_outputs_as_array() {
+        let call_tool_result = CallToolResult {
+            content: vec![
+                serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}),
+                serde_json::json!({"type":"audio","data":"AUDIO","mimeType":"audio/wav"}),
+            ],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+
+        let payload = call_tool_result.into_function_call_output_payload();
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        assert_eq!(
+            items,
+            [
+                FunctionCallOutputContentItem::InputImage {
+                    image_url: "data:image/png;base64,IMAGE".into(),
+                    detail: Some(DEFAULT_IMAGE_DETAIL),
+                },
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: InputAudio {
+                        data: "AUDIO".into(),
+                        format: "wav".into(),
+                    },
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn strips_audio_data_urls_and_derives_format() {
+        let call_tool_result = CallToolResult {
+            content: vec![serde_json::json!({
+                "type": "audio",
+                "data": "data:audio/ogg;base64,T2dnUw",
+            })],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+
+        let payload = call_tool_result.into_function_call_output_payload();
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        assert_eq!(
+            items,
+            [FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "T2dnUw".into(),
+                    format: "ogg".into(),
+                },
+            }]
+        );
+    }
+
+    #[test]
+    fn audio_without_derivable_format_falls_back_to_text_payload() {
+        let content = vec![serde_json::json!({
+            "type": "audio",
+            "data": "BASE64",
+        })];
+        let call_tool_result = CallToolResult {
+            content: content.clone(),
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+
+        let payload = call_tool_result.into_function_call_output_payload();
+        assert_eq!(
+            payload,
+            FunctionCallOutputPayload {
+                body: FunctionCallOutputBody::Text(serde_json::to_string(&content).unwrap()),
+                success: Some(true),
+            }
+        );
+    }
+
+    #[test]
+    fn malformed_audio_block_falls_back_to_text_inside_structured_payload() {
+        let malformed_audio = serde_json::json!({
+            "type": "audio",
+            "data": "data:image/png;base64,NOT_AUDIO",
+        });
+        let call_tool_result = CallToolResult {
+            content: vec![
+                serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}),
+                malformed_audio.clone(),
+            ],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+
+        let payload = call_tool_result.into_function_call_output_payload();
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        assert_eq!(
+            items,
+            [
+                FunctionCallOutputContentItem::InputImage {
+                    image_url: "data:image/png;base64,IMAGE".into(),
+                    detail: Some(DEFAULT_IMAGE_DETAIL),
+                },
+                FunctionCallOutputContentItem::InputText {
+                    text: serde_json::to_string(&malformed_audio).unwrap(),
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn structured_content_precedence_ignores_audio_content() {
+        let call_tool_result = CallToolResult {
+            content: vec![serde_json::json!({
+                "type": "audio",
+                "data": "BASE64",
+                "mimeType": "audio/wav",
+            })],
+            structured_content: Some(serde_json::json!({ "ok": true })),
+            is_error: Some(false),
+            meta: None,
+        };
+
+        let payload = call_tool_result.into_function_call_output_payload();
+        assert_eq!(
+            payload,
+            FunctionCallOutputPayload {
+                body: FunctionCallOutputBody::Text("{\"ok\":true}".to_string()),
+                success: Some(true),
+            }
+        );
+    }
+
    #[test]
    fn serializes_custom_tool_image_outputs_as_array() -> Result<()> {
        let item = ResponseInputItem::CustomToolCallOutput {
--- a/codex-rs/protocol/src/openai_models.rs
+++ b/codex-rs/protocol/src/openai_models.rs
@@ -82,6 +82,8 @@ pub enum InputModality {
    Text,
    /// Image attachments included in user turns.
    Image,
+    /// Audio content included in tool payloads.
+    Audio,
 }

 /// Backward-compatible default when `input_modalities` is omitted on the wire.
--- a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
+++ b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
@@ -71,6 +71,7 @@ impl TestToolServer {
            Self::cwd_tool(),
            Self::sync_tool(),
            Self::image_tool(),
+            Self::audio_tool(),
            Self::image_scenario_tool(),
            sandbox_meta_tool,
        ];
@@ -227,6 +228,24 @@ impl TestToolServer {
        tool
    }

+    fn audio_tool() -> Tool {
+        #[expect(clippy::expect_used)]
+        let schema: JsonObject = serde_json::from_value(serde_json::json!({
+            "type": "object",
+            "properties": {},
+            "additionalProperties": false
+        }))
+        .expect("audio tool schema should deserialize");
+
+        let mut tool = Tool::new(
+            Cow::Borrowed("audio"),
+            Cow::Borrowed("Return a single audio content block."),
+            Arc::new(schema),
+        );
+        tool.annotations = Some(ToolAnnotations::new().read_only(true));
+        tool
+    }
+
    /// Tool intended for manual testing of Codex TUI rendering for MCP image tool results.
    ///
    /// This exists to exercise edge cases where a `CallToolResult.content` includes image blocks
@@ -543,6 +562,20 @@ impl ServerHandler for TestToolServer {
                    data_b64, mime_type,
                )]))
            }
+            "audio" => {
+                let data =
+                    std::env::var("MCP_TEST_AUDIO_DATA").unwrap_or_else(|_| "QkFTRTY0".to_string());
+                let mime_type = std::env::var("MCP_TEST_AUDIO_MIME_TYPE")
+                    .unwrap_or_else(|_| "audio/wav".to_string());
+
+                Ok(CallToolResult::success(vec![rmcp::model::Annotated::new(
+                    rmcp::model::RawContent::Audio(rmcp::model::RawAudioContent {
+                        data,
+                        mime_type,
+                    }),
+                    None,
+                )]))
+            }
            "image_scenario" => {
                let args = Self::parse_call_args::<ImageScenarioArgs>(&request, "image_scenario")?;
                Self::image_scenario_result(args)
--- a/codex-rs/tools/src/tool_config.rs
+++ b/codex-rs/tools/src/tool_config.rs
@@ -113,6 +113,7 @@ pub struct ToolsConfig {
    pub request_permissions_tool_enabled: bool,
    pub code_mode_enabled: bool,
    pub code_mode_only_enabled: bool,
+    pub supports_audio_input: bool,
    pub can_request_original_image_detail: bool,
    pub collab_tools: bool,
    pub goal_tools: bool,
@@ -187,6 +188,7 @@ impl ToolsConfig {
            && features.enabled(Feature::Apps)
            && features.enabled(Feature::Plugins);
        let include_original_image_detail = can_request_original_image_detail(model_info);
+        let supports_audio_input = model_info.input_modalities.contains(&InputModality::Audio);
        // API-key auth bypasses Codex backend entitlement/tool normalization, so
        // callers must confirm ChatGPT auth before exposing the built-in tool.
        let include_image_gen_tool = *image_generation_tool_auth_allowed
@@ -252,6 +254,7 @@ impl ToolsConfig {
            request_permissions_tool_enabled,
            code_mode_enabled: include_code_mode,
            code_mode_only_enabled: include_code_mode_only,
+            supports_audio_input,
            can_request_original_image_detail: include_original_image_detail,
            collab_tools: include_collab_tools,
            goal_tools: include_goal_tools,
--- a/codex-rs/tools/src/tool_config_tests.rs
+++ b/codex-rs/tools/src/tool_config_tests.rs
@@ -261,6 +261,48 @@ fn image_generation_requires_feature_and_supported_model() {
    assert!(!unsupported_tools_config.image_gen_tool);
 }

+#[test]
+fn audio_input_support_tracks_model_modalities() {
+    let supported_model_info = ModelInfo {
+        input_modalities: vec![
+            InputModality::Text,
+            InputModality::Image,
+            InputModality::Audio,
+        ],
+        ..model_info()
+    };
+    let unsupported_model_info = ModelInfo {
+        input_modalities: vec![InputModality::Text, InputModality::Image],
+        ..model_info()
+    };
+    let features = Features::with_defaults();
+    let available_models = Vec::new();
+
+    let supported_tools_config = ToolsConfig::new(&ToolsConfigParams {
+        model_info: &supported_model_info,
+        available_models: &available_models,
+        features: &features,
+        image_generation_tool_auth_allowed: true,
+        web_search_mode: Some(WebSearchMode::Cached),
+        session_source: SessionSource::Cli,
+        permission_profile: &PermissionProfile::Disabled,
+        windows_sandbox_level: WindowsSandboxLevel::Disabled,
+    });
+    let unsupported_tools_config = ToolsConfig::new(&ToolsConfigParams {
+        model_info: &unsupported_model_info,
+        available_models: &available_models,
+        features: &features,
+        image_generation_tool_auth_allowed: true,
+        web_search_mode: Some(WebSearchMode::Cached),
+        session_source: SessionSource::Cli,
+        permission_profile: &PermissionProfile::Disabled,
+        windows_sandbox_level: WindowsSandboxLevel::Disabled,
+    });
+
+    assert!(supported_tools_config.supports_audio_input);
+    assert!(!unsupported_tools_config.supports_audio_input);
+}
+
 #[test]
 fn provider_capability_methods_disable_provider_bound_tool_surfaces() {
    let model_info = model_info();
--- a/codex-rs/tools/src/tool_output.rs
+++ b/codex-rs/tools/src/tool_output.rs
@@ -209,7 +209,8 @@ fn content_items_to_code_mode_result(items: &[FunctionCallOutputContentItem]) ->
                    Some(image_url.clone())
                }
                FunctionCallOutputContentItem::InputText { .. }
-                | FunctionCallOutputContentItem::InputImage { .. } => None,
+                | FunctionCallOutputContentItem::InputImage { .. }
+                | FunctionCallOutputContentItem::InputAudio { .. } => None,
            })
            .collect::<Vec<_>>()
            .join("\n"),
--- a/codex-rs/utils/output-truncation/src/lib.rs
+++ b/codex-rs/utils/output-truncation/src/lib.rs
@@ -1,6 +1,7 @@
 //! Helpers for truncating tool and exec output using [`TruncationPolicy`](codex_protocol::protocol::TruncationPolicy).

 use codex_protocol::models::FunctionCallOutputContentItem;
+use codex_protocol::models::InputAudio;
 pub use codex_utils_string::approx_bytes_for_tokens;
 pub use codex_utils_string::approx_token_count;
 pub use codex_utils_string::approx_tokens_from_byte_count;
@@ -9,6 +10,9 @@ use codex_utils_string::truncate_middle_with_token_budget;

 pub use codex_protocol::protocol::TruncationPolicy;

+const INPUT_AUDIO_JSON_OVERHEAD_BYTES: usize =
+    r#"{"type":"input_audio","input_audio":{"data":"","format":""}}"#.len();
+
 pub fn formatted_truncate_text(content: &str, policy: TruncationPolicy) -> String {
    if content.len() <= policy.byte_budget() {
        return content.to_string();
@@ -34,12 +38,16 @@ pub fn formatted_truncate_text_content_items_with_policy(
        .iter()
        .filter_map(|item| match item {
            FunctionCallOutputContentItem::InputText { text } => Some(text.as_str()),
-            FunctionCallOutputContentItem::InputImage { .. } => None,
+            FunctionCallOutputContentItem::InputImage { .. }
+            | FunctionCallOutputContentItem::InputAudio { .. } => None,
        })
        .collect::<Vec<_>>();

    if text_segments.is_empty() {
-        return (items.to_vec(), None);
+        return (
+            truncate_function_output_items_with_policy(items, policy),
+            None,
+        );
    }

    let mut combined = String::new();
@@ -50,22 +58,59 @@ pub fn formatted_truncate_text_content_items_with_policy(
        combined.push_str(text);
    }

-    if combined.len() <= policy.byte_budget() {
-        return (items.to_vec(), None);
+    let combined_cost = serialized_byte_cost_for_policy(combined.len(), policy);
+    let budget = budget_for_policy(policy);
+    if combined_cost <= budget {
+        let mut remaining_budget = budget.saturating_sub(combined_cost);
+        let mut out: Vec<FunctionCallOutputContentItem> = Vec::with_capacity(items.len());
+        let mut omitted_audio_items = 0usize;
+
+        for item in items {
+            match item {
+                FunctionCallOutputContentItem::InputText { text } => {
+                    out.push(FunctionCallOutputContentItem::InputText { text: text.clone() });
+                }
+                FunctionCallOutputContentItem::InputImage { image_url, detail } => {
+                    out.push(FunctionCallOutputContentItem::InputImage {
+                        image_url: image_url.clone(),
+                        detail: *detail,
+                    });
+                }
+                FunctionCallOutputContentItem::InputAudio { input_audio } => {
+                    push_audio_item_with_budget(
+                        &mut out,
+                        input_audio,
+                        policy,
+                        &mut remaining_budget,
+                        &mut omitted_audio_items,
+                    );
+                }
+            }
+        }
+
+        push_omitted_audio_summary(&mut out, omitted_audio_items);
+        return (out, None);
    }

    let mut out = vec![FunctionCallOutputContentItem::InputText {
        text: formatted_truncate_text(&combined, policy),
    }];
-    out.extend(items.iter().filter_map(|item| match item {
-        FunctionCallOutputContentItem::InputImage { image_url, detail } => {
-            Some(FunctionCallOutputContentItem::InputImage {
-                image_url: image_url.clone(),
-                detail: *detail,
-            })
+    let mut omitted_audio_items = 0usize;
+    for item in items {
+        match item {
+            FunctionCallOutputContentItem::InputImage { image_url, detail } => {
+                out.push(FunctionCallOutputContentItem::InputImage {
+                    image_url: image_url.clone(),
+                    detail: *detail,
+                });
+            }
+            FunctionCallOutputContentItem::InputAudio { .. } => {
+                omitted_audio_items += 1;
+            }
+            FunctionCallOutputContentItem::InputText { .. } => {}
        }
-        FunctionCallOutputContentItem::InputText { .. } => None,
-    }));
+    }
+    push_omitted_audio_summary(&mut out, omitted_audio_items);

    (out, Some(approx_token_count(&combined)))
 }
@@ -75,11 +120,9 @@ pub fn truncate_function_output_items_with_policy(
    policy: TruncationPolicy,
 ) -> Vec<FunctionCallOutputContentItem> {
    let mut out: Vec<FunctionCallOutputContentItem> = Vec::with_capacity(items.len());
-    let mut remaining_budget = match policy {
-        TruncationPolicy::Bytes(_) => policy.byte_budget(),
-        TruncationPolicy::Tokens(_) => policy.token_budget(),
-    };
+    let mut remaining_budget = budget_for_policy(policy);
    let mut omitted_text_items = 0usize;
+    let mut omitted_audio_items = 0usize;

    for item in items {
        match item {
@@ -89,10 +132,7 @@ pub fn truncate_function_output_items_with_policy(
                    continue;
                }

-                let cost = match policy {
-                    TruncationPolicy::Bytes(_) => text.len(),
-                    TruncationPolicy::Tokens(_) => approx_token_count(text),
-                };
+                let cost = serialized_byte_cost_for_policy(text.len(), policy);

                if cost <= remaining_budget {
                    out.push(FunctionCallOutputContentItem::InputText { text: text.clone() });
@@ -117,6 +157,15 @@ pub fn truncate_function_output_items_with_policy(
                    detail: *detail,
                });
            }
+            FunctionCallOutputContentItem::InputAudio { input_audio } => {
+                push_audio_item_with_budget(
+                    &mut out,
+                    input_audio,
+                    policy,
+                    &mut remaining_budget,
+                    &mut omitted_audio_items,
+                );
+            }
        }
    }

@@ -125,10 +174,72 @@ pub fn truncate_function_output_items_with_policy(
            text: format!("[omitted {omitted_text_items} text items ...]"),
        });
    }
+    push_omitted_audio_summary(&mut out, omitted_audio_items);

    out
 }

+fn budget_for_policy(policy: TruncationPolicy) -> usize {
+    match policy {
+        TruncationPolicy::Bytes(_) => policy.byte_budget(),
+        TruncationPolicy::Tokens(_) => policy.token_budget(),
+    }
+}
+
+fn serialized_byte_cost_for_policy(byte_count: usize, policy: TruncationPolicy) -> usize {
+    match policy {
+        TruncationPolicy::Bytes(_) => byte_count,
+        TruncationPolicy::Tokens(_) => {
+            usize::try_from(approx_tokens_from_byte_count(byte_count)).unwrap_or(usize::MAX)
+        }
+    }
+}
+
+fn push_audio_item_with_budget(
+    out: &mut Vec<FunctionCallOutputContentItem>,
+    input_audio: &InputAudio,
+    policy: TruncationPolicy,
+    remaining_budget: &mut usize,
+    omitted_audio_items: &mut usize,
+) {
+    // Preserve audio only when the payload fits the remaining output budget.
+    let byte_count = INPUT_AUDIO_JSON_OVERHEAD_BYTES
+        .saturating_add(input_audio.data.len())
+        .saturating_add(input_audio.format.len());
+    let cost = serialized_byte_cost_for_policy(byte_count, policy);
+    if cost <= *remaining_budget {
+        out.push(FunctionCallOutputContentItem::InputAudio {
+            input_audio: input_audio.clone(),
+        });
+        *remaining_budget = remaining_budget.saturating_sub(cost);
+    } else {
+        *omitted_audio_items += 1;
+    }
+}
+
+fn push_omitted_audio_summary(
+    out: &mut Vec<FunctionCallOutputContentItem>,
+    omitted_audio_items: usize,
+) {
+    if omitted_audio_items > 0 {
+        let item_word = if omitted_audio_items == 1 {
+            "item"
+        } else {
+            "items"
+        };
+        let owner = if omitted_audio_items == 1 {
+            "its"
+        } else {
+            "their"
+        };
+        out.push(FunctionCallOutputContentItem::InputText {
+            text: format!(
+                "[omitted {omitted_audio_items} audio {item_word} because {owner} size exceeds the output truncation budget]"
+            ),
+        });
+    }
+}
+
 pub fn approx_tokens_from_byte_count_i64(bytes: i64) -> i64 {
    if bytes <= 0 {
        return 0;
--- a/codex-rs/utils/output-truncation/src/truncate_tests.rs
+++ b/codex-rs/utils/output-truncation/src/truncate_tests.rs
@@ -7,8 +7,11 @@ use crate::truncate_function_output_items_with_policy;
 use crate::truncate_text;
 use codex_protocol::models::DEFAULT_IMAGE_DETAIL;
 use codex_protocol::models::FunctionCallOutputContentItem;
+use codex_protocol::models::InputAudio;
 use pretty_assertions::assert_eq;

+const SMALL_AUDIO_SERIALIZED_BYTES: usize = 71;
+
 #[test]
 fn truncate_bytes_less_than_placeholder_returns_placeholder() {
    let content = "example output";
@@ -251,6 +254,141 @@ fn formatted_truncate_text_content_items_with_policy_merges_text_and_appends_ima
    assert_eq!(original_token_count, Some(4));
 }

+#[test]
+fn formatted_truncate_text_content_items_with_policy_preserves_audio_when_budget_allows() {
+    let items = vec![
+        FunctionCallOutputContentItem::InputText {
+            text: "abcd".to_string(),
+        },
+        FunctionCallOutputContentItem::InputAudio {
+            input_audio: InputAudio {
+                data: "UklGRg==".to_string(),
+                format: "wav".to_string(),
+            },
+        },
+        FunctionCallOutputContentItem::InputText {
+            text: "efgh".to_string(),
+        },
+    ];
+
+    let (output, original_token_count) = formatted_truncate_text_content_items_with_policy(
+        &items,
+        TruncationPolicy::Bytes(SMALL_AUDIO_SERIALIZED_BYTES + "abcd\nefgh".len()),
+    );
+
+    assert_eq!(output, items);
+    assert_eq!(original_token_count, None);
+}
+
+#[test]
+fn formatted_truncate_text_content_items_with_policy_omits_audio_when_budget_is_spent() {
+    let items = vec![
+        FunctionCallOutputContentItem::InputText {
+            text: "abcd".to_string(),
+        },
+        FunctionCallOutputContentItem::InputAudio {
+            input_audio: InputAudio {
+                data: "UklGRg==".to_string(),
+                format: "wav".to_string(),
+            },
+        },
+        FunctionCallOutputContentItem::InputText {
+            text: "efgh".to_string(),
+        },
+    ];
+
+    let (output, original_token_count) =
+        formatted_truncate_text_content_items_with_policy(&items, TruncationPolicy::Bytes(4));
+
+    assert_eq!(
+        output,
+        vec![
+            FunctionCallOutputContentItem::InputText {
+                text: "Total output lines: 2\n\nab…5 chars truncated…gh".to_string(),
+            },
+            FunctionCallOutputContentItem::InputText {
+                text:
+                    "[omitted 1 audio item because its size exceeds the output truncation budget]"
+                        .to_string(),
+            },
+        ]
+    );
+    assert_eq!(original_token_count, Some(3));
+}
+
+#[test]
+fn formatted_truncate_text_content_items_with_policy_omits_audio_only_over_budget() {
+    let items = vec![FunctionCallOutputContentItem::InputAudio {
+        input_audio: InputAudio {
+            data: "A".repeat(200),
+            format: "wav".to_string(),
+        },
+    }];
+
+    let (output, original_token_count) =
+        formatted_truncate_text_content_items_with_policy(&items, TruncationPolicy::Bytes(32));
+
+    assert_eq!(
+        output,
+        vec![FunctionCallOutputContentItem::InputText {
+            text: "[omitted 1 audio item because its size exceeds the output truncation budget]"
+                .to_string(),
+        }]
+    );
+    assert_eq!(original_token_count, None);
+}
+
+#[test]
+fn truncate_function_output_items_with_policy_omits_audio_over_budget() {
+    let items = vec![FunctionCallOutputContentItem::InputAudio {
+        input_audio: InputAudio {
+            data: "A".repeat(200),
+            format: "wav".to_string(),
+        },
+    }];
+
+    let output = truncate_function_output_items_with_policy(&items, TruncationPolicy::Bytes(32));
+
+    assert_eq!(
+        output,
+        vec![FunctionCallOutputContentItem::InputText {
+            text: "[omitted 1 audio item because its size exceeds the output truncation budget]"
+                .to_string(),
+        }]
+    );
+}
+
+#[test]
+fn truncate_function_output_items_with_policy_charges_preserved_audio_to_budget() {
+    let audio = FunctionCallOutputContentItem::InputAudio {
+        input_audio: InputAudio {
+            data: "UklGRg==".to_string(),
+            format: "wav".to_string(),
+        },
+    };
+    let items = vec![
+        audio.clone(),
+        FunctionCallOutputContentItem::InputText {
+            text: "tail".to_string(),
+        },
+    ];
+
+    let output = truncate_function_output_items_with_policy(
+        &items,
+        TruncationPolicy::Bytes(SMALL_AUDIO_SERIALIZED_BYTES),
+    );
+
+    assert_eq!(
+        output,
+        vec![
+            audio,
+            FunctionCallOutputContentItem::InputText {
+                text: "[omitted 1 text items ...]".to_string(),
+            },
+        ]
+    );
+}
+
 #[test]
 fn formatted_truncate_text_content_items_with_policy_merges_all_text_for_token_budget() {
    let items = vec![