Use parser-specific realtime voice enum (#14636)

Model the realtime session output voice as an enum and select the variant per parser, so that v1 uses `fathom` and v2 uses `alloy`.

Co-authored-by: Codex <noreply@openai.com>
2026-04-29 08:56:38 +00:00 · 2026-03-13 16:17:13 -07:00
parent e9050e3e64
commit 7fa5201365
2 changed files with 19 additions and 9 deletions
--- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs
+++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs
@@ -77,7 +77,15 @@ pub(super) struct SessionAudioFormat {

 #[derive(Debug, Clone, Serialize)]
 pub(super) struct SessionAudioOutput {
-    pub(super) voice: String,
+    pub(super) voice: SessionAudioVoice,
+}
+
+#[derive(Debug, Clone, Copy, Serialize)]
+pub(super) enum SessionAudioVoice {
+    #[serde(rename = "fathom")]
+    Fathom,
+    #[serde(rename = "alloy")]
+    Alloy,
 }

 #[derive(Debug, Clone, Serialize)]