Add realtime voice selection (#17176)

- Add realtime voice selection for realtime/start.
- Expose the supported v1/v2 voice lists and cover explicit, configured,
  default, and invalid voice paths.
This commit is contained in:
Ahmed Ibrahim
2026-04-08 20:19:15 -07:00
committed by GitHub
parent 4c2a1ae31b
commit 2f9090be62
36 changed files with 860 additions and 33 deletions

View File

@@ -226,6 +226,7 @@ mod tests {
use codex_client::Response;
use codex_client::StreamResponse;
use codex_client::TransportError;
use codex_protocol::protocol::RealtimeVoice;
use http::StatusCode;
use pretty_assertions::assert_eq;
use std::sync::Mutex;
@@ -308,6 +309,7 @@ mod tests {
session_id: Some(session_id.to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Marin,
}
}

View File

@@ -11,6 +11,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::parse_realtime_event;
use crate::error::ApiError;
use crate::provider::Provider;
@@ -306,9 +307,10 @@ impl RealtimeWebsocketWriter {
&self,
instructions: String,
session_mode: RealtimeSessionMode,
voice: RealtimeVoice,
) -> Result<(), ApiError> {
let session_mode = normalized_session_mode(self.event_parser, session_mode);
let session = session_update_session(self.event_parser, instructions, session_mode);
let session = session_update_session(self.event_parser, instructions, session_mode, voice);
self.send_json(&RealtimeOutboundMessage::SessionUpdate { session })
.await
}
@@ -577,7 +579,7 @@ impl RealtimeWebsocketClient {
);
connection
.writer
.send_session_update(config.instructions, config.session_mode)
.send_session_update(config.instructions, config.session_mode, config.voice)
.await?;
Ok(connection)
}
@@ -722,6 +724,7 @@ mod tests {
use codex_protocol::protocol::RealtimeHandoffRequested;
use codex_protocol::protocol::RealtimeInputAudioSpeechStarted;
use codex_protocol::protocol::RealtimeResponseCancelled;
use codex_protocol::protocol::RealtimeVoice;
use http::HeaderValue;
use pretty_assertions::assert_eq;
use serde_json::Value;
@@ -1238,7 +1241,7 @@ mod tests {
);
assert_eq!(
first_json["session"]["audio"]["output"]["voice"],
Value::String("fathom".to_string())
Value::String("breeze".to_string())
);
ws.send(Message::Text(
@@ -1371,6 +1374,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Breeze,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -1546,7 +1550,7 @@ mod tests {
);
assert_eq!(
first_json["session"]["audio"]["output"]["voice"],
Value::String("marin".to_string())
Value::String("cedar".to_string())
);
assert_eq!(
first_json["session"]["tools"][0]["type"],
@@ -1644,6 +1648,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Cedar,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -1748,6 +1753,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Transcription,
voice: RealtimeVoice::Marin,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -1811,7 +1817,7 @@ mod tests {
);
assert_eq!(
first_json["session"]["audio"]["output"]["voice"],
Value::String("fathom".to_string())
Value::String("cove".to_string())
);
assert!(first_json["session"].get("tools").is_none());
@@ -1850,6 +1856,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Transcription,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -1938,6 +1945,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
HeaderMap::new(),

View File

@@ -10,6 +10,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
use serde_json::Result as JsonResult;
use serde_json::Value;
@@ -56,11 +57,14 @@ pub(super) fn session_update_session(
event_parser: RealtimeEventParser,
instructions: String,
session_mode: RealtimeSessionMode,
voice: RealtimeVoice,
) -> SessionUpdateSession {
let session_mode = normalized_session_mode(event_parser, session_mode);
match event_parser {
RealtimeEventParser::V1 => v1_session_update_session(instructions),
RealtimeEventParser::RealtimeV2 => v2_session_update_session(instructions, session_mode),
RealtimeEventParser::V1 => v1_session_update_session(instructions, voice),
RealtimeEventParser::RealtimeV2 => {
v2_session_update_session(instructions, session_mode, voice)
}
}
}
@@ -69,6 +73,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult<
config.event_parser,
config.instructions,
config.session_mode,
config.voice,
);
session.id = config.session_id;
session.model = config.model;

View File

@@ -7,11 +7,11 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemType;
use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem;
use crate::endpoint::realtime_websocket::protocol::ConversationRole;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice;
use crate::endpoint::realtime_websocket::protocol::SessionType;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
@@ -38,7 +38,10 @@ pub(super) fn conversation_handoff_append_message(
}
}
pub(super) fn session_update_session(instructions: String) -> SessionUpdateSession {
pub(super) fn session_update_session(
instructions: String,
voice: RealtimeVoice,
) -> SessionUpdateSession {
SessionUpdateSession {
id: None,
r#type: SessionType::Quicksilver,
@@ -56,7 +59,7 @@ pub(super) fn session_update_session(instructions: String) -> SessionUpdateSessi
},
output: Some(SessionAudioOutput {
format: None,
voice: SessionAudioVoice::Fathom,
voice,
}),
},
tools: None,

View File

@@ -10,12 +10,12 @@ use crate::endpoint::realtime_websocket::protocol::ConversationRole;
use crate::endpoint::realtime_websocket::protocol::NoiseReductionType;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputFormat;
use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice;
use crate::endpoint::realtime_websocket::protocol::SessionFunctionTool;
use crate::endpoint::realtime_websocket::protocol::SessionNoiseReduction;
use crate::endpoint::realtime_websocket::protocol::SessionToolType;
@@ -59,6 +59,7 @@ pub(super) fn conversation_handoff_append_message(
pub(super) fn session_update_session(
instructions: String,
session_mode: RealtimeSessionMode,
voice: RealtimeVoice,
) -> SessionUpdateSession {
match session_mode {
RealtimeSessionMode::Conversational => SessionUpdateSession {
@@ -87,7 +88,7 @@ pub(super) fn session_update_session(
r#type: AudioFormatType::AudioPcm,
rate: REALTIME_AUDIO_SAMPLE_RATE,
}),
voice: SessionAudioVoice::Marin,
voice,
}),
},
tools: Some(vec![SessionFunctionTool {

View File

@@ -4,6 +4,7 @@ pub use codex_protocol::protocol::RealtimeAudioFrame;
pub use codex_protocol::protocol::RealtimeEvent;
pub use codex_protocol::protocol::RealtimeTranscriptDelta;
pub use codex_protocol::protocol::RealtimeTranscriptEntry;
pub use codex_protocol::protocol::RealtimeVoice;
use serde::Serialize;
use serde_json::Value;
@@ -26,6 +27,7 @@ pub struct RealtimeSessionConfig {
pub session_id: Option<String>,
pub event_parser: RealtimeEventParser,
pub session_mode: RealtimeSessionMode,
pub voice: RealtimeVoice,
}
#[derive(Debug, Clone, Serialize)]
@@ -106,15 +108,7 @@ pub(super) enum AudioFormatType {
pub(super) struct SessionAudioOutput {
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) format: Option<SessionAudioOutputFormat>,
pub(super) voice: SessionAudioVoice,
}
#[derive(Debug, Clone, Copy, Serialize)]
pub(super) enum SessionAudioVoice {
#[serde(rename = "fathom")]
Fathom,
#[serde(rename = "marin")]
Marin,
pub(super) voice: RealtimeVoice,
}
#[derive(Debug, Clone, Serialize)]

View File

@@ -11,6 +11,7 @@ use codex_api::RealtimeSessionMode;
use codex_api::RealtimeWebsocketClient;
use codex_api::RetryConfig;
use codex_protocol::protocol::RealtimeHandoffRequested;
use codex_protocol::protocol::RealtimeVoice;
use futures::SinkExt;
use futures::StreamExt;
use http::HeaderMap;
@@ -144,6 +145,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() {
session_id: Some("conv_123".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -246,6 +248,7 @@ async fn realtime_ws_connect_webrtc_sideband_retries_join_until_server_is_availa
session_id: Some("conv_123".to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Marin,
},
"rtc_test",
HeaderMap::new(),
@@ -316,6 +319,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() {
session_id: Some("conv_123".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -382,6 +386,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() {
session_id: Some("conv_123".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -444,6 +449,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() {
session_id: Some("conv_123".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -509,6 +515,7 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() {
session_id: Some("conv_123".to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Conversational,
voice: RealtimeVoice::Marin,
},
HeaderMap::new(),
HeaderMap::new(),