mirror of
https://github.com/openai/codex.git
synced 2026-04-29 00:55:38 +00:00
Add realtime voice selection (#17176)
- Add realtime voice selection for realtime/start.
- Expose the supported v1/v2 voice lists and cover explicit, configured, default, and invalid voice paths.
This commit is contained in:
@@ -226,6 +226,7 @@ mod tests {
|
||||
use codex_client::Response;
|
||||
use codex_client::StreamResponse;
|
||||
use codex_client::TransportError;
|
||||
use codex_protocol::protocol::RealtimeVoice;
|
||||
use http::StatusCode;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::sync::Mutex;
|
||||
@@ -308,6 +309,7 @@ mod tests {
|
||||
session_id: Some(session_id.to_string()),
|
||||
event_parser: RealtimeEventParser::RealtimeV2,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Marin,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
|
||||
use crate::endpoint::realtime_websocket::protocol::parse_realtime_event;
|
||||
use crate::error::ApiError;
|
||||
use crate::provider::Provider;
|
||||
@@ -306,9 +307,10 @@ impl RealtimeWebsocketWriter {
|
||||
&self,
|
||||
instructions: String,
|
||||
session_mode: RealtimeSessionMode,
|
||||
voice: RealtimeVoice,
|
||||
) -> Result<(), ApiError> {
|
||||
let session_mode = normalized_session_mode(self.event_parser, session_mode);
|
||||
let session = session_update_session(self.event_parser, instructions, session_mode);
|
||||
let session = session_update_session(self.event_parser, instructions, session_mode, voice);
|
||||
self.send_json(&RealtimeOutboundMessage::SessionUpdate { session })
|
||||
.await
|
||||
}
|
||||
@@ -577,7 +579,7 @@ impl RealtimeWebsocketClient {
|
||||
);
|
||||
connection
|
||||
.writer
|
||||
.send_session_update(config.instructions, config.session_mode)
|
||||
.send_session_update(config.instructions, config.session_mode, config.voice)
|
||||
.await?;
|
||||
Ok(connection)
|
||||
}
|
||||
@@ -722,6 +724,7 @@ mod tests {
|
||||
use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use codex_protocol::protocol::RealtimeInputAudioSpeechStarted;
|
||||
use codex_protocol::protocol::RealtimeResponseCancelled;
|
||||
use codex_protocol::protocol::RealtimeVoice;
|
||||
use http::HeaderValue;
|
||||
use pretty_assertions::assert_eq;
|
||||
use serde_json::Value;
|
||||
@@ -1238,7 +1241,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["output"]["voice"],
|
||||
Value::String("fathom".to_string())
|
||||
Value::String("breeze".to_string())
|
||||
);
|
||||
|
||||
ws.send(Message::Text(
|
||||
@@ -1371,6 +1374,7 @@ mod tests {
|
||||
session_id: Some("conv_1".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Breeze,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -1546,7 +1550,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["output"]["voice"],
|
||||
Value::String("marin".to_string())
|
||||
Value::String("cedar".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][0]["type"],
|
||||
@@ -1644,6 +1648,7 @@ mod tests {
|
||||
session_id: Some("conv_1".to_string()),
|
||||
event_parser: RealtimeEventParser::RealtimeV2,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Cedar,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -1748,6 +1753,7 @@ mod tests {
|
||||
session_id: Some("conv_1".to_string()),
|
||||
event_parser: RealtimeEventParser::RealtimeV2,
|
||||
session_mode: RealtimeSessionMode::Transcription,
|
||||
voice: RealtimeVoice::Marin,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -1811,7 +1817,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["output"]["voice"],
|
||||
Value::String("fathom".to_string())
|
||||
Value::String("cove".to_string())
|
||||
);
|
||||
assert!(first_json["session"].get("tools").is_none());
|
||||
|
||||
@@ -1850,6 +1856,7 @@ mod tests {
|
||||
session_id: Some("conv_1".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Transcription,
|
||||
voice: RealtimeVoice::Cove,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -1938,6 +1945,7 @@ mod tests {
|
||||
session_id: Some("conv_1".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Cove,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
|
||||
use serde_json::Result as JsonResult;
|
||||
use serde_json::Value;
|
||||
@@ -56,11 +57,14 @@ pub(super) fn session_update_session(
|
||||
event_parser: RealtimeEventParser,
|
||||
instructions: String,
|
||||
session_mode: RealtimeSessionMode,
|
||||
voice: RealtimeVoice,
|
||||
) -> SessionUpdateSession {
|
||||
let session_mode = normalized_session_mode(event_parser, session_mode);
|
||||
match event_parser {
|
||||
RealtimeEventParser::V1 => v1_session_update_session(instructions),
|
||||
RealtimeEventParser::RealtimeV2 => v2_session_update_session(instructions, session_mode),
|
||||
RealtimeEventParser::V1 => v1_session_update_session(instructions, voice),
|
||||
RealtimeEventParser::RealtimeV2 => {
|
||||
v2_session_update_session(instructions, session_mode, voice)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,6 +73,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult<
|
||||
config.event_parser,
|
||||
config.instructions,
|
||||
config.session_mode,
|
||||
config.voice,
|
||||
);
|
||||
session.id = config.session_id;
|
||||
session.model = config.model;
|
||||
|
||||
@@ -7,11 +7,11 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemType;
|
||||
use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem;
|
||||
use crate::endpoint::realtime_websocket::protocol::ConversationRole;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionType;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
|
||||
|
||||
@@ -38,7 +38,10 @@ pub(super) fn conversation_handoff_append_message(
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn session_update_session(instructions: String) -> SessionUpdateSession {
|
||||
pub(super) fn session_update_session(
|
||||
instructions: String,
|
||||
voice: RealtimeVoice,
|
||||
) -> SessionUpdateSession {
|
||||
SessionUpdateSession {
|
||||
id: None,
|
||||
r#type: SessionType::Quicksilver,
|
||||
@@ -56,7 +59,7 @@ pub(super) fn session_update_session(instructions: String) -> SessionUpdateSessi
|
||||
},
|
||||
output: Some(SessionAudioOutput {
|
||||
format: None,
|
||||
voice: SessionAudioVoice::Fathom,
|
||||
voice,
|
||||
}),
|
||||
},
|
||||
tools: None,
|
||||
|
||||
@@ -10,12 +10,12 @@ use crate::endpoint::realtime_websocket::protocol::ConversationRole;
|
||||
use crate::endpoint::realtime_websocket::protocol::NoiseReductionType;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionFunctionTool;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionNoiseReduction;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolType;
|
||||
@@ -59,6 +59,7 @@ pub(super) fn conversation_handoff_append_message(
|
||||
pub(super) fn session_update_session(
|
||||
instructions: String,
|
||||
session_mode: RealtimeSessionMode,
|
||||
voice: RealtimeVoice,
|
||||
) -> SessionUpdateSession {
|
||||
match session_mode {
|
||||
RealtimeSessionMode::Conversational => SessionUpdateSession {
|
||||
@@ -87,7 +88,7 @@ pub(super) fn session_update_session(
|
||||
r#type: AudioFormatType::AudioPcm,
|
||||
rate: REALTIME_AUDIO_SAMPLE_RATE,
|
||||
}),
|
||||
voice: SessionAudioVoice::Marin,
|
||||
voice,
|
||||
}),
|
||||
},
|
||||
tools: Some(vec![SessionFunctionTool {
|
||||
|
||||
@@ -4,6 +4,7 @@ pub use codex_protocol::protocol::RealtimeAudioFrame;
|
||||
pub use codex_protocol::protocol::RealtimeEvent;
|
||||
pub use codex_protocol::protocol::RealtimeTranscriptDelta;
|
||||
pub use codex_protocol::protocol::RealtimeTranscriptEntry;
|
||||
pub use codex_protocol::protocol::RealtimeVoice;
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
|
||||
@@ -26,6 +27,7 @@ pub struct RealtimeSessionConfig {
|
||||
pub session_id: Option<String>,
|
||||
pub event_parser: RealtimeEventParser,
|
||||
pub session_mode: RealtimeSessionMode,
|
||||
pub voice: RealtimeVoice,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
@@ -106,15 +108,7 @@ pub(super) enum AudioFormatType {
|
||||
pub(super) struct SessionAudioOutput {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) format: Option<SessionAudioOutputFormat>,
|
||||
pub(super) voice: SessionAudioVoice,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize)]
|
||||
pub(super) enum SessionAudioVoice {
|
||||
#[serde(rename = "fathom")]
|
||||
Fathom,
|
||||
#[serde(rename = "marin")]
|
||||
Marin,
|
||||
pub(super) voice: RealtimeVoice,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
|
||||
@@ -11,6 +11,7 @@ use codex_api::RealtimeSessionMode;
|
||||
use codex_api::RealtimeWebsocketClient;
|
||||
use codex_api::RetryConfig;
|
||||
use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use codex_protocol::protocol::RealtimeVoice;
|
||||
use futures::SinkExt;
|
||||
use futures::StreamExt;
|
||||
use http::HeaderMap;
|
||||
@@ -144,6 +145,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() {
|
||||
session_id: Some("conv_123".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Cove,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -246,6 +248,7 @@ async fn realtime_ws_connect_webrtc_sideband_retries_join_until_server_is_availa
|
||||
session_id: Some("conv_123".to_string()),
|
||||
event_parser: RealtimeEventParser::RealtimeV2,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Marin,
|
||||
},
|
||||
"rtc_test",
|
||||
HeaderMap::new(),
|
||||
@@ -316,6 +319,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() {
|
||||
session_id: Some("conv_123".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Cove,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -382,6 +386,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() {
|
||||
session_id: Some("conv_123".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Cove,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -444,6 +449,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() {
|
||||
session_id: Some("conv_123".to_string()),
|
||||
event_parser: RealtimeEventParser::V1,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Cove,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -509,6 +515,7 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() {
|
||||
session_id: Some("conv_123".to_string()),
|
||||
event_parser: RealtimeEventParser::RealtimeV2,
|
||||
session_mode: RealtimeSessionMode::Conversational,
|
||||
voice: RealtimeVoice::Marin,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
|
||||
Reference in New Issue
Block a user