Add realtime voice selection (#17176)

- Add realtime voice selection for realtime/start.
- Expose the supported v1/v2 voice lists and cover explicit, configured,
default, and invalid voice paths.
This commit is contained in:
Ahmed Ibrahim
2026-04-08 20:19:15 -07:00
committed by GitHub
parent 4c2a1ae31b
commit 2f9090be62
36 changed files with 860 additions and 33 deletions

View File

@@ -144,6 +144,8 @@ pub struct ConversationStartParams {
pub session_id: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub transport: Option<ConversationStartTransport>,
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<RealtimeVoice>,
}
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
@@ -176,6 +178,101 @@ mod conversation_start_prompt_serde {
}
}
// A voice that can be selected for a realtime conversation (see the `voice`
// field of `ConversationStartParams`).
//
// Both serde and ts-rs rename the variants to snake_case, so every variant is
// written as its lowercase name (for example `Cove` serializes as "cove").
// Variants are declared in alphabetical order; the derived `Ord`/`PartialOrd`
// compare by that declaration order.
//
// (Kept as plain `//` comments so derive-generated schema/TS output is untouched.)
#[derive(
Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Hash, JsonSchema, TS, Ord, PartialOrd,
)]
#[serde(rename_all = "snake_case")]
#[ts(rename_all = "snake_case")]
pub enum RealtimeVoice {
Alloy,
Arbor,
Ash,
Ballad,
Breeze,
Cedar,
Coral,
Cove,
Echo,
Ember,
Juniper,
Maple,
Marin,
Sage,
Shimmer,
Sol,
Spruce,
Vale,
Verse,
}
impl RealtimeVoice {
pub fn wire_name(self) -> &'static str {
match self {
Self::Alloy => "alloy",
Self::Arbor => "arbor",
Self::Ash => "ash",
Self::Ballad => "ballad",
Self::Breeze => "breeze",
Self::Cedar => "cedar",
Self::Coral => "coral",
Self::Cove => "cove",
Self::Echo => "echo",
Self::Ember => "ember",
Self::Juniper => "juniper",
Self::Maple => "maple",
Self::Marin => "marin",
Self::Sage => "sage",
Self::Shimmer => "shimmer",
Self::Sol => "sol",
Self::Spruce => "spruce",
Self::Vale => "vale",
Self::Verse => "verse",
}
}
}
// The realtime voices offered for selection, split into a `v1` group and a
// `v2` group, together with the default voice of each group.
// NOTE(review): `v1`/`v2` presumably refer to realtime API versions — confirm.
//
// Fields are renamed to camelCase when serialized, so `default_v1` and
// `default_v2` appear as `defaultV1` and `defaultV2`.
//
// (Kept as plain `//` comments so derive-generated schema/TS output is untouched.)
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(rename_all = "camelCase")]
pub struct RealtimeVoicesList {
// Voices in the v1 group, in presentation order.
pub v1: Vec<RealtimeVoice>,
// Voices in the v2 group, in presentation order.
pub v2: Vec<RealtimeVoice>,
// Default voice for the v1 group.
pub default_v1: RealtimeVoice,
// Default voice for the v2 group.
pub default_v2: RealtimeVoice,
}
impl RealtimeVoicesList {
    /// Returns the hard-coded catalogue of realtime voices.
    ///
    /// The `v1` group holds nine voices and defaults to `Cove`; the `v2` group
    /// holds ten voices and defaults to `Marin`. Each default is a member of its
    /// own group. Element order is part of the returned value, so it must not be
    /// rearranged casually.
    pub fn builtin() -> Self {
        let v1 = vec![
            RealtimeVoice::Juniper,
            RealtimeVoice::Maple,
            RealtimeVoice::Spruce,
            RealtimeVoice::Ember,
            RealtimeVoice::Vale,
            RealtimeVoice::Breeze,
            RealtimeVoice::Arbor,
            RealtimeVoice::Sol,
            RealtimeVoice::Cove,
        ];
        let v2 = vec![
            RealtimeVoice::Alloy,
            RealtimeVoice::Ash,
            RealtimeVoice::Ballad,
            RealtimeVoice::Coral,
            RealtimeVoice::Echo,
            RealtimeVoice::Sage,
            RealtimeVoice::Shimmer,
            RealtimeVoice::Verse,
            RealtimeVoice::Marin,
            RealtimeVoice::Cedar,
        ];

        Self {
            v1,
            v2,
            default_v1: RealtimeVoice::Cove,
            default_v2: RealtimeVoice::Marin,
        }
    }
}
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
pub struct RealtimeAudioFrame {
pub data: String,
@@ -271,6 +368,9 @@ pub enum Op {
/// Close the running realtime conversation stream.
RealtimeConversationClose,
/// Request the list of voices supported by realtime conversation streams.
RealtimeConversationListVoices,
/// Legacy user input.
///
/// Prefer [`Op::UserTurn`] so the caller provides full turn context
@@ -617,6 +717,7 @@ impl Op {
Self::RealtimeConversationAudio(_) => "realtime_conversation_audio",
Self::RealtimeConversationText(_) => "realtime_conversation_text",
Self::RealtimeConversationClose => "realtime_conversation_close",
Self::RealtimeConversationListVoices => "realtime_conversation_list_voices",
Self::UserInput { .. } => "user_input",
Self::UserTurn { .. } => "user_turn",
Self::InterAgentCommunication { .. } => "inter_agent_communication",
@@ -1398,6 +1499,9 @@ pub enum EventMsg {
/// List of skills available to the agent.
ListSkillsResponse(ListSkillsResponseEvent),
/// List of voices supported by realtime conversation streams.
RealtimeConversationListVoicesResponse(RealtimeConversationListVoicesResponseEvent),
/// Notification that skill data may have been updated and clients may want to reload.
SkillsUpdateAvailable,
@@ -3147,6 +3251,11 @@ pub struct ListSkillsResponseEvent {
pub skills: Vec<SkillsListEntry>,
}
// Payload of `EventMsg::RealtimeConversationListVoicesResponse`: the list of
// voices supported by realtime conversation streams.
//
// (Kept as plain `//` comments so derive-generated schema/TS output is untouched.)
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
pub struct RealtimeConversationListVoicesResponseEvent {
// Supported voices, grouped into v1/v2 lists with a default for each group.
pub voices: RealtimeVoicesList,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, TS)]
#[serde(rename_all = "lowercase")]
#[ts(rename_all = "lowercase")]
@@ -4441,6 +4550,7 @@ mod tests {
prompt: Some(Some("be helpful".to_string())),
session_id: Some("conv_1".to_string()),
transport: None,
voice: None,
});
let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams {
prompt: Some(Some("be helpful".to_string())),
@@ -4448,6 +4558,7 @@ mod tests {
transport: Some(ConversationStartTransport::Webrtc {
sdp: "v=offer\r\n".to_string(),
}),
voice: Some(RealtimeVoice::Cove),
});
let text = Op::RealtimeConversationText(ConversationTextParams {
text: "hello".to_string(),
@@ -4457,12 +4568,15 @@ mod tests {
prompt: None,
session_id: None,
transport: None,
voice: None,
});
let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams {
prompt: Some(None),
session_id: None,
transport: None,
voice: None,
});
let list_voices = Op::RealtimeConversationListVoices;
assert_eq!(
serde_json::to_value(&start).unwrap(),
@@ -4526,6 +4640,16 @@ mod tests {
serde_json::from_value::<Op>(serde_json::to_value(&close).unwrap()).unwrap(),
close
);
assert_eq!(
serde_json::to_value(&list_voices).unwrap(),
json!({
"type": "realtime_conversation_list_voices"
})
);
assert_eq!(
serde_json::from_value::<Op>(serde_json::to_value(&list_voices).unwrap()).unwrap(),
list_voices
);
assert_eq!(
serde_json::to_value(&webrtc_start).unwrap(),
json!({
@@ -4535,11 +4659,46 @@ mod tests {
"transport": {
"type": "webrtc",
"sdp": "v=offer\r\n"
}
},
"voice": "cove"
})
);
}
#[test]
fn realtime_voice_list_is_stable() {
    // Pins the exact membership, ordering, and defaults returned by
    // `RealtimeVoicesList::builtin()` so any change to the built-in catalogue
    // has to be made deliberately in both places.
    let expected_v1 = vec![
        RealtimeVoice::Juniper,
        RealtimeVoice::Maple,
        RealtimeVoice::Spruce,
        RealtimeVoice::Ember,
        RealtimeVoice::Vale,
        RealtimeVoice::Breeze,
        RealtimeVoice::Arbor,
        RealtimeVoice::Sol,
        RealtimeVoice::Cove,
    ];
    let expected_v2 = vec![
        RealtimeVoice::Alloy,
        RealtimeVoice::Ash,
        RealtimeVoice::Ballad,
        RealtimeVoice::Coral,
        RealtimeVoice::Echo,
        RealtimeVoice::Sage,
        RealtimeVoice::Shimmer,
        RealtimeVoice::Verse,
        RealtimeVoice::Marin,
        RealtimeVoice::Cedar,
    ];
    let expected = RealtimeVoicesList {
        v1: expected_v1,
        v2: expected_v2,
        default_v1: RealtimeVoice::Cove,
        default_v2: RealtimeVoice::Marin,
    };

    assert_eq!(RealtimeVoicesList::builtin(), expected);
}
#[test]
fn user_input_serialization_omits_final_output_json_schema_when_none() -> Result<()> {
let op = Op::UserInput {