Add realtime output modality and transcript events (#17701)

- Add outputModality to thread/realtime/start and wire text/audio output
  selection through app-server, core, API, and TUI.
- Rename the realtime transcript delta notification and add a separate
  transcript done notification that forwards final text from item done
  without correlating it with deltas.
This commit is contained in:
Ahmed Ibrahim
2026-04-14 00:13:13 -07:00
committed by GitHub
parent a6b03a22cc
commit 2f6fc7c137
38 changed files with 711 additions and 77 deletions

View File

@@ -134,6 +134,8 @@ pub struct McpServerRefreshConfig {
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct ConversationStartParams {
/// Selects whether the realtime session should produce text or audio output.
pub output_modality: RealtimeOutputModality,
#[serde(
default,
deserialize_with = "conversation_start_prompt_serde::deserialize",
@@ -157,6 +159,13 @@ pub enum ConversationStartTransport {
Webrtc { sdp: String },
}
/// Output modality of a realtime session: text or audio.
///
/// Serialized with snake_case variant names, so the wire values are
/// `"text"` and `"audio"`.
#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeOutputModality {
/// The realtime session should produce text output.
Text,
/// The realtime session should produce audio output.
Audio,
}
mod conversation_start_prompt_serde {
use serde::Deserializer;
use serde::Serializer;
@@ -290,6 +299,11 @@ pub struct RealtimeTranscriptDelta {
pub delta: String,
}
/// Payload of a completed realtime transcript.
///
/// Carried by the `InputTranscriptDone` and `OutputTranscriptDone` variants
/// of `RealtimeEvent`.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
pub struct RealtimeTranscriptDone {
/// Final transcript text.
// NOTE(review): per the change description this is forwarded from the
// item-done event without being correlated with earlier deltas — confirm
// against the code that constructs this value.
pub text: String,
}
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
pub struct RealtimeTranscriptEntry {
pub role: String,
@@ -332,7 +346,9 @@ pub enum RealtimeEvent {
},
InputAudioSpeechStarted(RealtimeInputAudioSpeechStarted),
InputTranscriptDelta(RealtimeTranscriptDelta),
InputTranscriptDone(RealtimeTranscriptDone),
OutputTranscriptDelta(RealtimeTranscriptDelta),
OutputTranscriptDone(RealtimeTranscriptDone),
AudioOut(RealtimeAudioFrame),
ResponseCreated(RealtimeResponseCreated),
ResponseCancelled(RealtimeResponseCancelled),
@@ -4594,12 +4610,14 @@ mod tests {
},
});
let start = Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("be helpful".to_string())),
session_id: Some("conv_1".to_string()),
transport: None,
voice: None,
});
let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("be helpful".to_string())),
session_id: Some("conv_1".to_string()),
transport: Some(ConversationStartTransport::Webrtc {
@@ -4612,12 +4630,14 @@ mod tests {
});
let close = Op::RealtimeConversationClose;
let default_prompt_start = Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: None,
session_id: None,
transport: None,
voice: None,
});
let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(None),
session_id: None,
transport: None,
@@ -4629,6 +4649,7 @@ mod tests {
serde_json::to_value(&start).unwrap(),
json!({
"type": "realtime_conversation_start",
"output_modality": "audio",
"prompt": "be helpful",
"session_id": "conv_1"
})
@@ -4636,19 +4657,22 @@ mod tests {
assert_eq!(
serde_json::to_value(&default_prompt_start).unwrap(),
json!({
"type": "realtime_conversation_start"
"type": "realtime_conversation_start",
"output_modality": "audio"
})
);
assert_eq!(
serde_json::to_value(&null_prompt_start).unwrap(),
json!({
"type": "realtime_conversation_start",
"output_modality": "audio",
"prompt": null
})
);
assert_eq!(
serde_json::from_value::<Op>(json!({
"type": "realtime_conversation_start"
"type": "realtime_conversation_start",
"output_modality": "audio"
}))
.unwrap(),
default_prompt_start
@@ -4656,6 +4680,7 @@ mod tests {
assert_eq!(
serde_json::from_value::<Op>(json!({
"type": "realtime_conversation_start",
"output_modality": "audio",
"prompt": null
}))
.unwrap(),
@@ -4701,6 +4726,7 @@ mod tests {
serde_json::to_value(&webrtc_start).unwrap(),
json!({
"type": "realtime_conversation_start",
"output_modality": "audio",
"prompt": "be helpful",
"session_id": "conv_1",
"transport": {