mirror of
https://github.com/openai/codex.git
synced 2026-05-02 10:26:45 +00:00
Add realtime output modality and transcript events (#17701)
- Add `outputModality` to `thread/realtime/start` and wire text/audio output selection through app-server, core, API, and TUI.
- Rename the realtime transcript delta notification and add a separate transcript done notification that forwards the final text from item done without correlating it with deltas.
This commit is contained in:
@@ -134,6 +134,8 @@ pub struct McpServerRefreshConfig {
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct ConversationStartParams {
|
||||
/// Selects whether the realtime session should produce text or audio output.
|
||||
pub output_modality: RealtimeOutputModality,
|
||||
#[serde(
|
||||
default,
|
||||
deserialize_with = "conversation_start_prompt_serde::deserialize",
|
||||
@@ -157,6 +159,13 @@ pub enum ConversationStartTransport {
|
||||
Webrtc { sdp: String },
|
||||
}
|
||||
|
||||
/// Output modality for a realtime session: text or audio.
///
/// Serialized in snake_case, i.e. as `"text"` or `"audio"`.
#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum RealtimeOutputModality {
|
||||
/// The realtime session should produce text output.
Text,
|
||||
/// The realtime session should produce audio output.
Audio,
|
||||
}
|
||||
|
||||
mod conversation_start_prompt_serde {
|
||||
use serde::Deserializer;
|
||||
use serde::Serializer;
|
||||
@@ -290,6 +299,11 @@ pub struct RealtimeTranscriptDelta {
|
||||
pub delta: String,
|
||||
}
|
||||
|
||||
/// Payload of the transcript "done" realtime events
/// (`RealtimeEvent::InputTranscriptDone` and `RealtimeEvent::OutputTranscriptDone`).
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
pub struct RealtimeTranscriptDone {
|
||||
/// Transcript text. Per the commit description this is the final text
/// forwarded from item done; it is not correlated with earlier deltas.
pub text: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
pub struct RealtimeTranscriptEntry {
|
||||
pub role: String,
|
||||
@@ -332,7 +346,9 @@ pub enum RealtimeEvent {
|
||||
},
|
||||
InputAudioSpeechStarted(RealtimeInputAudioSpeechStarted),
|
||||
InputTranscriptDelta(RealtimeTranscriptDelta),
|
||||
InputTranscriptDone(RealtimeTranscriptDone),
|
||||
OutputTranscriptDelta(RealtimeTranscriptDelta),
|
||||
OutputTranscriptDone(RealtimeTranscriptDone),
|
||||
AudioOut(RealtimeAudioFrame),
|
||||
ResponseCreated(RealtimeResponseCreated),
|
||||
ResponseCancelled(RealtimeResponseCancelled),
|
||||
@@ -4594,12 +4610,14 @@ mod tests {
|
||||
},
|
||||
});
|
||||
let start = Op::RealtimeConversationStart(ConversationStartParams {
|
||||
output_modality: RealtimeOutputModality::Audio,
|
||||
prompt: Some(Some("be helpful".to_string())),
|
||||
session_id: Some("conv_1".to_string()),
|
||||
transport: None,
|
||||
voice: None,
|
||||
});
|
||||
let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams {
|
||||
output_modality: RealtimeOutputModality::Audio,
|
||||
prompt: Some(Some("be helpful".to_string())),
|
||||
session_id: Some("conv_1".to_string()),
|
||||
transport: Some(ConversationStartTransport::Webrtc {
|
||||
@@ -4612,12 +4630,14 @@ mod tests {
|
||||
});
|
||||
let close = Op::RealtimeConversationClose;
|
||||
let default_prompt_start = Op::RealtimeConversationStart(ConversationStartParams {
|
||||
output_modality: RealtimeOutputModality::Audio,
|
||||
prompt: None,
|
||||
session_id: None,
|
||||
transport: None,
|
||||
voice: None,
|
||||
});
|
||||
let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams {
|
||||
output_modality: RealtimeOutputModality::Audio,
|
||||
prompt: Some(None),
|
||||
session_id: None,
|
||||
transport: None,
|
||||
@@ -4629,6 +4649,7 @@ mod tests {
|
||||
serde_json::to_value(&start).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_start",
|
||||
"output_modality": "audio",
|
||||
"prompt": "be helpful",
|
||||
"session_id": "conv_1"
|
||||
})
|
||||
@@ -4636,19 +4657,22 @@ mod tests {
|
||||
assert_eq!(
|
||||
serde_json::to_value(&default_prompt_start).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_start"
|
||||
"type": "realtime_conversation_start",
|
||||
"output_modality": "audio"
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
serde_json::to_value(&null_prompt_start).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_start",
|
||||
"output_modality": "audio",
|
||||
"prompt": null
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
serde_json::from_value::<Op>(json!({
|
||||
"type": "realtime_conversation_start"
|
||||
"type": "realtime_conversation_start",
|
||||
"output_modality": "audio"
|
||||
}))
|
||||
.unwrap(),
|
||||
default_prompt_start
|
||||
@@ -4656,6 +4680,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
serde_json::from_value::<Op>(json!({
|
||||
"type": "realtime_conversation_start",
|
||||
"output_modality": "audio",
|
||||
"prompt": null
|
||||
}))
|
||||
.unwrap(),
|
||||
@@ -4701,6 +4726,7 @@ mod tests {
|
||||
serde_json::to_value(&webrtc_start).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_start",
|
||||
"output_modality": "audio",
|
||||
"prompt": "be helpful",
|
||||
"session_id": "conv_1",
|
||||
"transport": {
|
||||
|
||||
Reference in New Issue
Block a user