[stack 2/4] Align main realtime v2 wire and runtime flow (#14830)

## Stack Position
2/4. Built on top of #14828.

## Base
- #14828

## Unblocks
- #14829
- #14827

## Scope
- Port the realtime v2 wire parsing, session, app-server, and
conversation runtime behavior onto the split websocket-method base.
- Branch runtime behavior directly on the current realtime session kind
instead of parser-derived flow flags.
- Keep regression coverage in the existing e2e suites.

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-16 21:38:07 -07:00
committed by GitHub
parent 1d85fe79ed
commit fbd7f9b986
28 changed files with 807 additions and 140 deletions

View File

@@ -39,6 +39,8 @@ pub(super) enum RealtimeOutboundMessage {
handoff_id: String,
output_text: String,
},
#[serde(rename = "response.create")]
ResponseCreate,
#[serde(rename = "session.update")]
SessionUpdate { session: SessionUpdateSession },
#[serde(rename = "conversation.item.create")]
@@ -48,12 +50,24 @@ pub(super) enum RealtimeOutboundMessage {
/// Session payload carried by the `session.update` outbound message.
///
/// Every `Option` field is left out of the serialized output when it is
/// `None`; `audio` is always serialized.
// NOTE(review): `kind: String` and `r#type: SessionType` both appear below
// under a single `type` rename. This looks like a removed and an added diff
// line shown together — confirm which field is current.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionUpdateSession {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) r#type: SessionType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) instructions: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) output_modalities: Option<Vec<String>>,
pub(super) audio: SessionAudio,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) tools: Option<Vec<SessionFunctionTool>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) tool_choice: Option<String>,
}
/// Discriminator used for the `type` field of [`SessionUpdateSession`].
///
/// Variants serialize in snake_case: `quicksilver`, `realtime`,
/// `transcription`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum SessionType {
Quicksilver,
Realtime,
Transcription,
}
#[derive(Debug, Clone, Serialize)]
@@ -66,17 +80,29 @@ pub(super) struct SessionAudio {
/// Input-audio settings for a session.
///
/// `format` is always serialized; `noise_reduction` and `turn_detection` are
/// omitted from the output when they are `None`.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionAudioInput {
pub(super) format: SessionAudioFormat,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) noise_reduction: Option<SessionNoiseReduction>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) turn_detection: Option<SessionTurnDetection>,
}
/// Audio format descriptor used by [`SessionAudioInput`]: a format tag plus a
/// `rate` (presumably the sample rate in Hz — TODO confirm).
// NOTE(review): `kind: String` and `r#type: AudioFormatType` both appear below
// under a single `type` rename. This looks like a removed and an added diff
// line shown together — confirm which field is current.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionAudioFormat {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) r#type: AudioFormatType,
pub(super) rate: u32,
}
/// Audio format tag shared by [`SessionAudioFormat`] and
/// [`SessionAudioOutputFormat`].
///
/// The only variant serializes as the literal string `audio/pcm`.
#[derive(Debug, Clone, Copy, Serialize)]
pub(super) enum AudioFormatType {
#[serde(rename = "audio/pcm")]
AudioPcm,
}
/// Output-audio settings for a session.
///
/// `voice` is always serialized; `format` is omitted from the output when it
/// is `None`.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionAudioOutput {
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) format: Option<SessionAudioOutputFormat>,
pub(super) voice: SessionAudioVoice,
}
@@ -84,18 +110,64 @@ pub(super) struct SessionAudioOutput {
/// Voice selection for [`SessionAudioOutput`].
///
/// Each variant carries an explicit serde rename to its lowercase name
/// (`fathom`, `alloy`, `marin`).
// NOTE(review): the derive attributes for this enum are outside the visible
// hunk — presumably it derives `Serialize` like its siblings; confirm.
pub(super) enum SessionAudioVoice {
#[serde(rename = "fathom")]
Fathom,
#[serde(rename = "alloy")]
Alloy,
#[serde(rename = "marin")]
Marin,
}
/// Noise-reduction setting for [`SessionAudioInput`]; serializes as an object
/// with a single `type` key.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionNoiseReduction {
#[serde(rename = "type")]
pub(super) r#type: NoiseReductionType,
}
/// Noise-reduction mode tag used by [`SessionNoiseReduction`].
///
/// The only variant serializes in snake_case as `near_field`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum NoiseReductionType {
NearField,
}
/// Turn-detection settings for [`SessionAudioInput`].
///
/// Serializes a `type` tag plus two boolean flags. The flags presumably
/// control whether a detected turn interrupts an in-flight response and
/// whether it triggers a new one — TODO confirm against the wire protocol.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionTurnDetection {
#[serde(rename = "type")]
pub(super) r#type: TurnDetectionType,
pub(super) interrupt_response: bool,
pub(super) create_response: bool,
}
/// Turn-detection mode tag used by [`SessionTurnDetection`].
///
/// The only variant serializes in snake_case as `server_vad`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum TurnDetectionType {
ServerVad,
}
/// Audio format descriptor used by [`SessionAudioOutput`]: a format tag plus
/// a `rate` (presumably the sample rate in Hz — TODO confirm).
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionAudioOutputFormat {
#[serde(rename = "type")]
pub(super) r#type: AudioFormatType,
pub(super) rate: u32,
}
/// A conversation item carrying a role and a list of content parts.
// NOTE(review): this listing shows `kind: String` next to
// `r#type: ConversationItemType`, and `role: String` next to
// `role: ConversationRole`. These look like removed and added diff lines
// shown together (two `role` fields cannot coexist) — confirm which lines are
// current.
#[derive(Debug, Clone, Serialize)]
pub(super) struct ConversationMessageItem {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) role: String,
pub(super) r#type: ConversationItemType,
pub(super) role: ConversationRole,
pub(super) content: Vec<ConversationItemContent>,
}
/// Item-kind tag used by [`ConversationMessageItem`] and
/// [`ConversationFunctionCallOutputItem`].
///
/// Variants serialize in snake_case: `message`, `function_call_output`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum ConversationItemType {
Message,
FunctionCallOutput,
}
/// Role tag for [`ConversationMessageItem`].
///
/// The only variant serializes in snake_case as `user`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum ConversationRole {
User,
}
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
pub(super) enum ConversationItemPayload {
@@ -106,7 +178,7 @@ pub(super) enum ConversationItemPayload {
/// A conversation item that carries the output of a function call, keyed by
/// `call_id`.
// NOTE(review): `kind: String` and `r#type: ConversationItemType` both appear
// below under a single `type` rename. This looks like a removed and an added
// diff line shown together — confirm which field is current.
#[derive(Debug, Clone, Serialize)]
pub(super) struct ConversationFunctionCallOutputItem {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) r#type: ConversationItemType,
pub(super) call_id: String,
pub(super) output: String,
}
@@ -114,19 +186,32 @@ pub(super) struct ConversationFunctionCallOutputItem {
/// One content part of a [`ConversationMessageItem`]: a content-type tag plus
/// its text.
// NOTE(review): `kind: String` and `r#type: ConversationContentType` both
// appear below under a single `type` rename. This looks like a removed and an
// added diff line shown together — confirm which field is current.
#[derive(Debug, Clone, Serialize)]
pub(super) struct ConversationItemContent {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) r#type: ConversationContentType,
pub(super) text: String,
}
/// Content-type tag used by [`ConversationItemContent`].
///
/// Variants serialize in snake_case: `text`, `input_text`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum ConversationContentType {
Text,
InputText,
}
/// A tool entry for the `tools` list of [`SessionUpdateSession`]: a tool-type
/// tag, a name, a description and a `parameters` value.
///
/// `parameters` is presumably a `serde_json::Value` holding the parameter
/// schema — TODO confirm against the file's imports.
// NOTE(review): `kind: String` and `r#type: SessionToolType` both appear below
// under a single `type` rename. This looks like a removed and an added diff
// line shown together — confirm which field is current.
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionFunctionTool {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) r#type: SessionToolType,
pub(super) name: String,
pub(super) description: String,
pub(super) parameters: Value,
}
/// Tool-type tag used by [`SessionFunctionTool`].
///
/// The only variant serializes in snake_case as `function`.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "snake_case")]
pub(super) enum SessionToolType {
Function,
}
pub(super) fn parse_realtime_event(
payload: &str,
event_parser: RealtimeEventParser,