Add realtime transcription mode for websocket sessions (#14556)

- add experimental_realtime_ws_mode (conversational/transcription) and
plumb it into realtime conversation session config
- switch realtime websocket intent and session.update payload shape
based on mode
- update config schema and realtime/config tests

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-12 23:50:30 -07:00
committed by GitHub
parent eaf81d3f6f
commit 2253a9d1d7
9 changed files with 482 additions and 63 deletions

View File

@@ -14,12 +14,19 @@ pub enum RealtimeEventParser {
RealtimeV2,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RealtimeSessionMode {
Conversational,
Transcription,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RealtimeSessionConfig {
pub instructions: String,
pub model: Option<String>,
pub session_id: Option<String>,
pub event_parser: RealtimeEventParser,
pub session_mode: RealtimeSessionMode,
}
#[derive(Debug, Clone, Serialize)]
@@ -42,7 +49,8 @@ pub(super) enum RealtimeOutboundMessage {
pub(super) struct SessionUpdateSession {
#[serde(rename = "type")]
pub(super) kind: String,
pub(super) instructions: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) instructions: Option<String>,
pub(super) audio: SessionAudio,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) tools: Option<Vec<SessionFunctionTool>>,
@@ -51,7 +59,8 @@ pub(super) struct SessionUpdateSession {
#[derive(Debug, Clone, Serialize)]
pub(super) struct SessionAudio {
pub(super) input: SessionAudioInput,
pub(super) output: SessionAudioOutput,
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) output: Option<SessionAudioOutput>,
}
#[derive(Debug, Clone, Serialize)]