Add realtime transcription mode for websocket sessions (#14556)

- add experimental_realtime_ws_mode (conversational/transcription) and
plumb it into realtime conversation session config
- switch realtime websocket intent and session.update payload shape
based on mode
- update config schema and realtime/config tests

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-12 23:50:30 -07:00
committed by GitHub
parent eaf81d3f6f
commit 2253a9d1d7
9 changed files with 482 additions and 63 deletions

View File

@@ -463,6 +463,9 @@ pub struct Config {
/// Experimental / do not use. Selects the realtime websocket model/snapshot
/// used for the `Op::RealtimeConversation` connection.
pub experimental_realtime_ws_model: Option<String>,
/// Experimental / do not use. Selects the realtime websocket intent mode.
/// `conversational` is speech-to-speech while `transcription` is transcript-only.
pub experimental_realtime_ws_mode: RealtimeWsMode,
/// Experimental / do not use. Overrides only the realtime conversation
/// websocket transport instructions (the `Op::RealtimeConversation`
/// `/ws` session.update instructions) without changing normal prompts.
@@ -1238,6 +1241,9 @@ pub struct ConfigToml {
/// Experimental / do not use. Selects the realtime websocket model/snapshot
/// used for the `Op::RealtimeConversation` connection.
pub experimental_realtime_ws_model: Option<String>,
/// Experimental / do not use. Selects the realtime websocket intent mode.
/// `conversational` is speech-to-speech while `transcription` is transcript-only.
pub experimental_realtime_ws_mode: Option<RealtimeWsMode>,
/// Experimental / do not use. Overrides only the realtime conversation
/// websocket transport instructions (the `Op::RealtimeConversation`
/// `/ws` session.update instructions) without changing normal prompts.
@@ -1383,6 +1389,14 @@ pub struct RealtimeAudioConfig {
pub speaker: Option<String>,
}
// Intent mode for the experimental realtime websocket connection
// (the value stored in `experimental_realtime_ws_mode`).
//
// `Conversational` is speech-to-speech, `Transcription` is transcript-only.
// Because of `rename_all = "snake_case"`, the variants are written in config
// and serialized as `conversational` and `transcription`.
//
// NOTE(review): these are plain `//` comments rather than `///` on purpose —
// the `JsonSchema` derive may copy doc comments into the generated config
// schema as descriptions; confirm and regenerate the schema before converting.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeWsMode {
// Speech-to-speech mode. Marked `#[default]`, so this is the value produced
// by `Default::default()` (and therefore by `Option::unwrap_or_default()`
// when the TOML config leaves the mode unset).
#[default]
Conversational,
// Transcript-only mode.
Transcription,
}
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)]
#[schemars(deny_unknown_fields)]
pub struct RealtimeAudioToml {
@@ -2462,6 +2476,7 @@ impl Config {
}),
experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url,
experimental_realtime_ws_model: cfg.experimental_realtime_ws_model,
experimental_realtime_ws_mode: cfg.experimental_realtime_ws_mode.unwrap_or_default(),
experimental_realtime_ws_backend_prompt: cfg.experimental_realtime_ws_backend_prompt,
experimental_realtime_ws_startup_context: cfg.experimental_realtime_ws_startup_context,
experimental_realtime_start_instructions: cfg.experimental_realtime_start_instructions,