Add realtime transcription mode for websocket sessions (#14556)

- add experimental_realtime_ws_mode (conversational/transcription) and plumb it into realtime conversation session config
- switch realtime websocket intent and session.update payload shape based on mode
- update config schema and realtime/config tests

---------

Co-authored-by: Codex <noreply@openai.com>
2026-04-29 00:55:38 +00:00 · 2026-03-12 23:50:30 -07:00
parent eaf81d3f6f
commit 2253a9d1d7
9 changed files with 482 additions and 63 deletions
--- a/codex-rs/core/src/config/mod.rs
+++ b/codex-rs/core/src/config/mod.rs
@@ -463,6 +463,9 @@ pub struct Config {
    /// Experimental / do not use. Selects the realtime websocket model/snapshot
    /// used for the `Op::RealtimeConversation` connection.
    pub experimental_realtime_ws_model: Option<String>,
+    /// Experimental / do not use. Selects the realtime websocket intent mode.
+    /// `conversational` is speech-to-speech while `transcription` is transcript-only.
+    pub experimental_realtime_ws_mode: RealtimeWsMode,
    /// Experimental / do not use. Overrides only the realtime conversation
    /// websocket transport instructions (the `Op::RealtimeConversation`
    /// `/ws` session.update instructions) without changing normal prompts.
@@ -1238,6 +1241,9 @@ pub struct ConfigToml {
    /// Experimental / do not use. Selects the realtime websocket model/snapshot
    /// used for the `Op::RealtimeConversation` connection.
    pub experimental_realtime_ws_model: Option<String>,
+    /// Experimental / do not use. Selects the realtime websocket intent mode.
+    /// `conversational` is speech-to-speech while `transcription` is transcript-only.
+    pub experimental_realtime_ws_mode: Option<RealtimeWsMode>,
    /// Experimental / do not use. Overrides only the realtime conversation
    /// websocket transport instructions (the `Op::RealtimeConversation`
    /// `/ws` session.update instructions) without changing normal prompts.
@@ -1383,6 +1389,14 @@ pub struct RealtimeAudioConfig {
    pub speaker: Option<String>,
 }

+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq, JsonSchema)]
+#[serde(rename_all = "snake_case")]
+pub enum RealtimeWsMode {
+    #[default]
+    Conversational,
+    Transcription,
+}
+
 #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)]
 #[schemars(deny_unknown_fields)]
 pub struct RealtimeAudioToml {
@@ -2462,6 +2476,7 @@ impl Config {
                }),
            experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url,
            experimental_realtime_ws_model: cfg.experimental_realtime_ws_model,
+            experimental_realtime_ws_mode: cfg.experimental_realtime_ws_mode.unwrap_or_default(),
            experimental_realtime_ws_backend_prompt: cfg.experimental_realtime_ws_backend_prompt,
            experimental_realtime_ws_startup_context: cfg.experimental_realtime_ws_startup_context,
            experimental_realtime_start_instructions: cfg.experimental_realtime_start_instructions,