[stack 2/4] Align main realtime v2 wire and runtime flow (#14830)

## Stack Position
2/4. Built on top of #14828.

## Base
- #14828

## Unblocks
- #14829
- #14827

## Scope
- Port the realtime v2 wire parsing, session, app-server, and
conversation runtime behavior onto the split websocket-method base.
- Branch runtime behavior directly on the current realtime session kind
instead of on parser-derived flow flags.
- Keep regression coverage in the existing e2e suites.

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-16 21:38:07 -07:00
committed by GitHub
parent 1d85fe79ed
commit fbd7f9b986
28 changed files with 807 additions and 140 deletions

View File

@@ -1,31 +1,42 @@
use crate::endpoint::realtime_websocket::methods_common::REALTIME_AUDIO_FORMAT;
use crate::endpoint::realtime_websocket::methods_common::REALTIME_AUDIO_SAMPLE_RATE;
use crate::endpoint::realtime_websocket::protocol::AudioFormatType;
use crate::endpoint::realtime_websocket::protocol::ConversationContentType;
use crate::endpoint::realtime_websocket::protocol::ConversationFunctionCallOutputItem;
use crate::endpoint::realtime_websocket::protocol::ConversationItemContent;
use crate::endpoint::realtime_websocket::protocol::ConversationItemPayload;
use crate::endpoint::realtime_websocket::protocol::ConversationItemType;
use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem;
use crate::endpoint::realtime_websocket::protocol::ConversationRole;
use crate::endpoint::realtime_websocket::protocol::NoiseReductionType;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputFormat;
use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice;
use crate::endpoint::realtime_websocket::protocol::SessionFunctionTool;
use crate::endpoint::realtime_websocket::protocol::SessionNoiseReduction;
use crate::endpoint::realtime_websocket::protocol::SessionToolType;
use crate::endpoint::realtime_websocket::protocol::SessionTurnDetection;
use crate::endpoint::realtime_websocket::protocol::SessionType;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
use crate::endpoint::realtime_websocket::protocol::TurnDetectionType;
use serde_json::json;
const REALTIME_V2_SESSION_TYPE: &str = "realtime";
const REALTIME_V2_OUTPUT_MODALITY_AUDIO: &str = "audio";
const REALTIME_V2_TOOL_CHOICE: &str = "auto";
const REALTIME_V2_CODEX_TOOL_NAME: &str = "codex";
const REALTIME_V2_CODEX_TOOL_DESCRIPTION: &str = "Delegate work to Codex and return the result.";
const REALTIME_V2_CODEX_TOOL_DESCRIPTION: &str = "Delegate a request to Codex and return the final result to the user. Use this as the default action. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later.";
pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutboundMessage {
RealtimeOutboundMessage::ConversationItemCreate {
item: ConversationItemPayload::Message(ConversationMessageItem {
kind: "message".to_string(),
role: "user".to_string(),
r#type: ConversationItemType::Message,
role: ConversationRole::User,
content: vec![ConversationItemContent {
kind: "input_text".to_string(),
r#type: ConversationContentType::InputText,
text,
}],
}),
@@ -38,7 +49,7 @@ pub(super) fn conversation_handoff_append_message(
) -> RealtimeOutboundMessage {
RealtimeOutboundMessage::ConversationItemCreate {
item: ConversationItemPayload::FunctionCallOutput(ConversationFunctionCallOutputItem {
kind: "function_call_output".to_string(),
r#type: ConversationItemType::FunctionCallOutput,
call_id: handoff_id,
output: output_text,
}),
@@ -51,21 +62,34 @@ pub(super) fn session_update_session(
) -> SessionUpdateSession {
match session_mode {
RealtimeSessionMode::Conversational => SessionUpdateSession {
kind: REALTIME_V2_SESSION_TYPE.to_string(),
r#type: SessionType::Realtime,
instructions: Some(instructions),
output_modalities: Some(vec![REALTIME_V2_OUTPUT_MODALITY_AUDIO.to_string()]),
audio: SessionAudio {
input: SessionAudioInput {
format: SessionAudioFormat {
kind: REALTIME_AUDIO_FORMAT.to_string(),
r#type: AudioFormatType::AudioPcm,
rate: REALTIME_AUDIO_SAMPLE_RATE,
},
noise_reduction: Some(SessionNoiseReduction {
r#type: NoiseReductionType::NearField,
}),
turn_detection: Some(SessionTurnDetection {
r#type: TurnDetectionType::ServerVad,
interrupt_response: true,
create_response: true,
}),
},
output: Some(SessionAudioOutput {
voice: SessionAudioVoice::Alloy,
format: Some(SessionAudioOutputFormat {
r#type: AudioFormatType::AudioPcm,
rate: REALTIME_AUDIO_SAMPLE_RATE,
}),
voice: SessionAudioVoice::Marin,
}),
},
tools: Some(vec![SessionFunctionTool {
kind: "function".to_string(),
r#type: SessionToolType::Function,
name: REALTIME_V2_CODEX_TOOL_NAME.to_string(),
description: REALTIME_V2_CODEX_TOOL_DESCRIPTION.to_string(),
parameters: json!({
@@ -73,27 +97,32 @@ pub(super) fn session_update_session(
"properties": {
"prompt": {
"type": "string",
"description": "Prompt text for the delegated Codex task."
"description": "The user request to delegate to Codex."
}
},
"required": ["prompt"],
"additionalProperties": false
}),
}]),
tool_choice: Some(REALTIME_V2_TOOL_CHOICE.to_string()),
},
RealtimeSessionMode::Transcription => SessionUpdateSession {
kind: "transcription".to_string(),
r#type: SessionType::Transcription,
instructions: None,
output_modalities: None,
audio: SessionAudio {
input: SessionAudioInput {
format: SessionAudioFormat {
kind: REALTIME_AUDIO_FORMAT.to_string(),
r#type: AudioFormatType::AudioPcm,
rate: REALTIME_AUDIO_SAMPLE_RATE,
},
noise_reduction: None,
turn_detection: None,
},
output: None,
},
tools: None,
tool_choice: None,
},
}
}