mirror of
https://github.com/openai/codex.git
synced 2026-05-05 03:47:01 +00:00
## Summary - set the realtime v2 server VAD silence delay to 500ms - update the default realtime 1.5 backend prompt to the v4 text - keep the session payload and prompt rendering tests aligned with those changes ## Why - the VAD change gives the voice path a longer pause before ending the user's turn - the prompt change makes the default bundled realtime prompt match the current v4 content ## Validation - `cargo +1.93.0 test -p codex-core realtime_prompt --manifest-path /tmp/codex-realtime-v2-vad-prompt-v4/codex-rs/Cargo.toml` - `CARGO_TARGET_DIR=/tmp/codex-pr-v4-target cargo +1.93.0 test -p codex-api realtime_v2_session_update_includes_background_agent_tool_and_handoff_output_item --manifest-path /tmp/codex-realtime-v2-vad-prompt-v4/codex-rs/Cargo.toml` - `CARGO_TARGET_DIR=/tmp/codex-pr-v4-target cargo +1.93.0 test -p codex-app-server --test all 'suite::v2::realtime_conversation::realtime_webrtc_start_emits_sdp_notification' --manifest-path /tmp/codex-realtime-v2-vad-prompt-v4/codex-rs/Cargo.toml -- --exact`
223 lines
6.4 KiB
Rust
223 lines
6.4 KiB
Rust
use crate::endpoint::realtime_websocket::protocol_v1::parse_realtime_event_v1;
|
|
use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2;
|
|
pub use codex_protocol::protocol::RealtimeAudioFrame;
|
|
pub use codex_protocol::protocol::RealtimeEvent;
|
|
pub use codex_protocol::protocol::RealtimeOutputModality;
|
|
pub use codex_protocol::protocol::RealtimeTranscriptEntry;
|
|
pub use codex_protocol::protocol::RealtimeVoice;
|
|
use serde::Serialize;
|
|
use serde_json::Value;
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
pub enum RealtimeEventParser {
|
|
V1,
|
|
RealtimeV2,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
pub enum RealtimeSessionMode {
|
|
Conversational,
|
|
Transcription,
|
|
}
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub struct RealtimeSessionConfig {
|
|
pub instructions: String,
|
|
pub model: Option<String>,
|
|
pub session_id: Option<String>,
|
|
pub event_parser: RealtimeEventParser,
|
|
pub session_mode: RealtimeSessionMode,
|
|
pub output_modality: RealtimeOutputModality,
|
|
pub voice: RealtimeVoice,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
#[serde(tag = "type")]
|
|
pub(super) enum RealtimeOutboundMessage {
|
|
#[serde(rename = "input_audio_buffer.append")]
|
|
InputAudioBufferAppend { audio: String },
|
|
#[serde(rename = "conversation.handoff.append")]
|
|
ConversationHandoffAppend {
|
|
handoff_id: String,
|
|
output_text: String,
|
|
},
|
|
#[serde(rename = "response.create")]
|
|
ResponseCreate,
|
|
#[serde(rename = "session.update")]
|
|
SessionUpdate { session: SessionUpdateSession },
|
|
#[serde(rename = "conversation.item.create")]
|
|
ConversationItemCreate { item: ConversationItemPayload },
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionUpdateSession {
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) id: Option<String>,
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: SessionType,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) model: Option<String>,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) instructions: Option<String>,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) output_modalities: Option<Vec<String>>,
|
|
pub(super) audio: SessionAudio,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) tools: Option<Vec<SessionFunctionTool>>,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) tool_choice: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum SessionType {
|
|
Quicksilver,
|
|
Realtime,
|
|
Transcription,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionAudio {
|
|
pub(super) input: SessionAudioInput,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) output: Option<SessionAudioOutput>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionAudioInput {
|
|
pub(super) format: SessionAudioFormat,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) noise_reduction: Option<SessionNoiseReduction>,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) turn_detection: Option<SessionTurnDetection>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionAudioFormat {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: AudioFormatType,
|
|
pub(super) rate: u32,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
pub(super) enum AudioFormatType {
|
|
#[serde(rename = "audio/pcm")]
|
|
AudioPcm,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionAudioOutput {
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub(super) format: Option<SessionAudioOutputFormat>,
|
|
pub(super) voice: RealtimeVoice,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionNoiseReduction {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: NoiseReductionType,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum NoiseReductionType {
|
|
NearField,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionTurnDetection {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: TurnDetectionType,
|
|
pub(super) interrupt_response: bool,
|
|
pub(super) create_response: bool,
|
|
pub(super) silence_duration_ms: u32,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum TurnDetectionType {
|
|
ServerVad,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionAudioOutputFormat {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: AudioFormatType,
|
|
pub(super) rate: u32,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct ConversationMessageItem {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: ConversationItemType,
|
|
pub(super) role: ConversationRole,
|
|
pub(super) content: Vec<ConversationItemContent>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum ConversationItemType {
|
|
Message,
|
|
FunctionCallOutput,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum ConversationRole {
|
|
User,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
#[serde(untagged)]
|
|
pub(super) enum ConversationItemPayload {
|
|
Message(ConversationMessageItem),
|
|
FunctionCallOutput(ConversationFunctionCallOutputItem),
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct ConversationFunctionCallOutputItem {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: ConversationItemType,
|
|
pub(super) call_id: String,
|
|
pub(super) output: String,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct ConversationItemContent {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: ConversationContentType,
|
|
pub(super) text: String,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum ConversationContentType {
|
|
Text,
|
|
InputText,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub(super) struct SessionFunctionTool {
|
|
#[serde(rename = "type")]
|
|
pub(super) r#type: SessionToolType,
|
|
pub(super) name: String,
|
|
pub(super) description: String,
|
|
pub(super) parameters: Value,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, Serialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub(super) enum SessionToolType {
|
|
Function,
|
|
}
|
|
|
|
pub(super) fn parse_realtime_event(
|
|
payload: &str,
|
|
event_parser: RealtimeEventParser,
|
|
) -> Option<RealtimeEvent> {
|
|
match event_parser {
|
|
RealtimeEventParser::V1 => parse_realtime_event_v1(payload),
|
|
RealtimeEventParser::RealtimeV2 => parse_realtime_event_v2(payload),
|
|
}
|
|
}
|