Add realtime v2 event parser behind feature flag (#14537)

- Add a feature-flagged realtime v2 parser on the existing
websocket/session pipeline.
- Wire parser selection from core feature flags and map the codex
handoff tool-call path into existing handoff events.

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-12 21:12:40 -07:00
committed by GitHub
parent 650beb177e
commit 3e8f47169e
9 changed files with 352 additions and 10 deletions

View File

@@ -2,6 +2,7 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItem;
use crate::endpoint::realtime_websocket::protocol::ConversationItemContent;
use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame;
use crate::endpoint::realtime_websocket::protocol::RealtimeEvent;
use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta;
@@ -202,6 +203,7 @@ pub struct RealtimeWebsocketWriter {
/// Event-reading side of a realtime websocket connection.
///
/// Holds the inbound message channel together with the state needed to turn
/// raw websocket text frames into `RealtimeEvent`s.
pub struct RealtimeWebsocketEvents {
/// Inbound websocket messages (or transport errors), shared behind an async-friendly lock.
rx_message: Arc<Mutex<mpsc::UnboundedReceiver<Result<Message, WsError>>>>,
/// Transcript state that is updated as parsed events flow through.
active_transcript: Arc<Mutex<ActiveTranscriptState>>,
/// Which wire-format parser is applied to incoming text frames.
event_parser: RealtimeEventParser,
/// Flag shared with the rest of the connection.
/// NOTE(review): presumably set once the socket is closed — confirm against the writer side.
is_closed: Arc<AtomicBool>,
}
@@ -248,6 +250,7 @@ impl RealtimeWebsocketConnection {
fn new(
stream: WsStream,
rx_message: mpsc::UnboundedReceiver<Result<Message, WsError>>,
event_parser: RealtimeEventParser,
) -> Self {
let stream = Arc::new(stream);
let is_closed = Arc::new(AtomicBool::new(false));
@@ -259,6 +262,7 @@ impl RealtimeWebsocketConnection {
events: RealtimeWebsocketEvents {
rx_message: Arc::new(Mutex::new(rx_message)),
active_transcript: Arc::new(Mutex::new(ActiveTranscriptState::default())),
event_parser,
is_closed,
},
}
@@ -376,7 +380,7 @@ impl RealtimeWebsocketEvents {
match msg {
Message::Text(text) => {
if let Some(mut event) = parse_realtime_event(&text) {
if let Some(mut event) = parse_realtime_event(&text, self.event_parser) {
self.update_active_transcript(&mut event).await;
debug!(?event, "realtime websocket parsed event");
return Ok(Some(event));
@@ -495,7 +499,7 @@ impl RealtimeWebsocketClient {
);
let (stream, rx_message) = WsStream::new(stream);
let connection = RealtimeWebsocketConnection::new(stream, rx_message);
let connection = RealtimeWebsocketConnection::new(stream, rx_message, config.event_parser);
debug!(
session_id = config.session_id.as_deref().unwrap_or("<none>"),
"realtime websocket sending session.update"
@@ -636,7 +640,7 @@ mod tests {
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::SessionUpdated {
session_id: "sess_123".to_string(),
instructions: Some("backend prompt".to_string()),
@@ -655,7 +659,7 @@ mod tests {
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
data: "AAA=".to_string(),
sample_rate: 48000,
@@ -673,7 +677,7 @@ mod tests {
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::ConversationItemAdded(
json!({"type": "message", "seq": 7})
))
@@ -688,7 +692,7 @@ mod tests {
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::ConversationItemDone {
item_id: "item_123".to_string(),
})
@@ -706,7 +710,7 @@ mod tests {
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
handoff_id: "handoff_123".to_string(),
item_id: "item_123".to_string(),
@@ -725,7 +729,7 @@ mod tests {
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::InputTranscriptDelta(
RealtimeTranscriptDelta {
delta: "hello ".to_string(),
@@ -743,7 +747,7 @@ mod tests {
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
parse_realtime_event(payload.as_str(), RealtimeEventParser::V1),
Some(RealtimeEvent::OutputTranscriptDelta(
RealtimeTranscriptDelta {
delta: "hi".to_string(),
@@ -752,6 +756,68 @@ mod tests {
);
}
#[test]
fn parse_realtime_v2_handoff_tool_call_event() {
    // A finished `codex` function-call item must surface as a handoff request whose
    // transcript is taken from the `prompt` argument.
    let raw_event = json!({
        "type": "conversation.item.done",
        "item": {
            "id": "item_123",
            "type": "function_call",
            "name": "codex",
            "call_id": "call_123",
            "arguments": "{\"prompt\":\"delegate this\"}"
        }
    })
    .to_string();

    let expected = RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
        handoff_id: "call_123".to_string(),
        item_id: "item_123".to_string(),
        input_transcript: "delegate this".to_string(),
        active_transcript: Vec::new(),
    });
    let actual = parse_realtime_event(raw_event.as_str(), RealtimeEventParser::RealtimeV2);

    assert_eq!(actual, Some(expected));
}
#[test]
fn parse_realtime_v2_input_audio_transcription_delta_event() {
    // The v2 input-transcription delta carries its text in the top-level `delta` field.
    let raw_event = json!({
        "type": "conversation.item.input_audio_transcription.delta",
        "delta": "hello"
    })
    .to_string();

    let expected = RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta {
        delta: "hello".to_string(),
    });
    let actual = parse_realtime_event(raw_event.as_str(), RealtimeEventParser::RealtimeV2);

    assert_eq!(actual, Some(expected));
}
#[test]
fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() {
    // With no shape metadata in the event, the parser must fall back to
    // 24 kHz mono and leave `samples_per_channel` unset.
    let raw_event = json!({
        "type": "response.output_audio.delta",
        "delta": "AQID"
    })
    .to_string();

    let expected = RealtimeEvent::AudioOut(RealtimeAudioFrame {
        data: "AQID".to_string(),
        sample_rate: 24_000,
        num_channels: 1,
        samples_per_channel: None,
    });
    let actual = parse_realtime_event(raw_event.as_str(), RealtimeEventParser::RealtimeV2);

    assert_eq!(actual, Some(expected));
}
#[test]
fn merge_request_headers_matches_http_precedence() {
let mut provider_headers = HeaderMap::new();
@@ -1008,6 +1074,7 @@ mod tests {
instructions: "backend prompt".to_string(),
model: Some("realtime-test-model".to_string()),
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
},
HeaderMap::new(),
HeaderMap::new(),
@@ -1190,6 +1257,7 @@ mod tests {
instructions: "backend prompt".to_string(),
model: Some("realtime-test-model".to_string()),
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
},
HeaderMap::new(),
HeaderMap::new(),

View File

@@ -1,5 +1,6 @@
pub mod methods;
pub mod protocol;
mod protocol_v2;
pub use codex_protocol::protocol::RealtimeAudioFrame;
pub use codex_protocol::protocol::RealtimeEvent;
@@ -7,4 +8,5 @@ pub use methods::RealtimeWebsocketClient;
pub use methods::RealtimeWebsocketConnection;
pub use methods::RealtimeWebsocketEvents;
pub use methods::RealtimeWebsocketWriter;
pub use protocol::RealtimeEventParser;
pub use protocol::RealtimeSessionConfig;

View File

@@ -1,3 +1,4 @@
use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2;
pub use codex_protocol::protocol::RealtimeAudioFrame;
pub use codex_protocol::protocol::RealtimeEvent;
pub use codex_protocol::protocol::RealtimeHandoffRequested;
@@ -7,11 +8,18 @@ use serde::Serialize;
use serde_json::Value;
use tracing::debug;
/// Selects which wire-format parser `parse_realtime_event` applies to an
/// incoming realtime websocket payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RealtimeEventParser {
/// The pre-existing event format, handled by `parse_realtime_event_v1`.
V1,
/// The realtime v2 event format, handled by `parse_realtime_event_v2`.
RealtimeV2,
}
/// Configuration supplied when opening a realtime websocket session.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RealtimeSessionConfig {
/// Instructions for the session.
/// NOTE(review): presumably sent to the backend in `session.update` — confirm at the call site.
pub instructions: String,
/// Optional model name for the session.
pub model: Option<String>,
/// Optional session identifier; `None` when the caller has none to provide.
pub session_id: Option<String>,
/// Parser used to decode events received on this session's websocket.
pub event_parser: RealtimeEventParser,
}
#[derive(Debug, Clone, Serialize)]
@@ -76,7 +84,17 @@ pub(super) struct ConversationItemContent {
pub(super) text: String,
}
pub(super) fn parse_realtime_event(payload: &str) -> Option<RealtimeEvent> {
pub(super) fn parse_realtime_event(
payload: &str,
event_parser: RealtimeEventParser,
) -> Option<RealtimeEvent> {
match event_parser {
RealtimeEventParser::V1 => parse_realtime_event_v1(payload),
RealtimeEventParser::RealtimeV2 => parse_realtime_event_v2(payload),
}
}
fn parse_realtime_event_v1(payload: &str) -> Option<RealtimeEvent> {
let parsed: Value = match serde_json::from_str(payload) {
Ok(msg) => msg,
Err(err) => {

View File

@@ -0,0 +1,157 @@
use codex_protocol::protocol::RealtimeAudioFrame;
use codex_protocol::protocol::RealtimeEvent;
use codex_protocol::protocol::RealtimeHandoffRequested;
use codex_protocol::protocol::RealtimeTranscriptDelta;
use serde_json::Value;
use tracing::debug;
pub(super) fn parse_realtime_event_v2(payload: &str) -> Option<RealtimeEvent> {
let parsed: Value = match serde_json::from_str(payload) {
Ok(msg) => msg,
Err(err) => {
debug!("failed to parse realtime v2 event: {err}, data: {payload}");
return None;
}
};
let message_type = match parsed.get("type").and_then(Value::as_str) {
Some(message_type) => message_type,
None => {
debug!("received realtime v2 event without type field: {payload}");
return None;
}
};
match message_type {
"session.updated" => {
let session_id = parsed
.get("session")
.and_then(Value::as_object)
.and_then(|session| session.get("id"))
.and_then(Value::as_str)
.map(str::to_string);
let instructions = parsed
.get("session")
.and_then(Value::as_object)
.and_then(|session| session.get("instructions"))
.and_then(Value::as_str)
.map(str::to_string);
session_id.map(|session_id| RealtimeEvent::SessionUpdated {
session_id,
instructions,
})
}
"response.output_audio.delta" => {
let data = parsed
.get("delta")
.and_then(Value::as_str)
.map(str::to_string)?;
let sample_rate = parsed
.get("sample_rate")
.and_then(Value::as_u64)
.and_then(|value| u32::try_from(value).ok())
.unwrap_or(24_000);
let num_channels = parsed
.get("channels")
.or_else(|| parsed.get("num_channels"))
.and_then(Value::as_u64)
.and_then(|value| u16::try_from(value).ok())
.unwrap_or(1);
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
data,
sample_rate,
num_channels,
samples_per_channel: parsed
.get("samples_per_channel")
.and_then(Value::as_u64)
.and_then(|value| u32::try_from(value).ok()),
}))
}
"conversation.item.input_audio_transcription.delta" => parsed
.get("delta")
.and_then(Value::as_str)
.map(str::to_string)
.map(|delta| RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta })),
"conversation.item.input_audio_transcription.completed" => parsed
.get("transcript")
.and_then(Value::as_str)
.map(str::to_string)
.map(|delta| RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta })),
"response.output_text.delta" | "response.output_audio_transcript.delta" => parsed
.get("delta")
.and_then(Value::as_str)
.map(str::to_string)
.map(|delta| RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta })),
"conversation.item.added" => parsed
.get("item")
.cloned()
.map(RealtimeEvent::ConversationItemAdded),
"conversation.item.done" => {
let item = parsed.get("item")?.as_object()?;
let item_type = item.get("type").and_then(Value::as_str);
let item_name = item.get("name").and_then(Value::as_str);
if item_type == Some("function_call") && item_name == Some("codex") {
let call_id = item
.get("call_id")
.and_then(Value::as_str)
.or_else(|| item.get("id").and_then(Value::as_str))?;
let item_id = item
.get("id")
.and_then(Value::as_str)
.unwrap_or(call_id)
.to_string();
let arguments = item.get("arguments").and_then(Value::as_str).unwrap_or("");
let mut input_transcript = String::new();
if !arguments.is_empty() {
if let Ok(arguments_json) = serde_json::from_str::<Value>(arguments)
&& let Some(arguments_object) = arguments_json.as_object()
{
for key in ["input_transcript", "input", "text", "prompt", "query"] {
if let Some(value) = arguments_object.get(key).and_then(Value::as_str) {
let trimmed = value.trim();
if !trimmed.is_empty() {
input_transcript = trimmed.to_string();
break;
}
}
}
}
if input_transcript.is_empty() {
input_transcript = arguments.to_string();
}
}
return Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
handoff_id: call_id.to_string(),
item_id,
input_transcript,
active_transcript: Vec::new(),
}));
}
item.get("id")
.and_then(Value::as_str)
.map(str::to_string)
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id })
}
"error" => parsed
.get("message")
.and_then(Value::as_str)
.map(str::to_string)
.or_else(|| {
parsed
.get("error")
.and_then(Value::as_object)
.and_then(|error| error.get("message"))
.and_then(Value::as_str)
.map(str::to_string)
})
.or_else(|| parsed.get("error").map(ToString::to_string))
.map(RealtimeEvent::Error),
_ => {
debug!("received unsupported realtime v2 event type: {message_type}, data: {payload}");
None
}
}
}