From d6d03d42ea20dfb4ab2b84bbbff4755b455afac1 Mon Sep 17 00:00:00 2001 From: guinness-oai Date: Wed, 20 May 2026 16:03:51 -0700 Subject: [PATCH] [codex] Fix realtime v1 websocket compatibility (#23771) ## Why Realtime v1 websocket sessions now expect a slightly different boundary shape for text input, completed input transcripts, and connection headers. Codex was still using the older shape, so some v1 text appends could be rejected before the existing conversation flow could handle them. ## What changed - Send v1 user text items with `input_text` content - Accept v1 turn-marked input transcript events as completed transcripts - Add the v1 alpha header only for v1 realtime sessions - Cover the outbound text shape, transcript parsing, and versioned headers ## Test plan - `cargo test -p codex-api endpoint::realtime_websocket::methods::tests` - `cargo test -p codex-core quicksilver_alpha_header` --- .../endpoint/realtime_websocket/methods.rs | 20 ++++++++++++++ .../endpoint/realtime_websocket/methods_v1.rs | 2 +- .../endpoint/realtime_websocket/protocol.rs | 1 - .../realtime_websocket/protocol_v1.rs | 3 ++- codex-rs/core/src/realtime_conversation.rs | 7 +++++ .../core/src/realtime_conversation_tests.rs | 27 +++++++++++++++++++ 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 9fcca1c3e3..ad26549a26 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -993,6 +993,22 @@ mod tests { ); } + #[test] + fn parse_v1_input_transcript_turn_marked_event() { + let payload = json!({ + "type": "conversation.input_transcript.turn_marked", + "transcript": "hello realtime" + }) + .to_string(); + + assert_eq!( + parse_realtime_event(payload.as_str(), RealtimeEventParser::V1), + Some(RealtimeEvent::InputTranscriptDone(RealtimeTranscriptDone { + text: 
"hello realtime".to_string(), + })) + ); + } + #[test] fn parse_output_transcript_delta_event() { let payload = json!({ @@ -1581,6 +1597,10 @@ mod tests { .expect("text"); let third_json: Value = serde_json::from_str(&third).expect("json"); assert_eq!(third_json["type"], "conversation.item.create"); + assert_eq!( + third_json["item"]["content"][0]["type"], + Value::String("input_text".to_string()) + ); assert_eq!(third_json["item"]["content"][0]["text"], "hello agent"); let fourth = ws diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs index 19e4fa203a..0f1a269082 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs @@ -21,7 +21,7 @@ pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutbound r#type: ConversationItemType::Message, role: ConversationRole::User, content: vec![ConversationItemContent { - r#type: ConversationContentType::Text, + r#type: ConversationContentType::InputText, text, }], }), diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index d689f6ea96..5df4c0c503 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -199,7 +199,6 @@ pub(super) struct ConversationItemContent { #[derive(Debug, Clone, Copy, Serialize)] #[serde(rename_all = "snake_case")] pub(super) enum ConversationContentType { - Text, InputText, } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs index 3c1d25aed7..a464852244 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v1.rs @@ -43,7 +43,8 @@ pub(super) 
fn parse_realtime_event_v1(payload: &str) -> Option<RealtimeEvent> { | "conversation.item.input_audio_transcription.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::InputTranscriptDelta) } - "conversation.item.input_audio_transcription.completed" => { + "conversation.input_transcript.turn_marked" + | "conversation.item.input_audio_transcription.completed" => { parse_transcript_done_event(&parsed, "transcript") .map(RealtimeEvent::InputTranscriptDone) } diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index 249b3ae15f..7f71142e19 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -641,12 +641,14 @@ async fn prepare_realtime_start( realtime_request_headers( requested_realtime_session_id.as_deref(), Some(realtime_api_key.as_str()), + version, )? } ConversationStartTransport::Webrtc { .. } => { realtime_request_headers( requested_realtime_session_id.as_deref(), /*api_key*/ None, + version, )? 
} }; @@ -973,9 +975,14 @@ fn realtime_api_key(auth: Option<&CodexAuth>, provider: &ModelProviderInfo) -> C fn realtime_request_headers( realtime_session_id: Option<&str>, api_key: Option<&str>, + version: RealtimeWsVersion, ) -> CodexResult<Option<HeaderMap>> { let mut headers = HeaderMap::new(); + if version == RealtimeWsVersion::V1 { + headers.insert("openai-alpha", HeaderValue::from_static("quicksilver=v1")); + } + if let Some(realtime_session_id) = realtime_session_id && let Ok(realtime_session_id) = HeaderValue::from_str(realtime_session_id) { diff --git a/codex-rs/core/src/realtime_conversation_tests.rs b/codex-rs/core/src/realtime_conversation_tests.rs index a146c43869..b67205ef8f 100644 --- a/codex-rs/core/src/realtime_conversation_tests.rs +++ b/codex-rs/core/src/realtime_conversation_tests.rs @@ -1,9 +1,11 @@ use super::RealtimeHandoffState; use super::RealtimeSessionKind; use super::realtime_delegation_from_handoff; +use super::realtime_request_headers; use super::realtime_text_from_handoff_request; use super::wrap_realtime_delegation_input; use async_channel::bounded; +use codex_config::config_toml::RealtimeWsVersion; use codex_protocol::protocol::RealtimeHandoffRequested; use codex_protocol::protocol::RealtimeTranscriptEntry; use pretty_assertions::assert_eq; @@ -137,3 +139,28 @@ async fn clears_active_handoff_explicitly() { *state.active_handoff.lock().await = None; assert_eq!(state.active_handoff.lock().await.clone(), None); } + +#[test] +fn uses_quicksilver_alpha_header_for_realtime_v1() { + let headers = + realtime_request_headers(Some("session_1"), Some("sk-test"), RealtimeWsVersion::V1) + .expect("headers") + .expect("headers"); + + assert_eq!( + headers + .get("openai-alpha") + .and_then(|value| value.to_str().ok()), + Some("quicksilver=v1") + ); +} + +#[test] +fn omits_quicksilver_alpha_header_for_realtime_v2() { + let headers = + realtime_request_headers(Some("session_1"), Some("sk-test"), RealtimeWsVersion::V2) + .expect("headers") + .expect("headers"); + + 
assert!(headers.get("openai-alpha").is_none()); +}