Update realtime websocket API (#13265)

- migrate the realtime websocket transport to the new session and
handoff flow
- make the realtime model configurable in config.toml and use API-key
auth for the websocket

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-02 16:05:40 -08:00
committed by GitHub
parent d473e8d56d
commit b20b6aa46f
21 changed files with 1449 additions and 507 deletions

View File

@@ -4,7 +4,10 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame;
use crate::endpoint::realtime_websocket::protocol::RealtimeEvent;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::SessionCreateSession;
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
use crate::endpoint::realtime_websocket::protocol::parse_realtime_event;
use crate::error::ApiError;
@@ -13,6 +16,8 @@ use codex_utils_rustls_provider::ensure_rustls_crypto_provider;
use futures::SinkExt;
use futures::StreamExt;
use http::HeaderMap;
use http::HeaderValue;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
@@ -205,23 +210,13 @@ impl RealtimeWebsocketConnection {
self.writer.send_conversation_item_create(text).await
}
pub async fn send_session_update(
pub async fn send_conversation_handoff_append(
&self,
backend_prompt: String,
conversation_id: Option<String>,
handoff_id: String,
output_text: String,
) -> Result<(), ApiError> {
self.writer
.send_session_update(backend_prompt, conversation_id)
.await
}
pub async fn send_session_create(
&self,
backend_prompt: String,
conversation_id: Option<String>,
) -> Result<(), ApiError> {
self.writer
.send_session_create(backend_prompt, conversation_id)
.send_conversation_handoff_append(handoff_id, output_text)
.await
}
@@ -262,13 +257,8 @@ impl RealtimeWebsocketConnection {
impl RealtimeWebsocketWriter {
pub async fn send_audio_frame(&self, frame: RealtimeAudioFrame) -> Result<(), ApiError> {
self.send_json(RealtimeOutboundMessage::InputAudioDelta {
delta: frame.data,
sample_rate: frame.sample_rate,
num_channels: frame.num_channels,
samples_per_channel: frame.samples_per_channel,
})
.await
self.send_json(RealtimeOutboundMessage::InputAudioBufferAppend { audio: frame.data })
.await
}
pub async fn send_conversation_item_create(&self, text: String) -> Result<(), ApiError> {
@@ -285,29 +275,34 @@ impl RealtimeWebsocketWriter {
.await
}
pub async fn send_session_update(
pub async fn send_conversation_handoff_append(
&self,
backend_prompt: String,
conversation_id: Option<String>,
handoff_id: String,
output_text: String,
) -> Result<(), ApiError> {
self.send_json(RealtimeOutboundMessage::SessionUpdate {
session: Some(SessionUpdateSession {
backend_prompt,
conversation_id,
}),
self.send_json(RealtimeOutboundMessage::ConversationHandoffAppend {
handoff_id,
output_text,
})
.await
}
pub async fn send_session_create(
&self,
backend_prompt: String,
conversation_id: Option<String>,
) -> Result<(), ApiError> {
self.send_json(RealtimeOutboundMessage::SessionCreate {
session: SessionCreateSession {
backend_prompt,
conversation_id,
pub async fn send_session_update(&self, instructions: String) -> Result<(), ApiError> {
self.send_json(RealtimeOutboundMessage::SessionUpdate {
session: SessionUpdateSession {
kind: "quicksilver".to_string(),
instructions,
audio: SessionAudio {
input: SessionAudioInput {
format: SessionAudioFormat {
kind: "audio/pcm".to_string(),
rate: 24_000,
},
},
output: SessionAudioOutput {
voice: "mundo".to_string(),
},
},
},
})
.await
@@ -413,14 +408,21 @@ impl RealtimeWebsocketClient {
default_headers: HeaderMap,
) -> Result<RealtimeWebsocketConnection, ApiError> {
ensure_rustls_crypto_provider();
// Keep provider base_url semantics aligned with HTTP clients; derive the ws endpoint here.
let ws_url = websocket_url_from_api_url(self.provider.base_url.as_str())?;
let ws_url = websocket_url_from_api_url(
self.provider.base_url.as_str(),
self.provider.query_params.as_ref(),
config.model.as_deref(),
)?;
let mut request = ws_url
.as_str()
.into_client_request()
.map_err(|err| ApiError::Stream(format!("failed to build websocket request: {err}")))?;
let headers = merge_request_headers(&self.provider.headers, extra_headers, default_headers);
let headers = merge_request_headers(
&self.provider.headers,
with_session_id_header(extra_headers, config.session_id.as_deref())?,
default_headers,
);
request.headers_mut().extend(headers);
info!("connecting realtime websocket: {ws_url}");
@@ -439,11 +441,12 @@ impl RealtimeWebsocketClient {
let (stream, rx_message) = WsStream::new(stream);
let connection = RealtimeWebsocketConnection::new(stream, rx_message);
debug!(
conversation_id = config.session_id.as_deref().unwrap_or("<none>"),
"realtime websocket sending session.create"
session_id = config.session_id.as_deref().unwrap_or("<none>"),
"realtime websocket sending session.update"
);
connection
.send_session_create(config.prompt, config.session_id)
.writer
.send_session_update(config.instructions)
.await?;
Ok(connection)
}
@@ -464,38 +467,99 @@ fn merge_request_headers(
headers
}
fn with_session_id_header(
mut headers: HeaderMap,
session_id: Option<&str>,
) -> Result<HeaderMap, ApiError> {
let Some(session_id) = session_id else {
return Ok(headers);
};
headers.insert(
"x-session-id",
HeaderValue::from_str(session_id).map_err(|err| {
ApiError::Stream(format!("invalid realtime session id header: {err}"))
})?,
);
Ok(headers)
}
fn websocket_config() -> WebSocketConfig {
WebSocketConfig::default()
}
fn websocket_url_from_api_url(api_url: &str) -> Result<Url, ApiError> {
fn websocket_url_from_api_url(
api_url: &str,
query_params: Option<&HashMap<String, String>>,
model: Option<&str>,
) -> Result<Url, ApiError> {
let mut url = Url::parse(api_url)
.map_err(|err| ApiError::Stream(format!("failed to parse realtime api_url: {err}")))?;
normalize_realtime_path(&mut url);
match url.scheme() {
"ws" | "wss" => {
if url.path().is_empty() || url.path() == "/" {
url.set_path("/ws");
}
Ok(url)
}
"ws" | "wss" => {}
"http" | "https" => {
if url.path().is_empty() || url.path() == "/" {
url.set_path("/ws");
}
let scheme = if url.scheme() == "http" { "ws" } else { "wss" };
let _ = url.set_scheme(scheme);
Ok(url)
}
scheme => Err(ApiError::Stream(format!(
"unsupported realtime api_url scheme: {scheme}"
))),
scheme => {
return Err(ApiError::Stream(format!(
"unsupported realtime api_url scheme: {scheme}"
)));
}
}
{
let mut query = url.query_pairs_mut();
query.append_pair("intent", "quicksilver");
if let Some(model) = model {
query.append_pair("model", model);
}
if let Some(query_params) = query_params {
for (key, value) in query_params {
if key == "intent" || (key == "model" && model.is_some()) {
continue;
}
query.append_pair(key, value);
}
}
}
Ok(url)
}
fn normalize_realtime_path(url: &mut Url) {
let path = url.path().to_string();
if path.is_empty() || path == "/" {
url.set_path("/v1/realtime");
return;
}
if path.ends_with("/realtime") {
return;
}
if path.ends_with("/realtime/") {
url.set_path(path.trim_end_matches('/'));
return;
}
if path.ends_with("/v1") {
url.set_path(&format!("{path}/realtime"));
return;
}
if path.ends_with("/v1/") {
url.set_path(&format!("{path}realtime"));
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::endpoint::realtime_websocket::protocol::RealtimeHandoffMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeHandoffRequested;
use http::HeaderValue;
use pretty_assertions::assert_eq;
use serde_json::Value;
@@ -507,17 +571,18 @@ mod tests {
use tokio_tungstenite::tungstenite::Message;
#[test]
fn parse_session_created_event() {
fn parse_session_updated_event() {
let payload = json!({
"type": "session.created",
"session": {"id": "sess_123"}
"type": "session.updated",
"session": {"id": "sess_123", "instructions": "backend prompt"}
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
Some(RealtimeEvent::SessionCreated {
session_id: "sess_123".to_string()
Some(RealtimeEvent::SessionUpdated {
session_id: "sess_123".to_string(),
instructions: Some("backend prompt".to_string()),
})
);
}
@@ -525,10 +590,10 @@ mod tests {
#[test]
fn parse_audio_delta_event() {
let payload = json!({
"type": "response.output_audio.delta",
"type": "conversation.output_audio.delta",
"delta": "AAA=",
"sample_rate": 48000,
"num_channels": 1,
"channels": 1,
"samples_per_channel": 960
})
.to_string();
@@ -547,17 +612,59 @@ mod tests {
fn parse_conversation_item_added_event() {
let payload = json!({
"type": "conversation.item.added",
"item": {"type": "spawn_transcript", "seq": 7}
"item": {"type": "message", "seq": 7}
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
Some(RealtimeEvent::ConversationItemAdded(
json!({"type": "spawn_transcript", "seq": 7})
json!({"type": "message", "seq": 7})
))
);
}
#[test]
fn parse_conversation_item_done_event() {
let payload = json!({
"type": "conversation.item.done",
"item": {"id": "item_123", "type": "message"}
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
Some(RealtimeEvent::ConversationItemDone {
item_id: "item_123".to_string(),
})
);
}
#[test]
fn parse_handoff_requested_event() {
let payload = json!({
"type": "conversation.handoff.requested",
"handoff_id": "handoff_123",
"item_id": "item_123",
"input_transcript": "delegate this",
"messages": [
{"role": "user", "text": "delegate this"}
]
})
.to_string();
assert_eq!(
parse_realtime_event(payload.as_str()),
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
handoff_id: "handoff_123".to_string(),
item_id: "item_123".to_string(),
input_transcript: "delegate this".to_string(),
messages: vec![RealtimeHandoffMessage {
role: "user".to_string(),
text: "delegate this".to_string(),
}],
}))
);
}
#[test]
fn merge_request_headers_matches_http_precedence() {
let mut provider_headers = HeaderMap::new();
@@ -593,14 +700,61 @@ mod tests {
#[test]
fn websocket_url_from_http_base_defaults_to_ws_path() {
let url = websocket_url_from_api_url("http://127.0.0.1:8011").expect("build ws url");
assert_eq!(url.as_str(), "ws://127.0.0.1:8011/ws");
let url =
websocket_url_from_api_url("http://127.0.0.1:8011", None, None).expect("build ws url");
assert_eq!(
url.as_str(),
"ws://127.0.0.1:8011/v1/realtime?intent=quicksilver"
);
}
#[test]
fn websocket_url_from_ws_base_defaults_to_ws_path() {
let url = websocket_url_from_api_url("wss://example.com").expect("build ws url");
assert_eq!(url.as_str(), "wss://example.com/ws");
let url =
websocket_url_from_api_url("wss://example.com", None, Some("realtime-test-model"))
.expect("build ws url");
assert_eq!(
url.as_str(),
"wss://example.com/v1/realtime?intent=quicksilver&model=realtime-test-model"
);
}
#[test]
fn websocket_url_from_v1_base_appends_realtime_path() {
let url = websocket_url_from_api_url("https://api.openai.com/v1", None, Some("snapshot"))
.expect("build ws url");
assert_eq!(
url.as_str(),
"wss://api.openai.com/v1/realtime?intent=quicksilver&model=snapshot"
);
}
#[test]
fn websocket_url_from_nested_v1_base_appends_realtime_path() {
let url =
websocket_url_from_api_url("https://example.com/openai/v1", None, Some("snapshot"))
.expect("build ws url");
assert_eq!(
url.as_str(),
"wss://example.com/openai/v1/realtime?intent=quicksilver&model=snapshot"
);
}
#[test]
fn websocket_url_preserves_existing_realtime_path_and_extra_query_params() {
let url = websocket_url_from_api_url(
"https://example.com/v1/realtime?foo=bar",
Some(&HashMap::from([
("trace".to_string(), "1".to_string()),
("intent".to_string(), "ignored".to_string()),
])),
Some("snapshot"),
)
.expect("build ws url");
assert_eq!(
url.as_str(),
"wss://example.com/v1/realtime?foo=bar&intent=quicksilver&model=snapshot&trace=1"
);
}
#[tokio::test]
@@ -620,26 +774,38 @@ mod tests {
.into_text()
.expect("text");
let first_json: Value = serde_json::from_str(&first).expect("json");
assert_eq!(first_json["type"], "session.create");
assert_eq!(first_json["type"], "session.update");
assert_eq!(
first_json["session"]["backend_prompt"],
first_json["session"]["type"],
Value::String("quicksilver".to_string())
);
assert_eq!(
first_json["session"]["instructions"],
Value::String("backend prompt".to_string())
);
assert_eq!(
first_json["session"]["conversation_id"],
Value::String("conv_1".to_string())
first_json["session"]["audio"]["input"]["format"]["type"],
Value::String("audio/pcm".to_string())
);
assert_eq!(
first_json["session"]["audio"]["input"]["format"]["rate"],
Value::from(24_000)
);
assert_eq!(
first_json["session"]["audio"]["output"]["voice"],
Value::String("mundo".to_string())
);
ws.send(Message::Text(
json!({
"type": "session.created",
"session": {"id": "sess_mock"}
"type": "session.updated",
"session": {"id": "sess_mock", "instructions": "backend prompt"}
})
.to_string()
.into(),
))
.await
.expect("send session.created");
.expect("send session.updated");
let second = ws
.next()
@@ -649,7 +815,7 @@ mod tests {
.into_text()
.expect("text");
let second_json: Value = serde_json::from_str(&second).expect("json");
assert_eq!(second_json["type"], "response.input_audio.delta");
assert_eq!(second_json["type"], "input_audio_buffer.append");
let third = ws
.next()
@@ -662,12 +828,24 @@ mod tests {
assert_eq!(third_json["type"], "conversation.item.create");
assert_eq!(third_json["item"]["content"][0]["text"], "hello agent");
let fourth = ws
.next()
.await
.expect("fourth msg")
.expect("fourth msg ok")
.into_text()
.expect("text");
let fourth_json: Value = serde_json::from_str(&fourth).expect("json");
assert_eq!(fourth_json["type"], "conversation.handoff.append");
assert_eq!(fourth_json["handoff_id"], "handoff_1");
assert_eq!(fourth_json["output_text"], "hello from codex");
ws.send(Message::Text(
json!({
"type": "response.output_audio.delta",
"type": "conversation.output_audio.delta",
"delta": "AQID",
"sample_rate": 48000,
"num_channels": 1
"channels": 1
})
.to_string()
.into(),
@@ -677,8 +855,11 @@ mod tests {
ws.send(Message::Text(
json!({
"type": "conversation.item.added",
"item": {"type": "spawn_transcript", "seq": 2}
"type": "conversation.handoff.requested",
"handoff_id": "handoff_1",
"item_id": "item_2",
"input_transcript": "delegate now",
"messages": [{"role": "user", "text": "delegate now"}]
})
.to_string()
.into(),
@@ -705,7 +886,8 @@ mod tests {
let connection = client
.connect(
RealtimeSessionConfig {
prompt: "backend prompt".to_string(),
instructions: "backend prompt".to_string(),
model: Some("realtime-test-model".to_string()),
session_id: Some("conv_1".to_string()),
},
HeaderMap::new(),
@@ -721,8 +903,9 @@ mod tests {
.expect("event");
assert_eq!(
created,
RealtimeEvent::SessionCreated {
session_id: "sess_mock".to_string()
RealtimeEvent::SessionUpdated {
session_id: "sess_mock".to_string(),
instructions: Some("backend prompt".to_string()),
}
);
@@ -739,6 +922,13 @@ mod tests {
.send_conversation_item_create("hello agent".to_string())
.await
.expect("send item");
connection
.send_conversation_handoff_append(
"handoff_1".to_string(),
"hello from codex".to_string(),
)
.await
.expect("send handoff");
let audio_event = connection
.next_event()
@@ -762,10 +952,15 @@ mod tests {
.expect("event");
assert_eq!(
added_event,
RealtimeEvent::ConversationItemAdded(json!({
"type": "spawn_transcript",
"seq": 2
}))
RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
handoff_id: "handoff_1".to_string(),
item_id: "item_2".to_string(),
input_transcript: "delegate now".to_string(),
messages: vec![RealtimeHandoffMessage {
role: "user".to_string(),
text: "delegate now".to_string(),
}],
})
);
connection.close().await.expect("close");
@@ -789,7 +984,7 @@ mod tests {
.into_text()
.expect("text");
let first_json: Value = serde_json::from_str(&first).expect("json");
assert_eq!(first_json["type"], "session.create");
assert_eq!(first_json["type"], "session.update");
let second = ws
.next()
@@ -799,18 +994,18 @@ mod tests {
.into_text()
.expect("text");
let second_json: Value = serde_json::from_str(&second).expect("json");
assert_eq!(second_json["type"], "response.input_audio.delta");
assert_eq!(second_json["type"], "input_audio_buffer.append");
ws.send(Message::Text(
json!({
"type": "session.created",
"session": {"id": "sess_after_send"}
"type": "session.updated",
"session": {"id": "sess_after_send", "instructions": "backend prompt"}
})
.to_string()
.into(),
))
.await
.expect("send session.created");
.expect("send session.updated");
});
let provider = Provider {
@@ -831,7 +1026,8 @@ mod tests {
let connection = client
.connect(
RealtimeSessionConfig {
prompt: "backend prompt".to_string(),
instructions: "backend prompt".to_string(),
model: Some("realtime-test-model".to_string()),
session_id: Some("conv_1".to_string()),
},
HeaderMap::new(),
@@ -862,8 +1058,9 @@ mod tests {
let next_event = next_result.expect("next event").expect("event");
assert_eq!(
next_event,
RealtimeEvent::SessionCreated {
session_id: "sess_after_send".to_string()
RealtimeEvent::SessionUpdated {
session_id: "sess_after_send".to_string(),
instructions: Some("backend prompt".to_string()),
}
);