Add WebRTC transport to realtime start (#16960)

Adds WebRTC startup to the experimental app-server
`thread/realtime/start` method via an optional transport enum. The
websocket path remains the default; when a WebRTC offer is supplied, the
realtime session is created through the shared start flow and the answer
SDP is emitted via `thread/realtime/sdp`.

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-04-07 15:43:38 -07:00
committed by GitHub
parent 6c36e7d688
commit fb3dcfde1d
42 changed files with 1574 additions and 85 deletions

View File

@@ -118,6 +118,7 @@ async fn start_realtime_conversation(codex: &codex_core::CodexThread) -> Result<
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;

View File

@@ -7,6 +7,7 @@ use codex_protocol::ThreadId;
use codex_protocol::protocol::CodexErrorInfo;
use codex_protocol::protocol::ConversationAudioParams;
use codex_protocol::protocol::ConversationStartParams;
use codex_protocol::protocol::ConversationStartTransport;
use codex_protocol::protocol::ConversationTextParams;
use codex_protocol::protocol::ErrorEvent;
use codex_protocol::protocol::EventMsg;
@@ -32,15 +33,56 @@ use serde_json::Value;
use serde_json::json;
use std::fs;
use std::process::Command;
use std::sync::Arc;
use std::sync::Mutex;
use std::time::Duration;
use tokio::sync::oneshot;
use tokio::time::timeout;
use wiremock::Match;
use wiremock::Mock;
use wiremock::Request as WiremockRequest;
use wiremock::ResponseTemplate;
use wiremock::matchers::method;
use wiremock::matchers::path_regex;
// NOTE(review): presumably the header line that marks the startup-context
// section in realtime instructions; its use is outside this excerpt — confirm.
const STARTUP_CONTEXT_HEADER: &str = "Startup context from Codex.";
// NOTE(review): presumably a phrase tests look for to detect the memory
// prompt; its use is outside this excerpt — confirm.
const MEMORY_PROMPT_PHRASE: &str =
"You have access to a memory folder with guidance from prior runs.";
// Name of the environment variable that
// `conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth` checks
// for presence. NOTE(review): the name suggests it marks a re-executed test
// subprocess — confirm against the rest of that test.
const REALTIME_CONVERSATION_TEST_SUBPROCESS_ENV_VAR: &str =
"CODEX_REALTIME_CONVERSATION_TEST_SUBPROCESS";
/// Test helper that records the requests it is shown so a test can inspect
/// them afterwards.
///
/// The request list lives behind an `Arc`, so every clone of the capture
/// shares the same storage.
#[derive(Debug, Clone)]
struct RealtimeCallRequestCapture {
    /// Requests recorded so far, in arrival order.
    requests: Arc<Mutex<Vec<WiremockRequest>>>,
}

impl RealtimeCallRequestCapture {
    /// Creates a capture with an empty request list.
    fn new() -> Self {
        let requests = Arc::new(Mutex::new(Vec::new()));
        Self { requests }
    }

    /// Returns a clone of the only recorded request.
    ///
    /// # Panics
    ///
    /// Panics unless exactly one request has been recorded.
    fn single_request(&self) -> WiremockRequest {
        // A poisoned mutex still guards usable data, so recover the guard
        // instead of propagating the poison.
        let captured = match self.requests.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        assert_eq!(captured.len(), 1, "expected one realtime call request");
        captured[0].clone()
    }
}
impl Match for RealtimeCallRequestCapture {
    /// Records a clone of `request` and always reports a match, so this
    /// matcher never rejects a request itself.
    fn matches(&self, request: &WiremockRequest) -> bool {
        // Recover the guard even if another thread panicked while holding it.
        let mut captured = match self.requests.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        captured.push(request.clone());
        true
    }
}
fn websocket_request_text(
request: &core_test_support::responses::WebSocketRequest,
) -> Option<String> {
@@ -182,6 +224,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -292,6 +335,89 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_webrtc_start_posts_generated_session() -> Result<()> {
let server = start_mock_server().await;
let capture = RealtimeCallRequestCapture::new();
Mock::given(method("POST"))
.and(path_regex(".*/realtime/calls$"))
.and(capture.clone())
.respond_with(ResponseTemplate::new(200).set_body_string("v=answer\r\n"))
.mount(&server)
.await;
let mut builder = test_codex().with_config(|config| {
config.experimental_realtime_ws_backend_prompt = Some("backend prompt".to_string());
config.experimental_realtime_ws_model = Some("realtime-test-model".to_string());
config.experimental_realtime_ws_startup_context = Some("startup context".to_string());
});
let test = builder.build(&server).await?;
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: Some(ConversationStartTransport::Webrtc {
sdp: "v=offer\r\n".to_string(),
}),
}))
.await?;
let created = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationSdp(created) => Some(Ok(created.clone())),
EventMsg::Error(err) => Some(Err(err.clone())),
_ => None,
})
.await
.unwrap_or_else(|err: ErrorEvent| panic!("conversation call create failed: {err:?}"));
assert_eq!(created.sdp, "v=answer\r\n");
let closed = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationClosed(closed) => Some(closed.clone()),
_ => None,
})
.await;
assert_eq!(closed.reason.as_deref(), Some("transport_closed"));
let request = capture.single_request();
assert_eq!(request.url.path(), "/v1/realtime/calls");
assert_eq!(request.url.query(), None);
assert_eq!(
request
.headers
.get("authorization")
.and_then(|value| value.to_str().ok()),
Some("Bearer dummy")
);
assert_eq!(
request
.headers
.get("content-type")
.and_then(|value| value.to_str().ok()),
Some("multipart/form-data; boundary=codex-realtime-call-boundary")
);
let body = String::from_utf8(request.body).context("multipart body should be utf-8")?;
let session = r#"{"audio":{"input":{"format":{"type":"audio/pcm","rate":24000}},"output":{"voice":"fathom"}},"type":"quicksilver","model":"realtime-test-model","instructions":"backend prompt\n\nstartup context"}"#;
assert_eq!(
body,
format!(
"--codex-realtime-call-boundary\r\n\
Content-Disposition: form-data; name=\"sdp\"\r\n\
Content-Type: application/sdp\r\n\
\r\n\
v=offer\r\n\
\r\n\
--codex-realtime-call-boundary\r\n\
Content-Disposition: form-data; name=\"session\"\r\n\
Content-Type: application/json\r\n\
\r\n\
{session}\r\n\
--codex-realtime-call-boundary--\r\n"
)
);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> Result<()> {
if std::env::var_os(REALTIME_CONVERSATION_TEST_SUBPROCESS_ENV_VAR).is_none() {
@@ -324,6 +450,7 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() ->
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -383,6 +510,7 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -466,6 +594,7 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -506,6 +635,7 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -594,6 +724,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "old".to_string(),
session_id: Some("conv_old".to_string()),
transport: None,
}))
.await?;
wait_for_event_match(&test.codex, |msg| match msg {
@@ -610,6 +741,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "new".to_string(),
session_id: Some("conv_new".to_string()),
transport: None,
}))
.await?;
wait_for_event_match(&test.codex, |msg| match msg {
@@ -696,6 +828,7 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -750,6 +883,7 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() ->
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "prompt from op".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -812,6 +946,7 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() -
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "prompt from op".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -872,6 +1007,7 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() ->
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "prompt from op".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -925,6 +1061,7 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -978,6 +1115,7 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<()
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1029,6 +1167,7 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() ->
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1113,6 +1252,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1239,6 +1379,7 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() ->
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1380,6 +1521,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1474,6 +1616,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1566,6 +1709,7 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() -
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1665,6 +1809,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio(
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1777,6 +1922,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -1919,6 +2065,7 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
@@ -2045,6 +2192,7 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> {
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;
let _ = wait_for_event_match(&test.codex, |msg| match msg {
@@ -2186,6 +2334,7 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio()
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
transport: None,
}))
.await?;