Wire realtime api to core (#12268)

- Introduce `RealtimeConversationManager` for realtime API management 
- Add `op::conversation` to start conversation, insert audio, insert
text, and close conversation.
- Emit conversation lifecycle and realtime events.
- Move shared realtime payload types into codex-protocol and add core
e2e websocket tests for start/replace/transport-close paths.

Things to consider:
- Should we use the same `op::` and `Events` channel to carry audio? I
think we should try this simple approach first and create a separate
channel later if these channels become congested.
- Sending text updates to the client: we can start simple and later
restrict that.
- Provider auth is intentionally not wired up for now.
This commit is contained in:
Ahmed Ibrahim
2026-02-20 19:06:35 -08:00
committed by GitHub
parent 936e744c93
commit 6817f0be8a
28 changed files with 2102 additions and 42 deletions

View File

@@ -0,0 +1,360 @@
use anyhow::Result;
use codex_core::protocol::CodexErrorInfo;
use codex_core::protocol::ConversationAudioParams;
use codex_core::protocol::ConversationStartParams;
use codex_core::protocol::ConversationTextParams;
use codex_core::protocol::ErrorEvent;
use codex_core::protocol::EventMsg;
use codex_core::protocol::Op;
use codex_core::protocol::RealtimeAudioFrame;
use codex_core::protocol::RealtimeConversationRealtimeEvent;
use codex_core::protocol::RealtimeEvent;
use core_test_support::responses::start_websocket_server;
use core_test_support::skip_if_no_network;
use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event_match;
use pretty_assertions::assert_eq;
use serde_json::json;
use std::time::Duration;
/// Drives a full realtime conversation against a scripted websocket server.
///
/// The test submits start, audio, text and close ops, then checks both the
/// events emitted back to the client and the requests the server recorded.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
    skip_if_no_network!(Ok(()));

    // NOTE(review): the script appears to hold one entry per accepted
    // connection, each containing one batch of server events per incoming
    // request — confirm against `start_websocket_server`. The assertions
    // below (2 connections, 3 requests on the second) match that shape.
    let server = start_websocket_server(vec![
        vec![],
        vec![
            vec![json!({
                "type": "session.created",
                "session": { "id": "sess_1" }
            })],
            vec![],
            vec![
                json!({
                    "type": "response.output_audio.delta",
                    "delta": "AQID",
                    "sample_rate": 24000,
                    "num_channels": 1
                }),
                json!({
                    "type": "conversation.item.added",
                    "item": {
                        "type": "message",
                        "role": "assistant",
                        "content": [{"type": "text", "text": "hi"}]
                    }
                }),
            ],
        ],
    ])
    .await;

    let mut builder = test_codex();
    let test = builder.build_with_websocket_server(&server).await?;
    assert!(server.wait_for_handshakes(1, Duration::from_secs(2)).await);

    // Start the conversation; an `Error` event is surfaced as a test failure
    // instead of letting the wait continue.
    test.codex
        .submit(Op::RealtimeConversationStart(ConversationStartParams {
            prompt: "backend prompt".to_string(),
            session_id: None,
        }))
        .await?;
    let started = wait_for_event_match(&test.codex, |msg| match msg {
        EventMsg::RealtimeConversationStarted(started) => Some(Ok(started.clone())),
        EventMsg::Error(err) => Some(Err(err.clone())),
        _ => None,
    })
    .await
    .unwrap_or_else(|err: ErrorEvent| panic!("conversation start failed: {err:?}"));
    assert!(started.session_id.is_some());

    let session_created = wait_for_event_match(&test.codex, |msg| match msg {
        EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
            payload: RealtimeEvent::SessionCreated { session_id },
        }) => Some(session_id.clone()),
        _ => None,
    })
    .await;
    assert_eq!(session_created, "sess_1");

    // Send one audio frame followed by one text message.
    test.codex
        .submit(Op::RealtimeConversationAudio(ConversationAudioParams {
            frame: RealtimeAudioFrame {
                data: "AQID".to_string(),
                sample_rate: 24000,
                num_channels: 1,
                samples_per_channel: Some(480),
            },
        }))
        .await?;
    test.codex
        .submit(Op::RealtimeConversationText(ConversationTextParams {
            text: "hello".to_string(),
        }))
        .await?;

    // The audio delta is scripted as the reply to the third request, so once
    // it is observed the server has recorded all three requests.
    let audio_out = wait_for_event_match(&test.codex, |msg| match msg {
        EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
            payload: RealtimeEvent::AudioOut(frame),
        }) => Some(frame.clone()),
        _ => None,
    })
    .await;
    assert_eq!(audio_out.data, "AQID");

    let connections = server.connections();
    assert_eq!(connections.len(), 2);
    let connection = &connections[1];
    assert_eq!(connection.len(), 3);
    assert_eq!(
        connection[0].body_json()["type"].as_str(),
        Some("session.create")
    );
    assert_eq!(
        connection[0].body_json()["session"]["conversation_id"]
            .as_str()
            .expect("session.create conversation_id"),
        started
            .session_id
            .as_deref()
            .expect("started session id should be present")
    );

    let mut request_types = [
        connection[1].body_json()["type"]
            .as_str()
            .expect("request type")
            .to_string(),
        connection[2].body_json()["type"]
            .as_str()
            .expect("request type")
            .to_string(),
    ];
    // The audio op is submitted before the text op above, so the wire order
    // of these two requests must not be assumed to be "text, then audio".
    // Sort so the assertion pins which request types were sent, not their
    // relative order.
    request_types.sort();
    assert_eq!(
        request_types,
        [
            "conversation.item.create".to_string(),
            "response.input_audio.delta".to_string(),
        ]
    );

    test.codex.submit(Op::RealtimeConversationClose).await?;
    let closed = wait_for_event_match(&test.codex, |msg| match msg {
        EventMsg::RealtimeConversationClosed(closed) => Some(closed.clone()),
        _ => None,
    })
    .await;
    // Either reason is accepted here.
    assert!(matches!(
        closed.reason.as_deref(),
        Some("requested" | "transport_closed")
    ));

    server.shutdown().await;
    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_transport_close_emits_closed_event() -> Result<()> {
skip_if_no_network!(Ok(()));
let session_created = vec![json!({
"type": "session.created",
"session": { "id": "sess_1" }
})];
let server = start_websocket_server(vec![vec![], vec![session_created]]).await;
let mut builder = test_codex();
let test = builder.build_with_websocket_server(&server).await?;
assert!(server.wait_for_handshakes(1, Duration::from_secs(2)).await);
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "backend prompt".to_string(),
session_id: None,
}))
.await?;
let started = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationStarted(started) => Some(Ok(started.clone())),
EventMsg::Error(err) => Some(Err(err.clone())),
_ => None,
})
.await
.unwrap_or_else(|err: ErrorEvent| panic!("conversation start failed: {err:?}"));
assert!(started.session_id.is_some());
let session_created = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
payload: RealtimeEvent::SessionCreated { session_id },
}) => Some(session_id.clone()),
_ => None,
})
.await;
assert_eq!(session_created, "sess_1");
let closed = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationClosed(closed) => Some(closed.clone()),
_ => None,
})
.await;
assert_eq!(closed.reason.as_deref(), Some("transport_closed"));
server.shutdown().await;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_audio_before_start_emits_error() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_websocket_server(vec![]).await;
let mut builder = test_codex();
let test = builder.build_with_websocket_server(&server).await?;
test.codex
.submit(Op::RealtimeConversationAudio(ConversationAudioParams {
frame: RealtimeAudioFrame {
data: "AQID".to_string(),
sample_rate: 24000,
num_channels: 1,
samples_per_channel: Some(480),
},
}))
.await?;
let err = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::Error(err) => Some(err.clone()),
_ => None,
})
.await;
assert_eq!(err.codex_error_info, Some(CodexErrorInfo::BadRequest));
assert_eq!(err.message, "conversation is not running");
server.shutdown().await;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_text_before_start_emits_error() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_websocket_server(vec![]).await;
let mut builder = test_codex();
let test = builder.build_with_websocket_server(&server).await?;
test.codex
.submit(Op::RealtimeConversationText(ConversationTextParams {
text: "hello".to_string(),
}))
.await?;
let err = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::Error(err) => Some(err.clone()),
_ => None,
})
.await;
assert_eq!(err.codex_error_info, Some(CodexErrorInfo::BadRequest));
assert_eq!(err.message, "conversation is not running");
server.shutdown().await;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn conversation_second_start_replaces_runtime() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_websocket_server(vec![
vec![],
vec![vec![json!({
"type": "session.created",
"session": { "id": "sess_old" }
})]],
vec![
vec![json!({
"type": "session.created",
"session": { "id": "sess_new" }
})],
vec![json!({
"type": "response.output_audio.delta",
"delta": "AQID",
"sample_rate": 24000,
"num_channels": 1
})],
],
])
.await;
let mut builder = test_codex();
let test = builder.build_with_websocket_server(&server).await?;
assert!(server.wait_for_handshakes(1, Duration::from_secs(2)).await);
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "old".to_string(),
session_id: Some("conv_old".to_string()),
}))
.await?;
wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
payload: RealtimeEvent::SessionCreated { session_id },
}) if session_id == "sess_old" => Some(Ok(())),
EventMsg::Error(err) => Some(Err(err.clone())),
_ => None,
})
.await
.unwrap_or_else(|err: ErrorEvent| panic!("first conversation start failed: {err:?}"));
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: "new".to_string(),
session_id: Some("conv_new".to_string()),
}))
.await?;
wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
payload: RealtimeEvent::SessionCreated { session_id },
}) if session_id == "sess_new" => Some(Ok(())),
EventMsg::Error(err) => Some(Err(err.clone())),
_ => None,
})
.await
.unwrap_or_else(|err: ErrorEvent| panic!("second conversation start failed: {err:?}"));
test.codex
.submit(Op::RealtimeConversationAudio(ConversationAudioParams {
frame: RealtimeAudioFrame {
data: "AQID".to_string(),
sample_rate: 24000,
num_channels: 1,
samples_per_channel: Some(480),
},
}))
.await?;
let _ = wait_for_event_match(&test.codex, |msg| match msg {
EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
payload: RealtimeEvent::AudioOut(frame),
}) if frame.data == "AQID" => Some(()),
_ => None,
})
.await;
let connections = server.connections();
assert_eq!(connections.len(), 3);
assert_eq!(connections[1].len(), 1);
assert_eq!(
connections[1][0].body_json()["session"]["conversation_id"].as_str(),
Some("conv_old")
);
assert_eq!(connections[2].len(), 2);
assert_eq!(
connections[2][0].body_json()["session"]["conversation_id"].as_str(),
Some("conv_new")
);
assert_eq!(
connections[2][1].body_json()["type"].as_str(),
Some("response.input_audio.delta")
);
server.shutdown().await;
Ok(())
}