Mirror user text into realtime (#17520)

- Let typed user messages submit while realtime is active and mirror
accepted text into the realtime text stream.
- Add integration coverage and snapshot for outbound realtime text.
This commit is contained in:
Ahmed Ibrahim
2026-04-12 15:03:14 -07:00
committed by GitHub
parent cb870a169a
commit d840b247d7
7 changed files with 200 additions and 102 deletions

View File

@@ -661,7 +661,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
let connections = realtime_server.connections();
assert_eq!(connections.len(), 1);
let connection = &connections[0];
assert_eq!(connection.len(), 4);
assert_eq!(connection.len(), 3);
assert_eq!(
connection[0].body_json()["type"].as_str(),
Some("session.update")
@@ -679,10 +679,6 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
.as_str()
.context("expected websocket request type")?
.to_string(),
connection[3].body_json()["type"]
.as_str()
.context("expected websocket request type")?
.to_string(),
];
request_types.sort();
assert_eq!(
@@ -690,7 +686,6 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
[
"conversation.item.create".to_string(),
"input_audio_buffer.append".to_string(),
"response.create".to_string(),
]
);
@@ -1153,7 +1148,6 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
"samples_per_channel": 512
}),
],
vec![],
])]),
)
.await?;
@@ -1185,7 +1179,6 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
let requests = [
harness.sideband_outbound_request(/*request_index*/ 1).await,
harness.sideband_outbound_request(/*request_index*/ 2).await,
harness.sideband_outbound_request(/*request_index*/ 3).await,
];
assert!(
requests
@@ -1208,13 +1201,12 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
Ok(())
}
/// Regression coverage for Realtime V2's single-active-response rule.
/// Regression coverage for Realtime V2 text input while a response is active.
///
/// The Realtime API rejects a new `response.create` while a default response is
/// still active, so the input task should queue the second create and flush it
/// only after the server sends `response.done` for the active response.
/// Text input is append-only, so app-server should send the user message without
/// requesting a new realtime response.
#[tokio::test]
async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Result<()> {
async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Result<()> {
skip_if_no_network!(Ok(()));
// Phase 1: script a server-side response that becomes active after the first
@@ -1224,7 +1216,6 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
no_main_loop_responses(),
realtime_sideband(vec![realtime_sideband_connection(vec![
vec![session_updated("sess_v2_response_queue")],
vec![],
vec![
json!({
"type": "response.created",
@@ -1240,7 +1231,6 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
"type": "response.done",
"response": { "id": "resp_active" }
})],
vec![],
])]),
)
.await?;
@@ -1253,15 +1243,14 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
// notifications; they are the protocol frames app-server sends upstream.
assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;
// Phase 2: send the first text turn. It is safe to emit `response.create`
// immediately because no default response is active yet.
// Phase 2: send the first text turn. Text input is append-only, so this
// sends only the user text item.
let thread_id = started.started.thread_id.clone();
harness.append_text(thread_id.clone(), "first").await?;
assert_v2_user_text_item(
&harness.sideband_outbound_request(/*request_index*/ 1).await,
"first",
);
assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);
let transcript = harness
.read_notification::<ThreadRealtimeTranscriptUpdatedNotification>(
"thread/realtime/transcriptUpdated",
@@ -1270,39 +1259,28 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
assert_eq!(transcript.text, "active response started");
// Phase 3: send a second text turn while `resp_active` is still open. The
// user message must reach realtime, but `response.create` must not be sent
// yet or the Realtime API rejects it as an active-response conflict.
// user message must reach realtime without requesting another response.
harness.append_text(thread_id.clone(), "second").await?;
assert_v2_user_text_item(
&harness.sideband_outbound_request(/*request_index*/ 3).await,
&harness.sideband_outbound_request(/*request_index*/ 2).await,
"second",
);
// Phase 4: the audio input causes the scripted sideband stream to send
// `response.done`, which clears the active response and flushes the queued
// `response.create` for the second text turn.
// Phase 4: audio still forwards normally after text input.
harness.append_audio(thread_id).await?;
// This is the negative check: if the second text turn had emitted
// `response.create` immediately, request 4 would be that create instead of
// the audio append.
let audio = harness.sideband_outbound_request(/*request_index*/ 4).await;
let audio = harness.sideband_outbound_request(/*request_index*/ 3).await;
assert_eq!(audio["type"], "input_audio_buffer.append");
assert_eq!(audio["audio"], "BQYH");
assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 5).await);
harness.shutdown().await;
Ok(())
}
/// Regression coverage for the same queued `response.create` path when the
/// active Realtime V2 response is cancelled instead of completed.
///
/// `response.cancelled` should clear the active-response guard exactly like
/// `response.done`, so a text turn queued during the active response still gets
/// one deferred `response.create`.
/// Regression coverage for append-only Realtime V2 text input when the active
/// response is cancelled instead of completed.
#[tokio::test]
async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelled() -> Result<()> {
async fn webrtc_v2_text_input_is_append_only_when_response_is_cancelled() -> Result<()> {
skip_if_no_network!(Ok(()));
// Phase 1: script a server-side response that becomes active after the first
@@ -1312,7 +1290,6 @@ async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelle
no_main_loop_responses(),
realtime_sideband(vec![realtime_sideband_connection(vec![
vec![session_updated("sess_v2_response_cancel_queue")],
vec![],
vec![json!({
"type": "response.created",
"response": { "id": "resp_cancelled" }
@@ -1322,7 +1299,6 @@ async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelle
"type": "response.cancelled",
"response": { "id": "resp_cancelled" }
})],
vec![],
])]),
)
.await?;
@@ -1331,36 +1307,29 @@ async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelle
assert_eq!(started.started.version, RealtimeConversationVersion::V2);
assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;
// Phase 2: send the first text turn. It is safe to emit `response.create`
// immediately because no default response is active yet.
// Phase 2: send the first text turn. Text input is append-only, so this
// sends only the user text item.
let thread_id = started.started.thread_id.clone();
harness.append_text(thread_id.clone(), "first").await?;
assert_v2_user_text_item(
&harness.sideband_outbound_request(/*request_index*/ 1).await,
"first",
);
assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);
// Phase 3: send a second text turn while `resp_cancelled` is still open.
// The user message must reach realtime, but `response.create` stays queued.
// The user message must reach realtime without requesting another response.
harness.append_text(thread_id.clone(), "second").await?;
assert_v2_user_text_item(
&harness.sideband_outbound_request(/*request_index*/ 3).await,
&harness.sideband_outbound_request(/*request_index*/ 2).await,
"second",
);
// Phase 4: the audio input causes the scripted sideband stream to send
// `response.cancelled`, which clears the active response and flushes the
// queued `response.create` for the second text turn.
// Phase 4: audio still forwards normally after text input.
harness.append_audio(thread_id).await?;
// This is the negative check: if the second text turn had emitted
// `response.create` immediately, request 4 would be that create instead of
// the audio append.
let audio = harness.sideband_outbound_request(/*request_index*/ 4).await;
let audio = harness.sideband_outbound_request(/*request_index*/ 3).await;
assert_eq!(audio["type"], "input_audio_buffer.append");
assert_eq!(audio["audio"], "BQYH");
assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 5).await);
harness.shutdown().await;
Ok(())