Mirror user text into realtime (#17520)

- Let typed user messages submit while realtime is active and mirror accepted text into the realtime text stream.
- Add integration coverage and snapshot for outbound realtime text.
2026-04-26 23:55:25 +00:00 · 2026-04-12 15:03:14 -07:00
parent cb870a169a
commit d840b247d7
7 changed files with 200 additions and 102 deletions
--- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs
+++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs
@@ -661,7 +661,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
    let connections = realtime_server.connections();
    assert_eq!(connections.len(), 1);
    let connection = &connections[0];
-    assert_eq!(connection.len(), 4);
+    assert_eq!(connection.len(), 3);
    assert_eq!(
        connection[0].body_json()["type"].as_str(),
        Some("session.update")
@@ -679,10 +679,6 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
            .as_str()
            .context("expected websocket request type")?
            .to_string(),
-        connection[3].body_json()["type"]
-            .as_str()
-            .context("expected websocket request type")?
-            .to_string(),
    ];
    request_types.sort();
    assert_eq!(
@@ -690,7 +686,6 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
        [
            "conversation.item.create".to_string(),
            "input_audio_buffer.append".to_string(),
-            "response.create".to_string(),
        ]
    );

@@ -1153,7 +1148,6 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
                    "samples_per_channel": 512
                }),
            ],
-            vec![],
        ])]),
    )
    .await?;
@@ -1185,7 +1179,6 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
    let requests = [
        harness.sideband_outbound_request(/*request_index*/ 1).await,
        harness.sideband_outbound_request(/*request_index*/ 2).await,
-        harness.sideband_outbound_request(/*request_index*/ 3).await,
    ];
    assert!(
        requests
@@ -1208,13 +1201,12 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
    Ok(())
 }

-/// Regression coverage for Realtime V2's single-active-response rule.
+/// Regression coverage for Realtime V2 text input while a response is active.
 ///
-/// The Realtime API rejects a new `response.create` while a default response is
-/// still active, so the input task should queue the second create and flush it
-/// only after the server sends `response.done` for the active response.
+/// Text input is append-only, so app-server should send the user message without
+/// requesting a new realtime response.
 #[tokio::test]
-async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Result<()> {
+async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Result<()> {
    skip_if_no_network!(Ok(()));

    // Phase 1: script a server-side response that becomes active after the first
@@ -1224,7 +1216,6 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
        no_main_loop_responses(),
        realtime_sideband(vec![realtime_sideband_connection(vec![
            vec![session_updated("sess_v2_response_queue")],
-            vec![],
            vec![
                json!({
                    "type": "response.created",
@@ -1240,7 +1231,6 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
                "type": "response.done",
                "response": { "id": "resp_active" }
            })],
-            vec![],
        ])]),
    )
    .await?;
@@ -1253,15 +1243,14 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
    // notifications; they are the protocol frames app-server sends upstream.
    assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;

-    // Phase 2: send the first text turn. It is safe to emit `response.create`
-    // immediately because no default response is active yet.
+    // Phase 2: send the first text turn. Text input is append-only, so this
+    // sends only the user text item.
    let thread_id = started.started.thread_id.clone();
    harness.append_text(thread_id.clone(), "first").await?;
    assert_v2_user_text_item(
        &harness.sideband_outbound_request(/*request_index*/ 1).await,
        "first",
    );
-    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);
    let transcript = harness
        .read_notification::<ThreadRealtimeTranscriptUpdatedNotification>(
            "thread/realtime/transcriptUpdated",
@@ -1270,39 +1259,28 @@ async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Res
    assert_eq!(transcript.text, "active response started");

    // Phase 3: send a second text turn while `resp_active` is still open. The
-    // user message must reach realtime, but `response.create` must not be sent
-    // yet or the Realtime API rejects it as an active-response conflict.
+    // user message must reach realtime without requesting another response.
    harness.append_text(thread_id.clone(), "second").await?;
    assert_v2_user_text_item(
-        &harness.sideband_outbound_request(/*request_index*/ 3).await,
+        &harness.sideband_outbound_request(/*request_index*/ 2).await,
        "second",
    );

-    // Phase 4: the audio input causes the scripted sideband stream to send
-    // `response.done`, which clears the active response and flushes the queued
-    // `response.create` for the second text turn.
+    // Phase 4: audio still forwards normally after text input.
    harness.append_audio(thread_id).await?;

-    // This is the negative check: if the second text turn had emitted
-    // `response.create` immediately, request 4 would be that create instead of
-    // the audio append.
-    let audio = harness.sideband_outbound_request(/*request_index*/ 4).await;
+    let audio = harness.sideband_outbound_request(/*request_index*/ 3).await;
    assert_eq!(audio["type"], "input_audio_buffer.append");
    assert_eq!(audio["audio"], "BQYH");
-    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 5).await);

    harness.shutdown().await;
    Ok(())
 }

-/// Regression coverage for the same queued `response.create` path when the
-/// active Realtime V2 response is cancelled instead of completed.
-///
-/// `response.cancelled` should clear the active-response guard exactly like
-/// `response.done`, so a text turn queued during the active response still gets
-/// one deferred `response.create`.
+/// Regression coverage for append-only Realtime V2 text input when the active
+/// response is cancelled instead of completed.
 #[tokio::test]
-async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelled() -> Result<()> {
+async fn webrtc_v2_text_input_is_append_only_when_response_is_cancelled() -> Result<()> {
    skip_if_no_network!(Ok(()));

    // Phase 1: script a server-side response that becomes active after the first
@@ -1312,7 +1290,6 @@ async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelle
        no_main_loop_responses(),
        realtime_sideband(vec![realtime_sideband_connection(vec![
            vec![session_updated("sess_v2_response_cancel_queue")],
-            vec![],
            vec![json!({
                "type": "response.created",
                "response": { "id": "resp_cancelled" }
@@ -1322,7 +1299,6 @@ async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelle
                "type": "response.cancelled",
                "response": { "id": "resp_cancelled" }
            })],
-            vec![],
        ])]),
    )
    .await?;
@@ -1331,36 +1307,29 @@ async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelle
    assert_eq!(started.started.version, RealtimeConversationVersion::V2);
    assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;

-    // Phase 2: send the first text turn. It is safe to emit `response.create`
-    // immediately because no default response is active yet.
+    // Phase 2: send the first text turn. Text input is append-only, so this
+    // sends only the user text item.
    let thread_id = started.started.thread_id.clone();
    harness.append_text(thread_id.clone(), "first").await?;
    assert_v2_user_text_item(
        &harness.sideband_outbound_request(/*request_index*/ 1).await,
        "first",
    );
-    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);

    // Phase 3: send a second text turn while `resp_cancelled` is still open.
-    // The user message must reach realtime, but `response.create` stays queued.
+    // The user message must reach realtime without requesting another response.
    harness.append_text(thread_id.clone(), "second").await?;
    assert_v2_user_text_item(
-        &harness.sideband_outbound_request(/*request_index*/ 3).await,
+        &harness.sideband_outbound_request(/*request_index*/ 2).await,
        "second",
    );

-    // Phase 4: the audio input causes the scripted sideband stream to send
-    // `response.cancelled`, which clears the active response and flushes the
-    // queued `response.create` for the second text turn.
+    // Phase 4: audio still forwards normally after text input.
    harness.append_audio(thread_id).await?;

-    // This is the negative check: if the second text turn had emitted
-    // `response.create` immediately, request 4 would be that create instead of
-    // the audio append.
-    let audio = harness.sideband_outbound_request(/*request_index*/ 4).await;
+    let audio = harness.sideband_outbound_request(/*request_index*/ 3).await;
    assert_eq!(audio["type"], "input_audio_buffer.append");
    assert_eq!(audio["audio"], "BQYH");
-    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 5).await);

    harness.shutdown().await;
    Ok(())