Queue Realtime V2 response.create while active (#17306)

Builds on #17264.

- queues Realtime V2 `response.create` while an active response is open, then flushes it after `response.done` or `response.cancelled`
- requests `response.create` after background agent final output and steering acknowledgements
- adds app-server integration coverage for all `response.create` paths

Validation:

- `just fmt`
- `cargo check -p codex-app-server --tests`
- `git diff --check`
- CI green

---------

Co-authored-by: Codex <noreply@openai.com>
2026-04-26 23:55:25 +00:00 · 2026-04-10 09:09:13 -07:00
parent 88165e179a
commit 2e81eac004
7 changed files with 454 additions and 40 deletions
--- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs
+++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs
@@ -71,6 +71,8 @@ use wiremock::matchers::path_regex;

 const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
 const STARTUP_CONTEXT_HEADER: &str = "Startup context from Codex.";
+const V2_STEERING_ACKNOWLEDGEMENT: &str =
+    "This was sent to steer the previous background agent task.";

 #[derive(Debug, Clone, Copy)]
 enum StartupContextConfig<'a> {
@@ -329,6 +331,8 @@ impl RealtimeE2eHarness {
        read_notification(&mut self.mcp, method).await
    }

+    /// Returns the nth JSON message app-server wrote to the fake Realtime API
+    /// sideband websocket.
    async fn sideband_outbound_request(&self, request_index: usize) -> Value {
        self.realtime_server
            .wait_for_request(/*connection_index*/ 0, request_index)
@@ -1204,6 +1208,169 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu
    Ok(())
 }

+/// Regression coverage for Realtime V2's single-active-response rule.
+///
+/// The Realtime API rejects a new `response.create` while a default response is
+/// still active, so the input task should queue the second create and flush it
+/// only after the server sends `response.done` for the active response.
+#[tokio::test]
+async fn webrtc_v2_queues_text_response_create_while_response_is_active() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    // Phase 1: script the sideband. Batch N presumably answers outbound request N
+    // (TODO confirm): `resp_active` opens after request 2, finishes after request 4.
+    let mut harness = RealtimeE2eHarness::new(
+        RealtimeTestVersion::V2,
+        no_main_loop_responses(),
+        realtime_sideband(vec![realtime_sideband_connection(vec![
+            vec![session_updated("sess_v2_response_queue")],
+            vec![],
+            vec![
+                json!({
+                    "type": "response.created",
+                    "response": { "id": "resp_active" }
+                }),
+                json!({
+                    "type": "response.output_text.delta",
+                    "delta": "active response started"
+                }),
+            ],
+            vec![],
+            vec![json!({
+                "type": "response.done",
+                "response": { "id": "resp_active" }
+            })],
+            vec![],
+        ])]),
+    )
+    .await?;
+
+    let started = harness.start_webrtc_realtime("v=offer\r\n").await?;
+    assert_eq!(started.started.version, RealtimeConversationVersion::V2);
+
+    // From here on, `sideband_outbound_request(n)` reads outbound messages to
+    // the fake Realtime API sideband websocket. These are not client-facing
+    // notifications; they are the protocol frames app-server sends upstream.
+    assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;
+
+    // Phase 2: send the first text turn. It is safe to emit `response.create`
+    // immediately because no default response is active yet.
+    let thread_id = started.started.thread_id.clone();
+    harness.append_text(thread_id.clone(), "first").await?;
+    assert_v2_user_text_item(
+        &harness.sideband_outbound_request(/*request_index*/ 1).await,
+        "first",
+    );
+    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);
+    let transcript = harness
+        .read_notification::<ThreadRealtimeTranscriptUpdatedNotification>(
+            "thread/realtime/transcriptUpdated",
+        )
+        .await?;
+    assert_eq!(transcript.text, "active response started");
+
+    // Phase 3: send a second text turn while `resp_active` is still open. The
+    // user message must reach realtime, but `response.create` must not be sent
+    // yet or the Realtime API rejects it as an active-response conflict.
+    harness.append_text(thread_id.clone(), "second").await?;
+    assert_v2_user_text_item(
+        &harness.sideband_outbound_request(/*request_index*/ 3).await,
+        "second",
+    );
+
+    // Phase 4: the audio input causes the scripted sideband stream to send
+    // `response.done`, which clears the active response and flushes the queued
+    // `response.create` for the second text turn.
+    harness.append_audio(thread_id).await?;
+
+    // This is the negative check: if the second text turn had emitted
+    // `response.create` immediately, request 4 would be that create instead of
+    // the audio append.
+    let audio = harness.sideband_outbound_request(/*request_index*/ 4).await;
+    assert_eq!(audio["type"], "input_audio_buffer.append");
+    assert_eq!(audio["audio"], "BQYH");
+    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 5).await);
+
+    harness.shutdown().await;
+    Ok(())
+}
+
+/// Regression coverage for the same queued `response.create` path when the
+/// active Realtime V2 response is cancelled instead of completed.
+///
+/// `response.cancelled` should clear the active-response guard exactly like
+/// `response.done`, so a text turn queued during the active response still gets
+/// one deferred `response.create`.
+#[tokio::test]
+async fn webrtc_v2_flushes_queued_text_response_create_when_response_is_cancelled() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    // Phase 1: script the sideband. Batch N presumably answers outbound request N
+    // (TODO confirm): `resp_cancelled` opens after request 2, is cancelled after request 4.
+    let mut harness = RealtimeE2eHarness::new(
+        RealtimeTestVersion::V2,
+        no_main_loop_responses(),
+        realtime_sideband(vec![realtime_sideband_connection(vec![
+            vec![session_updated("sess_v2_response_cancel_queue")],
+            vec![],
+            vec![json!({
+                "type": "response.created",
+                "response": { "id": "resp_cancelled" }
+            })],
+            vec![],
+            vec![json!({
+                "type": "response.cancelled",
+                "response": { "id": "resp_cancelled" }
+            })],
+            vec![],
+        ])]),
+    )
+    .await?;
+
+    let started = harness.start_webrtc_realtime("v=offer\r\n").await?;
+    assert_eq!(started.started.version, RealtimeConversationVersion::V2);
+    assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;
+
+    // Phase 2: send the first text turn. It is safe to emit `response.create`
+    // immediately because no default response is active yet.
+    let thread_id = started.started.thread_id.clone();
+    harness.append_text(thread_id.clone(), "first").await?;
+    assert_v2_user_text_item(
+        &harness.sideband_outbound_request(/*request_index*/ 1).await,
+        "first",
+    );
+    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);
+
+    // Phase 3: second text turn while `resp_cancelled` is open: item sent, create queued.
+    // NOTE(review): unlike the `response.done` test, no notification is read first (race?).
+    harness.append_text(thread_id.clone(), "second").await?;
+    assert_v2_user_text_item(
+        &harness.sideband_outbound_request(/*request_index*/ 3).await,
+        "second",
+    );
+
+    // Phase 4: the audio input causes the scripted sideband stream to send
+    // `response.cancelled`, which clears the active response and flushes the
+    // queued `response.create` for the second text turn.
+    harness.append_audio(thread_id).await?;
+
+    // This is the negative check: if the second text turn had emitted
+    // `response.create` immediately, request 4 would be that create instead of
+    // the audio append.
+    let audio = harness.sideband_outbound_request(/*request_index*/ 4).await;
+    assert_eq!(audio["type"], "input_audio_buffer.append");
+    assert_eq!(audio["audio"], "BQYH");
+    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 5).await);
+
+    harness.shutdown().await;
+    Ok(())
+}
+
+/// Regression coverage for the Realtime V2 background-agent final-output path.
+///
+/// Once the background agent finishes, app-server sends the final function-call
+/// output to realtime and then requests a new `response.create` so realtime can
+/// react to that final output.
 #[tokio::test]
 async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_output() -> Result<()>
 {
@@ -1223,6 +1390,7 @@ async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_out
            ],
            vec![],
            vec![],
+            vec![],
        ])]),
    )
    .await?;
@@ -1240,8 +1408,8 @@ async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_out
        .await?;
    assert_eq!(turn_completed.thread_id, harness.thread_id);

-    // Phase 3: assert the delegated prompt went to Responses and the result returned as exactly one
-    // v2 function-call output event on the sideband.
+    // Phase 3: assert the delegated prompt went to Responses and the result
+    // returned as exactly one v2 function-call output event on the sideband.
    let requests = harness.main_loop_responses_requests().await?;
    assert_eq!(requests.len(), 1);
    assert!(
@@ -1260,6 +1428,99 @@ async fn webrtc_v2_background_agent_tool_call_delegates_and_returns_function_out
        1
    );

+    // Phase 4: after the final function-call output, realtime needs an explicit
+    // `response.create` to produce the next user-visible response.
+    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 3).await);
+
+    harness.shutdown().await;
+    Ok(())
+}
+
+/// Regression coverage for Realtime V2 steering while a background-agent task is
+/// already active.
+///
+/// The second background-agent tool call is treated as guidance for the active
+/// task. App-server acknowledges that steering message to realtime and then
+/// emits `response.create` so realtime can speak that acknowledgement.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn webrtc_v2_background_agent_steering_ack_requests_response_create() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    // Phase 1: gate the delegated Responses turn from the first tool call so the
+    // handoff stays active while realtime sends a second, steering tool call. One
+    // mock is set up for both expected `/responses` POSTs with one scripted SSE body.
+    let main_loop_responses_server = responses::start_mock_server().await;
+    let (gate_completed_tx, gate_completed_rx) = mpsc::channel();
+    let gated_response = responses::sse(vec![
+        responses::ev_response_created("resp-1"),
+        responses::ev_assistant_message("msg-1", "first task finished"),
+        responses::ev_completed("resp-1"),
+    ]);
+    Mock::given(method("POST"))
+        .and(path_regex(".*/responses$"))
+        .respond_with(GatedSseResponse {
+            gate_rx: Mutex::new(Some(gate_completed_rx)),
+            response: gated_response,
+        })
+        .expect(2)
+        .mount(&main_loop_responses_server)
+        .await;
+
+    let mut harness = RealtimeE2eHarness::new_with_main_loop_responses_server(
+        RealtimeTestVersion::V2,
+        main_loop_responses_server,
+        realtime_sideband(vec![realtime_sideband_connection(vec![
+            vec![
+                session_updated("sess_v2_steering_ack"),
+                v2_background_agent_tool_call("call_active", "start a task"),
+                v2_background_agent_tool_call("call_steer", "steer the active task"),
+            ],
+            vec![],
+            vec![],
+            vec![],
+            vec![],
+        ])]),
+    )
+    .await?;
+
+    let started = harness.start_webrtc_realtime("v=offer\r\n").await?;
+    assert_eq!(started.started.version, RealtimeConversationVersion::V2);
+    assert_v2_session_update(&harness.sideband_outbound_request(/*request_index*/ 0).await)?;
+    let turn_started = harness
+        .read_notification::<TurnStartedNotification>("turn/started")
+        .await?;
+    assert_eq!(turn_started.thread_id, harness.thread_id);
+
+    // Phase 2: the second tool call happens while `call_active` is still
+    // running, so app-server sends a steering acknowledgement as a function-call
+    // output for the second call.
+    assert_v2_function_call_output(
+        &harness.sideband_outbound_request(/*request_index*/ 1).await,
+        "call_steer",
+        V2_STEERING_ACKNOWLEDGEMENT,
+    );
+
+    // Phase 3: realtime needs a `response.create` after the steering
+    // acknowledgement so it can surface that acknowledgement to the user.
+    assert_v2_response_create(&harness.sideband_outbound_request(/*request_index*/ 2).await);
+
+    // Phase 4: release the gate (the send result is discarded). Codex should then
+    // continue the same run with the steering text in the follow-up Responses
+    // request, proving realtime did not merely acknowledge and drop it.
+    let _ = gate_completed_tx.send(());
+    let turn_completed = harness
+        .read_notification::<TurnCompletedNotification>("turn/completed")
+        .await?;
+    assert_eq!(turn_completed.thread_id, harness.thread_id);
+
+    let requests = harness.main_loop_responses_requests().await?;
+    assert_eq!(requests.len(), 2);
+    assert!(
+        response_request_contains_text(&requests[1], "steer the active task"),
+        "follow-up Responses request should contain steering prompt: {}",
+        requests[1]
+    );
+
    harness.shutdown().await;
    Ok(())
 }
@@ -1714,6 +1975,32 @@ fn assert_v2_progress_update(request: &Value, expected_text: &str) {
    );
 }

+/// Asserts `request` is exactly the `conversation.item.create` frame for a
+/// user message whose single `input_text` part carries `expected_text`.
+fn assert_v2_user_text_item(request: &Value, expected_text: &str) {
+    let expected = json!({
+        "type": "conversation.item.create",
+        "item": {
+            "type": "message",
+            "role": "user",
+            "content": [{
+                "type": "input_text",
+                "text": expected_text
+            }]
+        }
+    });
+    assert_eq!(request, &expected);
+}
+
+/// Asserts `request` is exactly `{"type": "response.create"}`.
+///
+/// The comparison is whole-value equality, so a frame carrying any additional
+/// field fails the assertion.
+fn assert_v2_response_create(request: &Value) {
+    let expected = json!({ "type": "response.create" });
+    assert_eq!(request, &expected);
+}
+
 fn assert_v1_session_update(request: &Value) -> Result<()> {
    assert_eq!(request["type"].as_str(), Some("session.update"));
    assert_eq!(request["session"]["type"].as_str(), Some("quicksilver"));