mirror of
https://github.com/openai/codex.git
synced 2026-04-30 09:26:44 +00:00
Update realtime handoff transcript handling (#18597)
## Summary

This PR aims to improve integration between the realtime model and the Codex agent by sharing more context between them. In particular, we now share full realtime conversation transcript deltas in addition to the delegation message.

`realtime_conversation.rs` now turns a handoff into:

```
<realtime_delegation>
  <input>...</input>
  <transcript_delta>...</transcript_delta>
</realtime_delegation>
```

## Implementation notes

The transcript is accumulated in the realtime websocket layer as parsed realtime events arrive. When a background-agent handoff is requested, the current transcript snapshot is copied onto the handoff event and then serialized by `realtime_conversation.rs` into the hidden realtime delegation envelope that Codex receives as user-turn context.

For Realtime V2, the session now explicitly enables input audio transcription, and the parser handles the relevant input/output transcript completion events so the snapshot includes both user speech and realtime model responses. The delegation `<input>` remains the actual handoff request, while `<transcript_delta>` carries the surrounding conversation history for context.

Reviewers should note that the transcript payload is intended for Codex context sharing, not UI rendering. The realtime delegation envelope should stay hidden from the user-facing transcript surface, while still being included in the background-agent turn so Codex can answer with the same conversational context the realtime model had.
This commit is contained in:
@@ -12,6 +12,9 @@ use codex_api::RealtimeSessionMode;
|
||||
use codex_api::RealtimeWebsocketClient;
|
||||
use codex_api::RetryConfig;
|
||||
use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use codex_protocol::protocol::RealtimeTranscriptDelta;
|
||||
use codex_protocol::protocol::RealtimeTranscriptDone;
|
||||
use codex_protocol::protocol::RealtimeTranscriptEntry;
|
||||
use codex_protocol::protocol::RealtimeVoice;
|
||||
use futures::SinkExt;
|
||||
use futures::StreamExt;
|
||||
@@ -493,6 +496,46 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() {
|
||||
let first_json: Value = serde_json::from_str(&first).expect("json");
|
||||
assert_eq!(first_json["type"], "session.update");
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "conversation.item.input_audio_transcription.completed",
|
||||
"transcript": "delegate now"
|
||||
})
|
||||
.to_string()
|
||||
.into(),
|
||||
))
|
||||
.await
|
||||
.expect("send input transcript");
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "response.output_audio_transcript.delta",
|
||||
"delta": "secret context"
|
||||
})
|
||||
.to_string()
|
||||
.into(),
|
||||
))
|
||||
.await
|
||||
.expect("send output transcript");
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "conversation.item.created",
|
||||
"item": {
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "input_text",
|
||||
"text": "<realtime_collaboration_update><voice_policy>silent_delegate</voice_policy></realtime_collaboration_update>"
|
||||
}]
|
||||
}
|
||||
})
|
||||
.to_string()
|
||||
.into(),
|
||||
))
|
||||
.await
|
||||
.expect("send control item echo");
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "conversation.item.done",
|
||||
@@ -530,6 +573,37 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() {
|
||||
.await
|
||||
.expect("connect");
|
||||
|
||||
let event = connection
|
||||
.next_event()
|
||||
.await
|
||||
.expect("next event")
|
||||
.expect("event");
|
||||
assert_eq!(
|
||||
event,
|
||||
RealtimeEvent::InputTranscriptDone(RealtimeTranscriptDone {
|
||||
text: "delegate now".to_string()
|
||||
})
|
||||
);
|
||||
|
||||
let event = connection
|
||||
.next_event()
|
||||
.await
|
||||
.expect("next event")
|
||||
.expect("event");
|
||||
assert_eq!(
|
||||
event,
|
||||
RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta {
|
||||
delta: "secret context".to_string()
|
||||
})
|
||||
);
|
||||
|
||||
let event = connection
|
||||
.next_event()
|
||||
.await
|
||||
.expect("next event")
|
||||
.expect("event");
|
||||
assert!(matches!(event, RealtimeEvent::ConversationItemAdded(_)));
|
||||
|
||||
let event = connection
|
||||
.next_event()
|
||||
.await
|
||||
@@ -541,7 +615,16 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() {
|
||||
handoff_id: "call_123".to_string(),
|
||||
item_id: "item_123".to_string(),
|
||||
input_transcript: "delegate now".to_string(),
|
||||
active_transcript: Vec::new(),
|
||||
active_transcript: vec![
|
||||
RealtimeTranscriptEntry {
|
||||
role: "user".to_string(),
|
||||
text: "delegate now".to_string(),
|
||||
},
|
||||
RealtimeTranscriptEntry {
|
||||
role: "assistant".to_string(),
|
||||
text: "secret context".to_string(),
|
||||
},
|
||||
],
|
||||
})
|
||||
);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user