Add realtime output modality and transcript events (#17701)

- Add outputModality to thread/realtime/start and wire text/audio output
  selection through app-server, core, API, and TUI.
- Rename the realtime transcript delta notification and add a separate
  transcript done notification that forwards final text from item done
  without correlating it with deltas.
This commit is contained in:
Ahmed Ibrahim
2026-04-14 00:13:13 -07:00
committed by GitHub
parent a6b03a22cc
commit 2f6fc7c137
38 changed files with 711 additions and 77 deletions

View File

@@ -21,6 +21,7 @@ use codex_protocol::protocol::RealtimeAudioFrame;
use codex_protocol::protocol::RealtimeConversationRealtimeEvent;
use codex_protocol::protocol::RealtimeConversationVersion;
use codex_protocol::protocol::RealtimeEvent;
use codex_protocol::protocol::RealtimeOutputModality;
use codex_protocol::protocol::RealtimeVoice;
use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::SessionSource;
@@ -248,6 +249,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -381,6 +383,7 @@ async fn conversation_start_defaults_to_v2_and_gpt_realtime_1_5() -> Result<()>
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -464,6 +467,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: Some(ConversationStartTransport::Webrtc {
@@ -601,6 +605,7 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() ->
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -662,6 +667,7 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -747,6 +753,7 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -790,6 +797,7 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -880,6 +888,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("old".to_string())),
session_id: Some("conv_old".to_string()),
transport: None,
@@ -898,6 +907,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("new".to_string())),
session_id: Some("conv_new".to_string()),
transport: None,
@@ -987,6 +997,7 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1044,6 +1055,7 @@ async fn conversation_uses_default_realtime_backend_prompt() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: None,
session_id: None,
transport: None,
@@ -1109,6 +1121,7 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu
] {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt,
session_id: None,
transport: None,
@@ -1167,6 +1180,7 @@ async fn conversation_uses_explicit_start_voice() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1217,6 +1231,7 @@ async fn conversation_uses_configured_realtime_voice() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1255,6 +1270,7 @@ async fn conversation_rejects_voice_for_wrong_realtime_version() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1298,6 +1314,7 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() ->
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("prompt from op".to_string())),
session_id: None,
transport: None,
@@ -1363,6 +1380,7 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() -
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("prompt from op".to_string())),
session_id: None,
transport: None,
@@ -1426,6 +1444,7 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() ->
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("prompt from op".to_string())),
session_id: None,
transport: None,
@@ -1482,6 +1501,7 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1593,6 +1613,7 @@ async fn conversation_startup_context_current_thread_selects_many_turns_by_budge
codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1697,6 +1718,7 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<()
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1751,6 +1773,7 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() ->
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1826,6 +1849,7 @@ async fn conversation_user_text_turn_is_sent_to_realtime_when_active() -> Result
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -1948,6 +1972,7 @@ async fn conversation_user_text_turn_is_capped_when_mirrored_to_realtime() -> Re
// active WebSocket session.
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2075,6 +2100,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2204,6 +2230,7 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() ->
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2348,6 +2375,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2445,6 +2473,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2540,6 +2569,7 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() -
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2642,6 +2672,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio(
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2757,6 +2788,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -2902,6 +2934,7 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -3032,6 +3065,7 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> {
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,
@@ -3183,6 +3217,7 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio()
test.codex
.submit(Op::RealtimeConversationStart(ConversationStartParams {
output_modality: RealtimeOutputModality::Audio,
prompt: Some(Some("backend prompt".to_string())),
session_id: None,
transport: None,