[stack 4/4] Reduce realtime self-interruptions during playback (#14827)

## Stack Position
4/4. Top-of-stack sibling built on #14830.

## Base
- #14830

## Sibling
- #14829

## Scope
- Gate low-level mic chunks while speaker playback is active, while
still permitting spoken barge-in.

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-16 22:19:51 -07:00
committed by GitHub
parent 79f476e47d
commit 32e4a5d5d9
3 changed files with 227 additions and 49 deletions

View File

@@ -5,6 +5,8 @@ use codex_protocol::protocol::RealtimeConversationClosedEvent;
use codex_protocol::protocol::RealtimeConversationRealtimeEvent;
use codex_protocol::protocol::RealtimeConversationStartedEvent;
use codex_protocol::protocol::RealtimeEvent;
#[cfg(not(target_os = "linux"))]
use std::sync::atomic::AtomicUsize;
const REALTIME_CONVERSATION_PROMPT: &str = "You are in a realtime voice conversation in the Codex TUI. Respond conversationally and concisely.";
@@ -30,6 +32,10 @@ pub(super) struct RealtimeConversationUiState {
capture: Option<crate::voice::VoiceCapture>,
#[cfg(not(target_os = "linux"))]
audio_player: Option<crate::voice::RealtimeAudioPlayer>,
#[cfg(not(target_os = "linux"))]
// Shared queue depth lets capture suppress echoed speaker audio without
// taking the playback queue lock from the input callback.
playback_queued_samples: Arc<AtomicUsize>,
}
impl RealtimeConversationUiState {
@@ -192,16 +198,15 @@ impl ChatWidget {
None
}
fn realtime_footer_hint_items() -> Vec<(String, String)> {
vec![("/realtime".to_string(), "stop live voice".to_string())]
}
pub(super) fn start_realtime_conversation(&mut self) {
self.realtime_conversation.phase = RealtimeConversationPhase::Starting;
self.realtime_conversation.requested_close = false;
self.realtime_conversation.session_id = None;
self.realtime_conversation.warned_audio_only_submission = false;
self.set_footer_hint_override(Some(Self::realtime_footer_hint_items()));
self.set_footer_hint_override(Some(vec![(
"/realtime".to_string(),
"stop live voice".to_string(),
)]));
self.submit_op(Op::RealtimeConversationStart(ConversationStartParams {
prompt: REALTIME_CONVERSATION_PROMPT.to_string(),
session_id: None,
@@ -251,7 +256,10 @@ impl ChatWidget {
self.realtime_conversation.phase = RealtimeConversationPhase::Active;
self.realtime_conversation.session_id = ev.session_id;
self.realtime_conversation.warned_audio_only_submission = false;
self.set_footer_hint_override(Some(Self::realtime_footer_hint_items()));
self.set_footer_hint_override(Some(vec![(
"/realtime".to_string(),
"stop live voice".to_string(),
)]));
self.start_realtime_local_audio();
self.request_redraw();
}
@@ -264,11 +272,17 @@ impl ChatWidget {
RealtimeEvent::SessionUpdated { session_id, .. } => {
self.realtime_conversation.session_id = Some(session_id);
}
RealtimeEvent::InputAudioSpeechStarted(_) => {}
RealtimeEvent::InputAudioSpeechStarted(_) | RealtimeEvent::ResponseCancelled(_) => {
#[cfg(not(target_os = "linux"))]
if let Some(player) = &self.realtime_conversation.audio_player {
// Once the server detects user speech or the current response is cancelled,
// any buffered assistant audio is stale and should stop gating mic input.
player.clear();
}
}
RealtimeEvent::InputTranscriptDelta(_) => {}
RealtimeEvent::OutputTranscriptDelta(_) => {}
RealtimeEvent::AudioOut(frame) => self.enqueue_realtime_audio_out(&frame),
RealtimeEvent::ResponseCancelled(_) => {}
RealtimeEvent::ConversationItemAdded(_item) => {}
RealtimeEvent::ConversationItemDone { .. } => {}
RealtimeEvent::HandoffRequested(_) => {}
@@ -296,8 +310,11 @@ impl ChatWidget {
#[cfg(not(target_os = "linux"))]
{
if self.realtime_conversation.audio_player.is_none() {
self.realtime_conversation.audio_player =
crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
self.realtime_conversation.audio_player = crate::voice::RealtimeAudioPlayer::start(
&self.config,
Arc::clone(&self.realtime_conversation.playback_queued_samples),
)
.ok();
}
if let Some(player) = &self.realtime_conversation.audio_player
&& let Err(err) = player.enqueue_frame(frame)
@@ -324,6 +341,7 @@ impl ChatWidget {
let capture = match crate::voice::VoiceCapture::start_realtime(
&self.config,
self.app_event_tx.clone(),
Arc::clone(&self.realtime_conversation.playback_queued_samples),
) {
Ok(capture) => capture,
Err(err) => {
@@ -342,8 +360,11 @@ impl ChatWidget {
self.realtime_conversation.capture_stop_flag = Some(stop_flag.clone());
self.realtime_conversation.capture = Some(capture);
if self.realtime_conversation.audio_player.is_none() {
self.realtime_conversation.audio_player =
crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
self.realtime_conversation.audio_player = crate::voice::RealtimeAudioPlayer::start(
&self.config,
Arc::clone(&self.realtime_conversation.playback_queued_samples),
)
.ok();
}
std::thread::spawn(move || {
@@ -381,7 +402,10 @@ impl ChatWidget {
}
RealtimeAudioDeviceKind::Speaker => {
self.stop_realtime_speaker();
match crate::voice::RealtimeAudioPlayer::start(&self.config) {
match crate::voice::RealtimeAudioPlayer::start(
&self.config,
Arc::clone(&self.realtime_conversation.playback_queued_samples),
) {
Ok(player) => {
self.realtime_conversation.audio_player = Some(player);
}