diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock index e079e3af0e..aca09dedb8 100644 --- a/MODULE.bazel.lock +++ b/MODULE.bazel.lock @@ -1060,6 +1060,7 @@ "home_0.5.12": "{\"dependencies\":[{\"features\":[\"Win32_Foundation\",\"Win32_UI_Shell\",\"Win32_System_Com\"],\"name\":\"windows-sys\",\"req\":\"^0.61\",\"target\":\"cfg(windows)\"}],\"features\":{}}", "home_0.5.9": "{\"dependencies\":[{\"features\":[\"Win32_Foundation\",\"Win32_UI_Shell\",\"Win32_System_Com\"],\"name\":\"windows-sys\",\"req\":\"^0.52\",\"target\":\"cfg(windows)\"}],\"features\":{}}", "hostname_0.4.2": "{\"dependencies\":[{\"name\":\"cfg-if\",\"req\":\"^1\"},{\"name\":\"libc\",\"req\":\"^0.2\",\"target\":\"cfg(any(unix, target_os = \\\"redox\\\"))\"},{\"kind\":\"dev\",\"name\":\"similar-asserts\",\"req\":\"^1.6.1\"},{\"kind\":\"dev\",\"name\":\"version-sync\",\"req\":\"^0.9\"},{\"kind\":\"dev\",\"name\":\"windows-bindgen\",\"req\":\"^0.65\"},{\"name\":\"windows-link\",\"req\":\"^0.2\",\"target\":\"cfg(target_os = \\\"windows\\\")\"}],\"features\":{\"default\":[],\"set\":[]}}", + "hound_3.5.1": "{\"dependencies\":[{\"kind\":\"dev\",\"name\":\"cpal\",\"req\":\"^0.2.12\"}],\"features\":{}}", "http-body-util_0.1.3": "{\"dependencies\":[{\"name\":\"bytes\",\"req\":\"^1\"},{\"default_features\":false,\"name\":\"futures-core\",\"req\":\"^0.3\"},{\"default_features\":false,\"kind\":\"dev\",\"name\":\"futures-util\",\"req\":\"^0.3\"},{\"name\":\"http\",\"req\":\"^1\"},{\"name\":\"http-body\",\"req\":\"^1\"},{\"name\":\"pin-project-lite\",\"req\":\"^0.2\"},{\"features\":[\"sync\"],\"name\":\"tokio\",\"optional\":true,\"req\":\"^1\"},{\"features\":[\"macros\",\"rt\",\"sync\",\"rt-multi-thread\"],\"kind\":\"dev\",\"name\":\"tokio\",\"req\":\"^1\"}],\"features\":{\"channel\":[\"dep:tokio\"],\"default\":[],\"full\":[\"channel\"]}}", "http-body_0.4.6": 
"{\"dependencies\":[{\"name\":\"bytes\",\"req\":\"^1\"},{\"name\":\"http\",\"req\":\"^0.2\"},{\"name\":\"pin-project-lite\",\"req\":\"^0.2\"},{\"features\":[\"macros\",\"rt\"],\"kind\":\"dev\",\"name\":\"tokio\",\"req\":\"^1\"}],\"features\":{}}", "http-body_1.0.1": "{\"dependencies\":[{\"name\":\"bytes\",\"req\":\"^1\"},{\"name\":\"http\",\"req\":\"^1\"}],\"features\":{}}", diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 2bfcbf88c7..ad79780465 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -3563,6 +3563,7 @@ dependencies = [ "codex-arg0", "codex-chatgpt", "codex-cli", + "codex-client", "codex-cloud-requirements", "codex-config", "codex-connectors", @@ -3607,6 +3608,7 @@ dependencies = [ "diffy", "dirs", "dunce", + "hound", "image", "insta", "itertools 0.14.0", @@ -7022,6 +7024,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "hound" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f" + [[package]] name = "http" version = "0.2.12" @@ -10642,6 +10650,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "native-tls", "percent-encoding", "pin-project-lite", diff --git a/codex-rs/config/src/config_toml.rs b/codex-rs/config/src/config_toml.rs index b8702072b2..6bf18fb45a 100644 --- a/codex-rs/config/src/config_toml.rs +++ b/codex-rs/config/src/config_toml.rs @@ -292,6 +292,10 @@ pub struct ConfigToml { #[serde(default)] pub audio: Option, + /// Delay before holding Space on a non-empty composer switches into voice + /// transcription instead of inserting a literal space. + pub voice_transcription_space_hold_delay_ms: Option, + /// Experimental / do not use. 
Overrides only the realtime conversation /// websocket transport base URL (the `Op::RealtimeConversation` /// `/v1/realtime` diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index 7bab6c6bf2..6648451162 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -590,6 +590,9 @@ "use_linux_sandbox_bwrap": { "type": "boolean" }, + "voice_transcription": { + "type": "boolean" + }, "web_search": { "type": "boolean" }, @@ -3633,6 +3636,9 @@ "use_linux_sandbox_bwrap": { "type": "boolean" }, + "voice_transcription": { + "type": "boolean" + }, "web_search": { "type": "boolean" }, @@ -4043,6 +4049,12 @@ ], "description": "Collection of settings that are specific to the TUI." }, + "voice_transcription_space_hold_delay_ms": { + "description": "Delay before holding Space on a non-empty composer switches into voice transcription instead of inserting a literal space.", + "format": "uint64", + "minimum": 0.0, + "type": "integer" + }, "watchdog_interval_s": { "description": "Watchdog polling interval in seconds.", "format": "int64", diff --git a/codex-rs/core/src/config/config_tests.rs b/codex-rs/core/src/config/config_tests.rs index c002921043..a9c949b757 100644 --- a/codex-rs/core/src/config/config_tests.rs +++ b/codex-rs/core/src/config/config_tests.rs @@ -6045,6 +6045,8 @@ async fn test_precedence_fixture_with_o3_profile() -> std::io::Result<()> { personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), realtime_audio: RealtimeAudioConfig::default(), + voice_transcription_space_hold_delay_ms: + DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS, experimental_realtime_start_instructions: None, experimental_realtime_ws_base_url: None, experimental_realtime_ws_model: None, @@ -6244,6 +6246,7 @@ async fn test_precedence_fixture_with_gpt3_profile() -> std::io::Result<()> { personality: Some(Personality::Pragmatic), chatgpt_base_url: 
"https://chatgpt.com/backend-api/".to_string(), realtime_audio: RealtimeAudioConfig::default(), + voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS, experimental_realtime_start_instructions: None, experimental_realtime_ws_base_url: None, experimental_realtime_ws_model: None, @@ -6397,6 +6400,7 @@ async fn test_precedence_fixture_with_zdr_profile() -> std::io::Result<()> { personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), realtime_audio: RealtimeAudioConfig::default(), + voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS, experimental_realtime_start_instructions: None, experimental_realtime_ws_base_url: None, experimental_realtime_ws_model: None, @@ -6535,6 +6539,7 @@ async fn test_precedence_fixture_with_gpt5_profile() -> std::io::Result<()> { personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), realtime_audio: RealtimeAudioConfig::default(), + voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS, experimental_realtime_start_instructions: None, experimental_realtime_ws_base_url: None, experimental_realtime_ws_model: None, @@ -8677,6 +8682,29 @@ speaker = "Desk Speakers" Ok(()) } +#[tokio::test] +async fn voice_transcription_space_hold_delay_loads_from_config_toml() -> std::io::Result<()> { + let cfg: ConfigToml = toml::from_str( + r#" +voice_transcription_space_hold_delay_ms = 250 +"#, + ) + .expect("TOML deserialization should succeed"); + + assert_eq!(cfg.voice_transcription_space_hold_delay_ms, Some(250)); + + let codex_home = TempDir::new()?; + let config = Config::load_from_base_config_with_overrides( + cfg, + ConfigOverrides::default(), + codex_home.abs(), + ) + .await?; + + assert_eq!(config.voice_transcription_space_hold_delay_ms, 250); + Ok(()) +} + #[derive(Deserialize, Debug, PartialEq)] struct TuiTomlTest { #[serde(default, 
flatten)] diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index dee8b09d30..5abef4fb52 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -168,6 +168,8 @@ pub(crate) const DEFAULT_AGENT_MAX_DEPTH: i32 = 1; pub(crate) const DEFAULT_AGENT_JOB_MAX_RUNTIME_SECONDS: Option = None; const LOCAL_DEV_BUILD_VERSION: &str = "0.0.0"; pub(crate) const DEFAULT_WATCHDOG_INTERVAL_S: i64 = 10; +/// Default delay before holding Space on a non-empty composer switches into voice transcription. +pub const DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS: u64 = 1_000; pub const CONFIG_TOML_FILE: &str = "config.toml"; @@ -672,6 +674,10 @@ pub struct Config { /// Machine-local realtime audio device preferences used by realtime voice. pub realtime_audio: RealtimeAudioConfig, + /// Delay before holding Space on a non-empty composer switches into voice + /// transcription instead of inserting a literal space. + pub voice_transcription_space_hold_delay_ms: u64, + /// Experimental / do not use. Overrides only the realtime conversation /// websocket transport base URL (the `Op::RealtimeConversation` /// `/v1/realtime` @@ -2804,6 +2810,9 @@ impl Config { microphone: audio.microphone, speaker: audio.speaker, }), + voice_transcription_space_hold_delay_ms: cfg + .voice_transcription_space_hold_delay_ms + .unwrap_or(DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS), experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url, experimental_realtime_ws_model: cfg.experimental_realtime_ws_model, realtime: cfg diff --git a/codex-rs/features/src/lib.rs b/codex-rs/features/src/lib.rs index 9fc287e8fb..ee9bb0310e 100644 --- a/codex-rs/features/src/lib.rs +++ b/codex-rs/features/src/lib.rs @@ -209,6 +209,8 @@ pub enum Feature { Artifact, /// Enable Fast mode selection in the TUI and request layer. FastMode, + /// Enable voice transcription in the TUI composer. 
+ VoiceTranscription, /// Enable experimental realtime voice conversation mode in the TUI. RealtimeConversation, /// Connect app-server to the ChatGPT remote control service. @@ -1014,6 +1016,12 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::Stable, default_enabled: true, }, + FeatureSpec { + id: Feature::VoiceTranscription, + key: "voice_transcription", + stage: Stage::UnderDevelopment, + default_enabled: false, + }, FeatureSpec { id: Feature::RealtimeConversation, key: "realtime_conversation", diff --git a/codex-rs/tui/Cargo.toml b/codex-rs/tui/Cargo.toml index 300449a414..8fc4e04b4e 100644 --- a/codex-rs/tui/Cargo.toml +++ b/codex-rs/tui/Cargo.toml @@ -31,6 +31,7 @@ codex-app-server-protocol = { workspace = true } codex-arg0 = { workspace = true } codex-install-context = { workspace = true } codex-chatgpt = { workspace = true } +codex-client = { workspace = true } codex-cloud-requirements = { workspace = true } codex-config = { workspace = true } codex-connectors = { workspace = true } @@ -83,7 +84,7 @@ ratatui = { workspace = true, features = [ ] } ratatui-macros = { workspace = true } regex-lite = { workspace = true } -reqwest = { workspace = true, features = ["json"] } +reqwest = { workspace = true, features = ["json", "multipart"] } rmcp = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = ["preserve_order"] } @@ -122,6 +123,7 @@ tokio-util = { workspace = true, features = ["time"] } [target.'cfg(not(target_os = "linux"))'.dependencies] cpal = "0.15" +hound = "3.5" [target.'cfg(unix)'.dependencies] libc = { workspace = true } diff --git a/codex-rs/tui/src/app/event_dispatch.rs b/codex-rs/tui/src/app/event_dispatch.rs index d98888ba04..8c551e71d5 100644 --- a/codex-rs/tui/src/app/event_dispatch.rs +++ b/codex-rs/tui/src/app/event_dispatch.rs @@ -1723,6 +1723,25 @@ impl App { } }, #[cfg(not(target_os = "linux"))] + AppEvent::TranscriptionComplete { id, text } => { + 
self.chat_widget.replace_transcription(&id, &text); + tui.frame_requester().schedule_frame(); + } + #[cfg(not(target_os = "linux"))] + AppEvent::TranscriptionFailed { id, error } => { + self.chat_widget.fail_transcription(&id, &error); + } + #[cfg(not(target_os = "linux"))] + AppEvent::TranscriptionRetrying { + id, + attempt, + max_attempts, + } => { + self.chat_widget + .show_transcription_retrying(&id, attempt, max_attempts); + tui.frame_requester().schedule_frame(); + } + #[cfg(not(target_os = "linux"))] AppEvent::UpdateRecordingMeter { id, text } => { // Update in place to preserve the element id for subsequent frames. let updated = self.chat_widget.update_recording_meter_in_place(&id, &text); diff --git a/codex-rs/tui/src/app_event.rs b/codex-rs/tui/src/app_event.rs index 97fb26b7fc..edbcab7d40 100644 --- a/codex-rs/tui/src/app_event.rs +++ b/codex-rs/tui/src/app_event.rs @@ -729,6 +729,29 @@ pub(crate) enum AppEvent { text: String, }, + /// Voice transcription finished for the given placeholder id. + #[cfg(not(target_os = "linux"))] + TranscriptionComplete { + id: String, + text: String, + }, + + /// Voice transcription failed; remove the placeholder identified by `id`. + #[cfg(not(target_os = "linux"))] + TranscriptionFailed { + id: String, + #[allow(dead_code)] + error: String, + }, + + /// Voice transcription timed out or hit a transient failure and is retrying. + #[cfg(not(target_os = "linux"))] + TranscriptionRetrying { + id: String, + attempt: usize, + max_attempts: usize, + }, + /// Open the branch picker option from the review popup. OpenReviewBranchPicker(PathBuf), diff --git a/codex-rs/tui/src/bottom_pane/chat_composer.rs b/codex-rs/tui/src/bottom_pane/chat_composer.rs index c879743425..6897988d4a 100644 --- a/codex-rs/tui/src/bottom_pane/chat_composer.rs +++ b/codex-rs/tui/src/bottom_pane/chat_composer.rs @@ -41,6 +41,10 @@ //! `Enter` submits immediately. `Tab` requests queuing while a task is running; if no task is //! 
running, `Tab` submits just like Enter so input is never dropped. //! `Tab` does not submit when entering a `!` shell command. +//! When a voice transcription placeholder is still resolving, `Enter`/`Tab` records the submit or +//! queue intent and waits for the transcription result before running the normal submission path. +//! While that intent is pending, the draft is treated as committed and further key edits are +//! ignored. //! //! On submit/queue paths, the composer: //! @@ -121,11 +125,22 @@ //! overall state machine, since it affects which transitions are even possible from a given UI //! state. //! +//! # Voice Hold-To-Talk Without Key Release +//! +//! On terminals that do not report `KeyEventKind::Release`, space hold-to-talk uses repeated +//! space key events as "still held" evidence: +//! +//! - For pending holds (non-empty composer), if the configured timeout elapses without any +//! repeated space event, we treat the key as a normal typed space. +//! - If repeated space events are seen before timeout, we proceed with hold-to-talk. +//! - While recording, repeated space events keep the recording alive; if they stop for a short +//! window, we stop and transcribe. 
use crate::bottom_pane::footer::goal_status_indicator_line; use crate::bottom_pane::footer::mode_indicator_line; use crate::key_hint; use crate::key_hint::KeyBinding; use crate::key_hint::has_ctrl_or_alt; +use crate::legacy_core::config::DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS; use crate::line_truncation::truncate_line_with_ellipsis_if_overflow; use crate::ui_consts::FOOTER_INDENT_COLS; use crossterm::event::KeyCode; @@ -230,12 +245,21 @@ use std::collections::HashSet; use std::collections::VecDeque; use std::ops::Range; use std::path::PathBuf; +use std::sync::Arc; +#[cfg(not(target_os = "linux"))] +use std::sync::Mutex; +use std::sync::atomic::AtomicBool; +#[cfg(not(target_os = "linux"))] +use std::sync::atomic::Ordering; +#[cfg(not(target_os = "linux"))] +use std::thread; use std::time::Duration; use std::time::Instant; #[cfg(test)] use ratatui::style::Color; - +#[cfg(not(target_os = "linux"))] +use tokio::runtime::Handle; /// If the pasted content exceeds this number of characters, replace it with a /// placeholder in the UI. const LARGE_PASTE_CHAR_THRESHOLD: usize = 1000; @@ -297,6 +321,8 @@ pub(crate) struct ChatComposerConfig { pub(crate) slash_commands_enabled: bool, /// Whether pasting a file path can attach local images. pub(crate) image_paste_enabled: bool, + /// Delay before holding Space on a non-empty draft switches into voice capture. 
+ pub(crate) voice_transcription_space_hold_delay_ms: u64, } impl Default for ChatComposerConfig { @@ -305,6 +331,8 @@ impl Default for ChatComposerConfig { popups_enabled: true, slash_commands_enabled: true, image_paste_enabled: true, + voice_transcription_space_hold_delay_ms: + DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS, } } } @@ -319,6 +347,63 @@ impl ChatComposerConfig { popups_enabled: false, slash_commands_enabled: false, image_paste_enabled: false, + voice_transcription_space_hold_delay_ms: + DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS, + } + } +} + +#[derive(Default)] +struct VoiceState { + transcription_enabled: bool, + space_hold_started_at: Option, + space_hold_element_id: Option, + space_hold_trigger: Option>, + key_release_supported: bool, + space_hold_repeat_seen: bool, + #[cfg(not(target_os = "linux"))] + voice: Option, + #[cfg(not(target_os = "linux"))] + recording_placeholder_id: Option, + #[cfg(not(target_os = "linux"))] + space_recording_started_at: Option, + #[cfg(not(target_os = "linux"))] + space_recording_last_repeat_at: Option, +} + +#[cfg(not(target_os = "linux"))] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum TranscriptionSubmissionDisposition { + /// Submit as the next user turn once transcription completes. + Submit, + /// Queue for the end of the active turn once transcription completes. + Queue, +} + +#[cfg(not(target_os = "linux"))] +#[derive(Clone, Debug, PartialEq, Eq)] +struct PendingTranscriptionSubmission { + /// Placeholder whose final text must arrive before submission can continue. + placeholder_id: String, + disposition: TranscriptionSubmissionDisposition, +} + +#[cfg(not(target_os = "linux"))] +#[derive(Clone, Debug, PartialEq, Eq)] +enum RecordingStopOutcome { + /// No active recording was available to stop. + NoRecording, + /// Recording ended without starting transcription, for example because it was too short. + NoTranscription, + /// Transcription is now running for the retained placeholder. 
+ Transcribing { placeholder_id: String }, +} + +impl VoiceState { + fn new(key_release_supported: bool) -> Self { + Self { + key_release_supported, + ..Default::default() } } } @@ -344,6 +429,12 @@ pub(crate) struct ChatComposer { /// `[Image #M+1]..[Image #N]`, where `M` is the number of remote images. attached_images: Vec, placeholder_text: String, + voice_state: VoiceState, + #[cfg(not(target_os = "linux"))] + pending_transcription_submission: Option, + // Spinner control flags keyed by placeholder id; set to true to stop. + #[cfg(not(target_os = "linux"))] + spinner_stop_flags: HashMap>, is_task_running: bool, /// When false, the composer is temporarily read-only (e.g. during sandbox setup). input_enabled: bool, @@ -545,6 +636,11 @@ impl ChatComposer { frame_requester: None, attached_images: Vec::new(), placeholder_text, + voice_state: VoiceState::new(enhanced_keys_supported), + #[cfg(not(target_os = "linux"))] + pending_transcription_submission: None, + #[cfg(not(target_os = "linux"))] + spinner_stop_flags: HashMap::new(), is_task_running: false, input_enabled: true, input_disabled_placeholder: None, @@ -747,6 +843,32 @@ impl ChatComposer { /// Compatibility shim for tests that still toggle the removed steer mode flag. 
#[cfg(test)] pub fn set_steer_enabled(&mut self, _enabled: bool) {} + + pub fn set_voice_transcription_enabled(&mut self, enabled: bool) { + self.voice_state.transcription_enabled = enabled; + if !enabled { + self.voice_state.space_hold_started_at = None; + if let Some(id) = self.voice_state.space_hold_element_id.take() { + let _ = self.textarea.replace_element_by_id(&id, " "); + } + self.voice_state.space_hold_trigger = None; + self.voice_state.space_hold_repeat_seen = false; + #[cfg(not(target_os = "linux"))] + { + self.pending_transcription_submission = None; + } + } + } + + pub(crate) fn set_voice_transcription_space_hold_delay_ms(&mut self, delay_ms: u64) { + self.config.voice_transcription_space_hold_delay_ms = delay_ms; + } + + #[cfg(not(target_os = "linux"))] + fn voice_transcription_enabled(&self) -> bool { + self.voice_state.transcription_enabled && cfg!(not(target_os = "linux")) + } + /// Centralized feature gating keeps config checks out of call sites. fn popups_enabled(&self) -> bool { self.config.popups_enabled @@ -969,6 +1091,10 @@ impl ChatComposer { /// remote images). Cursor is placed at the end after rebuilding elements. pub(crate) fn apply_external_edit(&mut self, text: String) { self.pending_pastes.clear(); + #[cfg(not(target_os = "linux"))] + { + self.pending_transcription_submission = None; + } let (text, _) = self.imported_text_for_textarea(text, Vec::new()); // Count placeholder occurrences in the new text. @@ -1158,6 +1284,13 @@ impl ChatComposer { local_image_paths: Vec, mention_bindings: Vec, ) { + #[cfg(not(target_os = "linux"))] + self.stop_all_transcription_spinners(); + #[cfg(not(target_os = "linux"))] + { + self.pending_transcription_submission = None; + } + // Clear any existing content, placeholders, and attachments first. self.textarea.set_text_clearing_elements(""); self.is_bash_mode = false; @@ -1546,11 +1679,20 @@ impl ChatComposer { /// Handle a key event coming from the main UI. 
pub fn handle_key_event(&mut self, key_event: KeyEvent) -> (InputResult, bool) { + if matches!(key_event.kind, KeyEventKind::Release) { + self.voice_state.key_release_supported = true; + } + + if let Some(result) = self.handle_key_event_while_recording(key_event) { + return result; + } + if !self.input_enabled { return (InputResult::None, false); } - if matches!(key_event.kind, KeyEventKind::Release) { + #[cfg(not(target_os = "linux"))] + if self.pending_transcription_submission.is_some() { return (InputResult::None, false); } @@ -1562,6 +1704,29 @@ impl ChatComposer { return self.begin_history_search(); } + // Outside of recording, ignore all key releases globally except for Space, + // which is handled explicitly for hold-to-talk behavior below. + if matches!(key_event.kind, KeyEventKind::Release) + && !matches!(key_event.code, KeyCode::Char(' ')) + { + return (InputResult::None, false); + } + + if self.voice_state.space_hold_started_at.is_some() + && !matches!(key_event.code, KeyCode::Char(' ')) + { + self.voice_state.space_hold_started_at = None; + if let Some(id) = self.voice_state.space_hold_element_id.take() { + let _ = self.textarea.replace_element_by_id(&id, " "); + } + self.voice_state.space_hold_trigger = None; + self.voice_state.space_hold_repeat_seen = false; + } + + if let Some(result) = self.handle_voice_space_key_event(&key_event) { + return result; + } + let result = match &mut self.active_popup { ActivePopup::Command(_) => self.handle_key_event_with_slash_popup(key_event), ActivePopup::File(_) => self.handle_key_event_with_file_popup(key_event), @@ -2574,6 +2739,12 @@ impl ChatComposer { should_queue: bool, now: Instant, ) -> (InputResult, bool) { + #[cfg(not(target_os = "linux"))] + if self.defer_active_transcription_submission(Self::transcription_disposition(should_queue)) + { + return (InputResult::None, true); + } + if should_queue { let raw_text = self.textarea.text(); let defer_slash_validation = @@ -2596,7 +2767,6 @@ impl ChatComposer { 
true, ); } - return (InputResult::None, true); } // If the first line is a bare built-in slash command (no args), @@ -3046,6 +3216,150 @@ impl ChatComposer { } } + #[cfg(target_os = "linux")] + fn handle_voice_space_key_event( + &mut self, + _key_event: &KeyEvent, + ) -> Option<(InputResult, bool)> { + None + } + + #[cfg(not(target_os = "linux"))] + fn handle_voice_space_key_event( + &mut self, + key_event: &KeyEvent, + ) -> Option<(InputResult, bool)> { + if !self.voice_transcription_enabled() || !matches!(key_event.code, KeyCode::Char(' ')) { + return None; + } + match key_event.kind { + KeyEventKind::Press => { + if self.paste_burst.is_active() { + return None; + } + + // If textarea is empty, start recording immediately without inserting a space. + if self.textarea.text().is_empty() { + if self.start_recording_with_placeholder() { + return Some((InputResult::None, true)); + } + return None; + } + + // If a hold is already pending, swallow further press events to + // avoid inserting multiple spaces and resetting the timer on key repeat. + if self.voice_state.space_hold_started_at.is_some() { + if !self.voice_state.key_release_supported { + self.voice_state.space_hold_repeat_seen = true; + } + return Some((InputResult::None, false)); + } + + // Insert a named element that renders as a space so we can later + // remove it on timeout or convert it to a plain space on release. + let elem_id = self.next_id(); + self.textarea.insert_named_element(" ", elem_id.clone()); + + // Record pending hold metadata. + self.voice_state.space_hold_started_at = Some(Instant::now()); + self.voice_state.space_hold_element_id = Some(elem_id); + self.voice_state.space_hold_repeat_seen = false; + + // A delayed task flips this flag; `process_space_hold_trigger` polls it.
+ let flag = Arc::new(AtomicBool::new(false)); + let frame = self.frame_requester.clone(); + Self::schedule_space_hold_timer( + flag.clone(), + frame, + self.config.voice_transcription_space_hold_delay_ms, + ); + self.voice_state.space_hold_trigger = Some(flag); + + Some((InputResult::None, true)) + } + // Repeat while pending: swallowed; counts as still-held evidence without release events. + KeyEventKind::Repeat => { + // Swallow repeats while a hold is pending to avoid extra spaces. + if self.voice_state.space_hold_started_at.is_some() { + if !self.voice_state.key_release_supported { + self.voice_state.space_hold_repeat_seen = true; + } + return Some((InputResult::None, false)); + } + // Fallback: if no pending hold, treat as normal input. + None + } + // Space release: finish any pending hold as a typed space and consume the event. + KeyEventKind::Release => { + // If a hold is pending, convert the element to a plain space and clear state. + self.voice_state.space_hold_started_at = None; + if let Some(id) = self.voice_state.space_hold_element_id.take() { + let _ = self.textarea.replace_element_by_id(&id, " "); + } + self.voice_state.space_hold_trigger = None; + self.voice_state.space_hold_repeat_seen = false; + Some((InputResult::None, true)) + } + } + } + + #[cfg(target_os = "linux")] + fn handle_key_event_while_recording( + &mut self, + _key_event: KeyEvent, + ) -> Option<(InputResult, bool)> { + None + } + + #[cfg(not(target_os = "linux"))] + fn handle_key_event_while_recording( + &mut self, + key_event: KeyEvent, + ) -> Option<(InputResult, bool)> { + if self.voice_state.voice.is_some() { + let should_stop = if self.voice_state.key_release_supported { + match key_event.kind { + KeyEventKind::Release => matches!(key_event.code, KeyCode::Char(' ')), + KeyEventKind::Press | KeyEventKind::Repeat => { + !matches!(key_event.code, KeyCode::Char(' ')) + } + } + } else { + match key_event.kind { + KeyEventKind::Release => matches!(key_event.code, KeyCode::Char(' ')), +
KeyEventKind::Press | KeyEventKind::Repeat => { + if matches!(key_event.code, KeyCode::Char(' ')) { + self.voice_state.space_recording_last_repeat_at = Some(Instant::now()); + false + } else { + true + } + } + } + }; + + if should_stop { + let disposition = self.transcription_disposition_for_key(&key_event); + let outcome = self.stop_recording_and_start_transcription(); + let needs_redraw = !matches!(&outcome, RecordingStopOutcome::NoRecording); + if let (RecordingStopOutcome::Transcribing { placeholder_id }, Some(disposition)) = + (outcome, disposition) + { + self.pending_transcription_submission = Some(PendingTranscriptionSubmission { + placeholder_id, + disposition, + }); + } + return Some((InputResult::None, needs_redraw)); + } + + // Swallow non-stopping keys while recording. + return Some((InputResult::None, false)); + } + + None + } + fn is_bang_shell_command(&self) -> bool { self.current_text().trim_start().starts_with('!') } @@ -3877,6 +4191,40 @@ impl ChatComposer { } } + #[cfg(not(target_os = "linux"))] + pub(crate) fn is_recording(&self) -> bool { + self.voice_state.voice.is_some() + } + + #[cfg(not(target_os = "linux"))] + fn schedule_space_hold_timer( + flag: Arc, + frame: Option, + hold_delay_millis: u64, + ) { + if let Ok(handle) = Handle::try_current() { + let flag_clone = flag; + let frame_clone = frame; + handle.spawn(async move { + tokio::time::sleep(Duration::from_millis(hold_delay_millis)).await; + Self::complete_space_hold_timer(flag_clone, frame_clone); + }); + } else { + thread::spawn(move || { + thread::sleep(Duration::from_millis(hold_delay_millis)); + Self::complete_space_hold_timer(flag, frame); + }); + } + } + + #[cfg(not(target_os = "linux"))] + fn complete_space_hold_timer(flag: Arc, frame: Option) { + flag.store(true, Ordering::Relaxed); + if let Some(frame) = frame { + frame.schedule_frame(); + } + } + pub(crate) fn set_status_line(&mut self, status_line: Option>) -> bool { if self.status_line_value == status_line { return false; 
@@ -3934,10 +4282,364 @@ fn footer_insert_newline_key( #[cfg(not(target_os = "linux"))] impl ChatComposer { + fn transcription_disposition(should_queue: bool) -> TranscriptionSubmissionDisposition { + if should_queue { + TranscriptionSubmissionDisposition::Queue + } else { + TranscriptionSubmissionDisposition::Submit + } + } + + fn transcription_disposition_for_key( + &self, + key_event: &KeyEvent, + ) -> Option { + match key_event { + KeyEvent { + code: KeyCode::Enter, + modifiers: KeyModifiers::NONE, + kind: KeyEventKind::Press | KeyEventKind::Repeat, + .. + } => Some(TranscriptionSubmissionDisposition::Submit), + KeyEvent { + code: KeyCode::Tab, + modifiers: KeyModifiers::NONE, + kind: KeyEventKind::Press, + .. + } if !self.is_bang_shell_command() => { + Some(Self::transcription_disposition(self.is_task_running)) + } + _ => None, + } + } + + fn active_transcription_placeholder_id(&self) -> Option { + self.spinner_stop_flags + .keys() + .find(|id| self.textarea.named_element_range(id).is_some()) + .cloned() + } + + fn defer_active_transcription_submission( + &mut self, + disposition: TranscriptionSubmissionDisposition, + ) -> bool { + match self.pending_transcription_submission { + Some(_) => true, + None => match self.active_transcription_placeholder_id() { + Some(placeholder_id) => { + self.pending_transcription_submission = Some(PendingTranscriptionSubmission { + placeholder_id, + disposition, + }); + true + } + None => false, + }, + } + } + + fn complete_pending_transcription_submission(&mut self, id: &str) -> InputResult { + match self.pending_transcription_submission.take() { + Some(PendingTranscriptionSubmission { + placeholder_id, + disposition, + }) if placeholder_id == id => match disposition { + TranscriptionSubmissionDisposition::Submit => { + self.handle_submission(/*should_queue*/ false).0 + } + TranscriptionSubmissionDisposition::Queue => { + self.handle_submission(/*should_queue*/ true).0 + } + }, + Some(pending_submission) => { + 
self.pending_transcription_submission = Some(pending_submission); + InputResult::None + } + None => InputResult::None, + } + } + + pub(crate) fn process_space_hold_trigger(&mut self) { + if self.voice_transcription_enabled() + && let Some(flag) = self.voice_state.space_hold_trigger.as_ref() + && flag.load(Ordering::Relaxed) + && self.voice_state.space_hold_started_at.is_some() + && self.voice_state.voice.is_none() + { + let _ = self.on_space_hold_timeout(); + } + + const SPACE_REPEAT_INITIAL_GRACE_MILLIS: u64 = 700; + const SPACE_REPEAT_IDLE_TIMEOUT_MILLIS: u64 = 250; + if !self.voice_state.key_release_supported && self.voice_state.voice.is_some() { + let now = Instant::now(); + let initial_grace = Duration::from_millis(SPACE_REPEAT_INITIAL_GRACE_MILLIS); + let repeat_idle_timeout = Duration::from_millis(SPACE_REPEAT_IDLE_TIMEOUT_MILLIS); + if let Some(started_at) = self.voice_state.space_recording_started_at + && now.saturating_duration_since(started_at) >= initial_grace + { + let should_stop = match self.voice_state.space_recording_last_repeat_at { + Some(last_repeat_at) => { + now.saturating_duration_since(last_repeat_at) >= repeat_idle_timeout + } + None => true, + }; + if should_stop { + let _ = self.stop_recording_and_start_transcription(); + } + } + } + } + + /// Called when the configured non-empty-composer space hold timeout elapses. + /// + /// On terminals without key-release reporting, this only transitions into voice capture if we + /// observed repeated Space events while pending; otherwise the keypress is treated as a typed + /// space. 
+ pub(crate) fn on_space_hold_timeout(&mut self) -> bool { + if !self.voice_transcription_enabled() { + return false; + } + if self.voice_state.voice.is_some() { + return false; + } + if self.voice_state.space_hold_started_at.is_some() { + if !self.voice_state.key_release_supported && !self.voice_state.space_hold_repeat_seen { + if let Some(id) = self.voice_state.space_hold_element_id.take() { + let _ = self.textarea.replace_element_by_id(&id, " "); + } + self.voice_state.space_hold_started_at = None; + self.voice_state.space_hold_trigger = None; + self.voice_state.space_hold_repeat_seen = false; + return true; + } + + // Preserve the typed space when transitioning into voice capture, but + // avoid duplicating an existing trailing space. In either case, + // convert/remove the temporary named element before inserting the + // recording/transcribing placeholder. + if let Some(id) = self.voice_state.space_hold_element_id.take() { + let replacement = if self + .textarea + .named_element_range(&id) + .and_then(|range| self.textarea.text()[..range.start].chars().next_back()) + .is_some_and(|ch| ch == ' ') + { + "" + } else { + " " + }; + let _ = self.textarea.replace_element_by_id(&id, replacement); + } + // Clear pending state before starting capture + self.voice_state.space_hold_started_at = None; + self.voice_state.space_hold_trigger = None; + self.voice_state.space_hold_repeat_seen = false; + + // Start voice capture + self.start_recording_with_placeholder() + } else { + false + } + } + + /// Stop recording if active, update the placeholder, and spawn background transcription. 
+ fn stop_recording_and_start_transcription(&mut self) -> RecordingStopOutcome { + let Some(vc) = self.voice_state.voice.take() else { + return RecordingStopOutcome::NoRecording; + }; + self.voice_state.space_recording_started_at = None; + self.voice_state.space_recording_last_repeat_at = None; + match vc.stop() { + Ok(audio) => { + // If the recording is too short, remove the placeholder immediately + // and skip the transcribing state entirely. + let total_samples = audio.data.len() as f32; + let samples_per_second = (audio.sample_rate as f32) * (audio.channels as f32); + let duration_seconds = if samples_per_second > 0.0 { + total_samples / samples_per_second + } else { + 0.0 + }; + const MIN_DURATION_SECONDS: f32 = 1.0; + if duration_seconds < MIN_DURATION_SECONDS { + if let Some(id) = self.voice_state.recording_placeholder_id.take() { + let _ = self.textarea.replace_element_by_id(&id, ""); + } + return RecordingStopOutcome::NoTranscription; + } + + // Otherwise, update the placeholder to show a spinner and proceed. + let id = match self.voice_state.recording_placeholder_id.take() { + Some(id) => id, + None => self.next_id(), + }; + + let placeholder_range = self.textarea.named_element_range(&id); + let prompt_source = if let Some(range) = &placeholder_range { + self.textarea.text()[..range.start].to_string() + } else { + self.textarea.text().to_string() + }; + + // Initialize with first spinner frame immediately. + let _ = self.textarea.update_named_element_by_id(&id, "⠋"); + // Spawn animated braille spinner until transcription finishes (or times out). 
+ self.spawn_transcribing_spinner(id.clone()); + let tx = self.app_event_tx.clone(); + crate::voice::transcribe_async(id.clone(), audio, Some(prompt_source), tx); + RecordingStopOutcome::Transcribing { placeholder_id: id } + } + Err(e) => { + tracing::error!("failed to stop voice capture: {e}"); + RecordingStopOutcome::NoTranscription + } + } + } + + /// Start voice capture and insert a placeholder element for the live meter. + /// Returns true if recording began and UI should redraw; false on failure. + fn start_recording_with_placeholder(&mut self) -> bool { + match crate::voice::VoiceCapture::start() { + Ok(vc) => { + self.voice_state.voice = Some(vc); + if self.voice_state.key_release_supported { + self.voice_state.space_recording_started_at = None; + } else { + self.voice_state.space_recording_started_at = Some(Instant::now()); + } + self.voice_state.space_recording_last_repeat_at = None; + // Insert visible placeholder for the meter (no label) + let id = self.next_id(); + self.textarea.insert_named_element("", id.clone()); + self.voice_state.recording_placeholder_id = Some(id); + // Spawn metering animation + if let Some(v) = &self.voice_state.voice { + let data = v.data_arc(); + let stop = v.stopped_flag(); + let sr = v.sample_rate(); + let ch = v.channels(); + let peak = v.last_peak_arc(); + if let Some(idref) = &self.voice_state.recording_placeholder_id { + self.spawn_recording_meter(idref.clone(), sr, ch, data, peak, stop); + } + } + true + } + Err(e) => { + self.voice_state.space_recording_started_at = None; + self.voice_state.space_recording_last_repeat_at = None; + tracing::error!("failed to start voice capture: {e}"); + false + } + } + } + + fn spawn_recording_meter( + &self, + id: String, + _sample_rate: u32, + _channels: u16, + _data: Arc>>, + last_peak: Arc, + stop: Arc, + ) { + let tx = self.app_event_tx.clone(); + let task = move || { + use std::time::Duration; + let mut meter = crate::voice::RecordingMeterState::new(); + loop { + if 
stop.load(Ordering::Relaxed) { + break; + } + let text = meter.next_text(last_peak.load(Ordering::Relaxed)); + tx.send(crate::app_event::AppEvent::UpdateRecordingMeter { + id: id.clone(), + text, + }); + + thread::sleep(Duration::from_millis(100)); + } + }; + + if let Ok(handle) = Handle::try_current() { + handle.spawn_blocking(task); + } else { + thread::spawn(task); + } + } + + fn spawn_transcribing_spinner(&mut self, id: String) { + self.stop_transcription_spinner(&id); + let stop = Arc::new(AtomicBool::new(false)); + self.spinner_stop_flags + .insert(id.clone(), Arc::clone(&stop)); + + let tx = self.app_event_tx.clone(); + let task = move || { + use std::time::Duration; + let frames: Vec<&'static str> = vec!["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]; + let mut i: usize = 0; + // Safety stop after ~60s to avoid a runaway task if events are lost. + let max_ticks = 600usize; // 600 * 100ms = 60s + for _ in 0..max_ticks { + if stop.load(Ordering::Relaxed) { + break; + } + let text = frames[i % frames.len()].to_string(); + tx.send(crate::app_event::AppEvent::UpdateRecordingMeter { + id: id.clone(), + text, + }); + i = i.wrapping_add(1); + thread::sleep(Duration::from_millis(100)); + } + }; + + if let Ok(handle) = Handle::try_current() { + handle.spawn_blocking(task); + } else { + thread::spawn(task); + } + } + + fn stop_transcription_spinner(&mut self, id: &str) { + if let Some(flag) = self.spinner_stop_flags.remove(id) { + flag.store(true, Ordering::Relaxed); + } + } + + fn stop_all_transcription_spinners(&mut self) { + for (_id, flag) in self.spinner_stop_flags.drain() { + flag.store(true, Ordering::Relaxed); + } + } + + pub fn replace_transcription(&mut self, id: &str, text: &str) -> InputResult { + self.stop_transcription_spinner(id); + if self.textarea.replace_element_by_id(id, text) { + self.complete_pending_transcription_submission(id) + } else { + InputResult::None + } + } + pub fn update_recording_meter_in_place(&mut self, id: &str, text: &str) -> 
bool { self.textarea.update_named_element_by_id(id, text) } + pub fn show_transcription_retrying( + &mut self, + id: &str, + attempt: usize, + max_attempts: usize, + ) -> bool { + self.stop_transcription_spinner(id); + self.textarea + .update_named_element_by_id(id, &format!("retrying {attempt}/{max_attempts}")) + } + pub fn insert_recording_meter_placeholder(&mut self, text: &str) -> String { let id = self.next_id(); self.textarea.insert_named_element(text, id.clone()); @@ -3947,6 +4649,18 @@ impl ChatComposer { pub fn remove_recording_meter_placeholder(&mut self, id: &str) { let _ = self.textarea.replace_element_by_id(id, ""); } + + pub fn remove_transcription_placeholder(&mut self, id: &str) { + self.stop_transcription_spinner(id); + if self + .pending_transcription_submission + .as_ref() + .is_some_and(|pending| pending.placeholder_id == id) + { + self.pending_transcription_submission = None; + } + let _ = self.textarea.replace_element_by_id(id, ""); + } } fn skill_description(skill: &SkillMetadata) -> Option { @@ -7357,6 +8071,366 @@ mod tests { assert_queued_shell(" !echo hi", "!echo hi"); } + #[test] + fn voice_transcription_disabled_treats_space_as_normal_input() { + use crossterm::event::KeyCode; + use crossterm::event::KeyEvent; + use crossterm::event::KeyEventKind; + use crossterm::event::KeyModifiers; + + let (tx, _rx) = unbounded_channel::(); + let sender = AppEventSender::new(tx); + let mut composer = ChatComposer::new( + /*has_input_focus*/ true, + sender, + /*enhanced_keys_supported*/ false, + "Ask Codex to do anything".to_string(), + /*disable_paste_burst*/ true, + ); + composer.set_text_content("x".to_string(), Vec::new(), Vec::new()); + composer.move_cursor_to_end(); + + let _ = composer.handle_key_event(KeyEvent::new(KeyCode::Char(' '), KeyModifiers::NONE)); + let _ = composer.handle_key_event(KeyEvent::new_with_kind( + KeyCode::Char(' '), + KeyModifiers::NONE, + KeyEventKind::Release, + )); + + assert_eq!("x ", composer.textarea.text()); + 
/// With no key-release reporting and no Space repeat observed, an elapsed hold timer must
/// leave the typed space in the draft and clear the pending-hold state.
#[cfg(not(target_os = "linux"))]
#[test]
fn space_hold_timeout_without_release_or_repeat_keeps_typed_space() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let mut composer = ChatComposer::new(
        /*has_input_focus*/ true,
        AppEventSender::new(tx),
        /*enhanced_keys_supported*/ false,
        "Ask Codex to do anything".to_string(),
        /*disable_paste_burst*/ false,
    );
    composer.set_voice_transcription_enabled(/*enabled*/ true);

    // Draft "x" followed by the temporary hold element holding the space.
    composer.set_text_content(String::from("x"), Vec::new(), Vec::new());
    composer.move_cursor_to_end();
    let hold_element_id = String::from("space-hold");
    composer
        .textarea
        .insert_named_element(" ", hold_element_id.clone());

    // Pending hold whose timer has already fired, on a terminal without release events.
    composer.voice_state.key_release_supported = false;
    composer.voice_state.space_hold_repeat_seen = false;
    composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
    composer.voice_state.space_hold_element_id = Some(hold_element_id);
    composer.voice_state.space_hold_started_at = Some(Instant::now());
    assert_eq!("x ", composer.textarea.text());

    composer.process_space_hold_trigger();

    assert_eq!("x ", composer.textarea.text());
    assert!(composer.voice_state.space_hold_started_at.is_none());
    assert!(!composer.voice_state.space_hold_repeat_seen);
}
composer.set_text_content("x".to_string(), Vec::new(), Vec::new()); + composer.move_cursor_to_end(); + let elem_id = "space-hold".to_string(); + composer.textarea.insert_named_element(" ", elem_id.clone()); + composer.voice_state.space_hold_started_at = Some(Instant::now()); + composer.voice_state.space_hold_element_id = Some(elem_id); + composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true))); + composer.voice_state.key_release_supported = false; + composer.voice_state.space_hold_repeat_seen = true; + + composer.process_space_hold_trigger(); + + assert_eq!("x ", composer.textarea.text()); + assert!(composer.voice_state.space_hold_started_at.is_none()); + assert!(!composer.voice_state.space_hold_repeat_seen); + if composer.is_recording() { + let _ = composer.stop_recording_and_start_transcription(); + } + } + + #[cfg(not(target_os = "linux"))] + #[test] + fn space_hold_timeout_with_repeat_does_not_duplicate_existing_space() { + let (tx, _rx) = unbounded_channel::(); + let sender = AppEventSender::new(tx); + let mut composer = ChatComposer::new( + /*has_input_focus*/ true, + sender, + /*enhanced_keys_supported*/ false, + "Ask Codex to do anything".to_string(), + /*disable_paste_burst*/ false, + ); + composer.set_voice_transcription_enabled(/*enabled*/ true); + + composer.set_text_content("x ".to_string(), Vec::new(), Vec::new()); + composer.move_cursor_to_end(); + let elem_id = "space-hold".to_string(); + composer.textarea.insert_named_element(" ", elem_id.clone()); + composer.voice_state.space_hold_started_at = Some(Instant::now()); + composer.voice_state.space_hold_element_id = Some(elem_id); + composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true))); + composer.voice_state.key_release_supported = false; + composer.voice_state.space_hold_repeat_seen = true; + + composer.process_space_hold_trigger(); + + assert_eq!("x ", composer.textarea.text()); + assert!(composer.voice_state.space_hold_started_at.is_none()); + 
/// A 1 ms configured hold delay must be honoured: shortly after Space is pressed on a
/// non-empty draft the hold resolves, leaving "x " and no pending-hold state.
#[cfg(not(target_os = "linux"))]
#[test]
fn configurable_space_hold_delay_is_used_for_non_empty_drafts() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        /*has_input_focus*/ true,
        sender,
        /*enhanced_keys_supported*/ false,
        "Ask Codex to do anything".to_string(),
        /*disable_paste_burst*/ false,
    );
    composer.set_voice_transcription_enabled(/*enabled*/ true);
    composer.set_voice_transcription_space_hold_delay_ms(/*delay_ms*/ 1);
    composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
    composer.move_cursor_to_end();

    let _ = composer.handle_key_event(KeyEvent::new(KeyCode::Char(' '), KeyModifiers::NONE));

    // The timer runs on another thread/task, so a fixed sleep races with the scheduler on a
    // loaded machine. Poll the trigger flag with a generous deadline instead; this is also
    // faster in the common case.
    if let Some(trigger) = composer.voice_state.space_hold_trigger.clone() {
        let deadline = Instant::now() + Duration::from_secs(5);
        while !trigger.load(Ordering::Relaxed) && Instant::now() < deadline {
            std::thread::sleep(Duration::from_millis(1));
        }
    }
    composer.process_space_hold_trigger();

    assert!(composer.voice_state.space_hold_started_at.is_none());
    assert!(composer.voice_state.space_hold_trigger.is_none());
    assert_eq!("x ", composer.textarea.text());
    // Do not leave a capture running if the hold path started one.
    if composer.is_recording() {
        let _ = composer.stop_recording_and_start_transcription();
    }
}
/// Showing the retry label must stop and unregister the spinner and rewrite the placeholder.
#[cfg(not(target_os = "linux"))]
#[test]
fn transcription_retrying_stops_spinner_and_updates_placeholder() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let mut composer = ChatComposer::new(
        /*has_input_focus*/ true,
        AppEventSender::new(tx),
        /*enhanced_keys_supported*/ false,
        "Ask Codex to do anything".to_string(),
        /*disable_paste_burst*/ false,
    );

    // An empty placeholder with a registered, still-running spinner.
    let placeholder_id = String::from("voice-placeholder");
    let spinner_stop = Arc::new(AtomicBool::new(false));
    composer
        .textarea
        .insert_named_element("", placeholder_id.clone());
    composer
        .spinner_stop_flags
        .insert(placeholder_id.clone(), Arc::clone(&spinner_stop));

    let updated = composer.show_transcription_retrying(
        &placeholder_id,
        /*attempt*/ 2,
        /*max_attempts*/ 3,
    );

    assert!(updated);
    assert!(spinner_stop.load(Ordering::Relaxed));
    assert!(!composer.spinner_stop_flags.contains_key(&placeholder_id));
    assert_eq!(composer.textarea.text(), "retrying 2/3");
}
/// While a task is running, Tab pressed during transcription must not queue immediately; the
/// draft is queued once the transcription text replaces the placeholder.
#[cfg(not(target_os = "linux"))]
#[test]
fn tab_defers_queue_until_transcription_completes() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let mut composer = ChatComposer::new(
        /*has_input_focus*/ true,
        AppEventSender::new(tx),
        /*enhanced_keys_supported*/ false,
        "Ask Codex to do anything".to_string(),
        /*disable_paste_burst*/ false,
    );
    composer.set_task_running(/*running*/ true);

    // A placeholder with a registered spinner counts as an active transcription.
    let placeholder_id = composer.insert_recording_meter_placeholder("⠋");
    composer
        .spinner_stop_flags
        .insert(placeholder_id.clone(), Arc::new(AtomicBool::new(false)));

    let (tab_result, _needs_redraw) =
        composer.handle_key_event(KeyEvent::new(KeyCode::Tab, KeyModifiers::NONE));

    assert_eq!(InputResult::None, tab_result);
    assert_eq!("⠋", composer.textarea.text());

    let completed = composer.replace_transcription(&placeholder_id, "queued voice text");

    let expected = InputResult::Queued {
        text: "queued voice text".to_string(),
        text_elements: Vec::new(),
        action: QueuedInputAction::Plain,
    };
    assert_eq!(expected, completed);
    assert_eq!("", composer.textarea.text());
}
/// Replacing the composer text must signal every registered spinner to stop and empty the
/// spinner registry.
#[cfg(not(target_os = "linux"))]
#[test]
fn set_text_content_stops_all_transcription_spinners() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let mut composer = ChatComposer::new(
        /*has_input_focus*/ true,
        AppEventSender::new(tx),
        /*enhanced_keys_supported*/ false,
        "Ask Codex to do anything".to_string(),
        /*disable_paste_burst*/ false,
    );

    // Register two running spinners and keep a handle on each stop flag.
    let mut stop_flags = Vec::new();
    for spinner_id in ["voice-1", "voice-2"] {
        let flag = Arc::new(AtomicBool::new(false));
        composer
            .spinner_stop_flags
            .insert(spinner_id.to_string(), Arc::clone(&flag));
        stop_flags.push(flag);
    }

    composer.set_text_content("draft".to_string(), Vec::new(), Vec::new());

    for flag in &stop_flags {
        assert!(flag.load(Ordering::Relaxed));
    }
    assert!(composer.spinner_stop_flags.is_empty());
}
binding that `ChatWidget` actually listens for. pub(crate) fn set_queued_message_edit_binding(&mut self, binding: Option) { @@ -527,6 +538,15 @@ impl BottomPane { /// Forward a key event to the active view or the composer. pub fn handle_key_event(&mut self, key_event: KeyEvent) -> InputResult { + #[cfg(not(target_os = "linux"))] + if self.composer.is_recording() { + let (_result, needs_redraw) = self.composer.handle_key_event(key_event); + if needs_redraw { + self.request_redraw(); + } + return InputResult::None; + } + // If a modal/view is active, handle it here; otherwise forward to composer. if !self.view_stack.is_empty() { if key_event.kind == KeyEventKind::Release { @@ -691,6 +711,8 @@ impl BottomPane { } fn pre_draw_tick_at(&mut self, now: Instant) { + #[cfg(not(target_os = "linux"))] + self.composer.process_space_hold_trigger(); self.composer.sync_popups(); self.maybe_show_delayed_approval_requests_at(now); } @@ -1544,11 +1566,40 @@ impl BottomPane { updated } + pub(crate) fn show_transcription_retrying( + &mut self, + id: &str, + attempt: usize, + max_attempts: usize, + ) -> bool { + let updated = self + .composer + .show_transcription_retrying(id, attempt, max_attempts); + if updated { + self.composer.sync_popups(); + self.request_redraw(); + } + updated + } + + pub(crate) fn replace_transcription(&mut self, id: &str, text: &str) -> InputResult { + let result = self.composer.replace_transcription(id, text); + self.composer.sync_popups(); + self.request_redraw(); + result + } + pub(crate) fn remove_recording_meter_placeholder(&mut self, id: &str) { self.composer.remove_recording_meter_placeholder(id); self.composer.sync_popups(); self.request_redraw(); } + + pub(crate) fn remove_transcription_placeholder(&mut self, id: &str) { + self.composer.remove_transcription_placeholder(id); + self.composer.sync_popups(); + self.request_redraw(); + } } impl Renderable for BottomPane { diff --git a/codex-rs/tui/src/bottom_pane/textarea.rs 
b/codex-rs/tui/src/bottom_pane/textarea.rs index 7ab6d38fca..fa053c5a43 100644 --- a/codex-rs/tui/src/bottom_pane/textarea.rs +++ b/codex-rs/tui/src/bottom_pane/textarea.rs @@ -862,7 +862,6 @@ impl TextArea { self.set_cursor(end); } - #[cfg(not(target_os = "linux"))] pub fn replace_element_by_id(&mut self, id: &str, text: &str) -> bool { if let Some(idx) = self .elements diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs index e47591a275..eaba0aea57 100644 --- a/codex-rs/tui/src/chatwidget.rs +++ b/codex-rs/tui/src/chatwidget.rs @@ -5539,6 +5539,14 @@ impl ChatWidget { if let Some(keymap) = runtime_keymap { widget.bottom_pane.set_keymap_bindings(&keymap); } + widget.bottom_pane.set_voice_transcription_enabled( + widget.config.features.enabled(Feature::VoiceTranscription), + ); + widget + .bottom_pane + .set_voice_transcription_space_hold_delay_ms( + widget.config.voice_transcription_space_hold_delay_ms, + ); widget .bottom_pane .set_realtime_conversation_enabled(widget.realtime_conversation_enabled()); @@ -5720,77 +5728,8 @@ impl ChatWidget { } _ => { let had_modal_or_popup = !self.bottom_pane.no_modal_or_popup_active(); - match self.bottom_pane.handle_key_event(key_event) { - InputResult::Submitted { - text, - text_elements, - } => { - let local_images = self - .bottom_pane - .take_recent_submission_images_with_placeholders(); - let remote_image_urls = self.take_remote_image_urls(); - let user_message = UserMessage { - text, - local_images, - remote_image_urls, - text_elements, - mention_bindings: self - .bottom_pane - .take_recent_submission_mention_bindings(), - }; - if user_message.text.is_empty() - && user_message.local_images.is_empty() - && user_message.remote_image_urls.is_empty() - { - return; - } - let should_submit_now = - self.is_session_configured() && !self.is_plan_streaming_in_tui(); - if should_submit_now { - if self.only_user_shell_commands_running() - && !user_message.text.starts_with('!') - { - 
/// Act on an `InputResult` produced by the bottom pane.
///
/// * `Submitted`: builds a `UserMessage`; empty messages (no text, no local images, no remote
///   image URLs) are dropped. The message is submitted when the session is configured and no
///   plan is streaming, except that a non-`!` message is queued while only user shell commands
///   are running. In every other case it is queued.
/// * `Queued`: builds a `UserMessage` and queues it with the given action.
/// * `Command` / `CommandWithArgs`: dispatched to the slash-command handlers.
/// * `None`: nothing to do.
///
/// The composer text is captured on entry and compared after handling to signal watchdog
/// activity when the draft changed.
///
/// NOTE(review): the two early `return`s in the `Submitted` arm skip that final watchdog
/// check — confirm this is intended.
fn handle_bottom_pane_input_result(&mut self, input_result: InputResult) {
    // Snapshot taken before any handling below mutates the composer.
    let composer_before = self.bottom_pane.composer_text_with_pending();
    match input_result {
        InputResult::Submitted {
            text,
            text_elements,
        } => {
            let user_message = self.user_message_from_composer_submission(text, text_elements);
            // Nothing to send: no text and no attachments of either kind.
            if user_message.text.is_empty()
                && user_message.local_images.is_empty()
                && user_message.remote_image_urls.is_empty()
            {
                return;
            }
            let should_submit_now =
                self.is_session_configured() && !self.is_plan_streaming_in_tui();
            if should_submit_now {
                // While only user shell commands run, just `!` commands go through directly;
                // anything else waits in the queue.
                if self.only_user_shell_commands_running()
                    && !user_message.text.starts_with('!')
                {
                    self.queue_user_message(user_message);
                    return;
                }
                // Submitted is emitted when user submits.
                // Reset any reasoning header only when we are actually submitting a turn.
                self.reasoning_buffer.clear();
                self.full_reasoning_buffer.clear();
                self.set_status_header(String::from("Working"));
                self.submit_user_message(user_message);
            } else {
                self.queue_user_message(user_message);
            }
        }
        InputResult::Queued {
            text,
            text_elements,
            action,
        } => {
            let user_message = self.user_message_from_composer_submission(text, text_elements);
            self.queue_user_message_with_options(user_message, action);
        }
        InputResult::Command(cmd) => {
            self.handle_slash_command_dispatch(cmd);
        }
        InputResult::CommandWithArgs(cmd, args, text_elements) => {
            self.handle_slash_command_with_args_dispatch(cmd, args, text_elements);
        }
        InputResult::None => {}
    }
    self.maybe_signal_watchdog_owner_activity_if_draft_changed(&composer_before);
}
/// /// When the model does not advertise image support, we keep the draft unchanged and surface a @@ -12137,6 +12144,13 @@ impl ChatWidget { #[cfg(not(target_os = "linux"))] impl ChatWidget { + pub(crate) fn replace_transcription(&mut self, id: &str, text: &str) { + let input_result = self.bottom_pane.replace_transcription(id, text); + self.handle_bottom_pane_input_result(input_result); + // Ensure the UI redraws to reflect the updated transcription. + self.request_redraw(); + } + pub(crate) fn update_recording_meter_in_place(&mut self, id: &str, text: &str) -> bool { let updated = self.bottom_pane.update_recording_meter_in_place(id, text); if updated { @@ -12145,11 +12159,33 @@ impl ChatWidget { updated } + pub(crate) fn show_transcription_retrying( + &mut self, + id: &str, + attempt: usize, + max_attempts: usize, + ) { + let updated = self + .bottom_pane + .show_transcription_retrying(id, attempt, max_attempts); + if updated { + self.request_redraw(); + } + } + pub(crate) fn remove_recording_meter_placeholder(&mut self, id: &str) { self.bottom_pane.remove_recording_meter_placeholder(id); // Ensure the UI redraws to reflect placeholder removal. 
self.request_redraw(); } + + pub(crate) fn fail_transcription(&mut self, id: &str, error: &str) { + self.bottom_pane.remove_transcription_placeholder(id); + self.add_to_history(history_cell::new_error_event(format!( + "Voice transcription failed: {error}" + ))); + self.request_redraw(); + } } fn has_websocket_timing_metrics(summary: RuntimeMetricsSummary) -> bool { diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index bfeaff2eae..3a41796be4 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -607,7 +607,7 @@ impl ChatWidget { flag.store(true, Ordering::Relaxed); } if let Some(capture) = self.realtime_conversation.capture.take() { - capture.stop(); + let _ = capture.stop(); } if let Some(id) = self.realtime_conversation.meter_placeholder_id.take() { self.remove_recording_meter_placeholder(&id); diff --git a/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs b/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs index 135d8dbaa4..015ae0f77c 100644 --- a/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs +++ b/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs @@ -42,6 +42,27 @@ async fn deleted_realtime_meter_uses_shared_stop_path() { ); } +#[cfg(not(target_os = "linux"))] +#[tokio::test] +async fn transcription_failure_keeps_draft_and_renders_error() { + let (mut chat, mut rx, _op_rx) = make_chatwidget_manual(/*model_override*/ None).await; + chat.bottom_pane + .set_composer_text("draft ".to_string(), Vec::new(), Vec::new()); + let placeholder_id = chat.bottom_pane.insert_recording_meter_placeholder("⠋"); + + chat.fail_transcription(&placeholder_id, "boom"); + + assert_eq!("draft ", chat.bottom_pane.composer_text()); + let rendered = drain_insert_history(&mut rx) + .into_iter() + .map(|lines| lines_to_single_string(&lines)) + .collect::>(); + insta::assert_snapshot!( + rendered.join("\n\n"), + @"■ Voice transcription failed: boom" + ); +} + 
#[tokio::test] async fn experimental_mode_plan_is_ignored_on_startup() { let codex_home = tempdir().expect("tempdir"); diff --git a/codex-rs/tui/src/voice.rs b/codex-rs/tui/src/voice.rs index 229d0a8db5..1ee14b1705 100644 --- a/codex-rs/tui/src/voice.rs +++ b/codex-rs/tui/src/voice.rs @@ -1,33 +1,152 @@ +use crate::app_event::AppEvent; use crate::app_event_sender::AppEventSender; +use crate::audio_device::preferred_input_config; use crate::legacy_core::config::Config; +use crate::legacy_core::config::find_codex_home; use base64::Engine; +use codex_app_server_protocol::AuthMode; +use codex_client::build_reqwest_client_with_custom_ca; +use codex_config::types::AuthCredentialsStoreMode; +use codex_login::CodexAuth; +use codex_login::default_client::get_codex_user_agent; use codex_protocol::protocol::ConversationAudioParams; use codex_protocol::protocol::RealtimeAudioFrame; use cpal::traits::DeviceTrait; +use cpal::traits::HostTrait; use cpal::traits::StreamTrait; +use hound::SampleFormat; +use hound::WavSpec; +use hound::WavWriter; use std::collections::VecDeque; +use std::fmt; +use std::future::Future; +use std::io::Cursor; use std::sync::Arc; use std::sync::Mutex; use std::sync::atomic::AtomicBool; use std::sync::atomic::AtomicU16; use std::sync::atomic::Ordering; +use std::time::Duration; +use std::time::Instant; use tracing::error; +use tracing::info; +use tracing::trace; +use tracing::warn; +const AUDIO_MODEL: &str = "gpt-4o-mini-transcribe"; const MODEL_AUDIO_SAMPLE_RATE: u32 = 24_000; const MODEL_AUDIO_CHANNELS: u16 = 1; +const FIRST_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT: Duration = Duration::from_secs(2); +const FIRST_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT: Duration = Duration::from_secs(15); +const SECOND_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT: Duration = Duration::from_secs(4); +const SECOND_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT: Duration = Duration::from_secs(30); +const FINAL_TRANSCRIPTION_ATTEMPT_TIMEOUT: Duration = Duration::from_secs(60); +const 
FIRST_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND: f32 = 2.0; +const SECOND_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND: f32 = 3.0; +const TRANSCRIPTION_ATTEMPT_COUNT: usize = 3; + +struct TranscriptionAuthContext { + mode: AuthMode, + bearer_token: String, + chatgpt_account_id: Option, + chatgpt_base_url: String, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +struct TranscriptionAttempt { + number: usize, + timeout: Duration, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +struct TranscriptionRetryNotice { + next_attempt: usize, + max_attempts: usize, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum TranscriptionRetryDecision { + Retry { delay: Option }, + Stop, +} + +#[derive(Debug)] +enum TranscriptionRequestError { + Build(String), + Timeout(Duration), + Send(String), + Status { + status: reqwest::StatusCode, + body: String, + retry_after: Option, + }, + Json(String), +} + +impl fmt::Display for TranscriptionRequestError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Build(error) => write!(f, "{error}"), + Self::Timeout(timeout) => write!( + f, + "transcription request timed out after {:.2}s", + timeout.as_secs_f32() + ), + Self::Send(error) => write!(f, "transcription request failed: {error}"), + Self::Status { status, body, .. 
} => { + write!(f, "transcription failed: {status} {body}") + } + Self::Json(error) => write!(f, "failed to parse json: {error}"), + } + } +} + +pub struct RecordedAudio { + pub data: Vec, + pub sample_rate: u32, + pub channels: u16, +} pub struct VoiceCapture { stream: Option, + sample_rate: u32, + channels: u16, + data: Arc>>, stopped: Arc, last_peak: Arc, } impl VoiceCapture { + pub fn start() -> Result { + let (device, config) = select_default_input_device_and_config()?; + + let sample_rate = config.sample_rate().0; + let channels = config.channels(); + let data: Arc>> = Arc::new(Mutex::new(Vec::new())); + let stopped = Arc::new(AtomicBool::new(false)); + let last_peak = Arc::new(AtomicU16::new(0)); + + let stream = build_input_stream(&device, &config, data.clone(), last_peak.clone())?; + stream + .play() + .map_err(|e| format!("failed to start input stream: {e}"))?; + + Ok(Self { + stream: Some(stream), + sample_rate, + channels, + data, + stopped, + last_peak, + }) + } + pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result { let (device, config) = select_realtime_input_device_and_config(config)?; let sample_rate = config.sample_rate().0; let channels = config.channels(); + let data: Arc>> = Arc::new(Mutex::new(Vec::new())); let stopped = Arc::new(AtomicBool::new(false)); let last_peak = Arc::new(AtomicU16::new(0)); @@ -45,22 +164,47 @@ impl VoiceCapture { Ok(Self { stream: Some(stream), + sample_rate, + channels, + data, stopped, last_peak, }) } - pub fn stop(mut self) { + pub fn stop(mut self) -> Result { // Mark stopped so any metering task can exit cleanly. self.stopped.store(true, Ordering::SeqCst); // Dropping the stream stops capture. self.stream.take(); + let data = self + .data + .lock() + .map_err(|_| "failed to lock audio buffer".to_string())? 
+ .clone(); + Ok(RecordedAudio { + data, + sample_rate: self.sample_rate, + channels: self.channels, + }) + } + + pub fn data_arc(&self) -> Arc>> { + self.data.clone() } pub fn stopped_flag(&self) -> Arc { self.stopped.clone() } + pub fn sample_rate(&self) -> u32 { + self.sample_rate + } + + pub fn channels(&self) -> u16 { + self.channels + } + pub fn last_peak_arc(&self) -> Arc { self.last_peak.clone() } @@ -125,16 +269,147 @@ impl RecordingMeterState { } } +pub fn transcribe_async( + id: String, + audio: RecordedAudio, + context: Option, + tx: AppEventSender, +) { + std::thread::spawn(move || { + const MIN_DURATION_SECONDS: f32 = 1.0; + let duration_seconds = clip_duration_seconds(&audio); + if duration_seconds < MIN_DURATION_SECONDS { + let msg = format!( + "recording too short ({duration_seconds:.2}s); minimum is {MIN_DURATION_SECONDS:.2}s" + ); + info!("{msg}"); + tx.send(AppEvent::TranscriptionFailed { id, error: msg }); + return; + } + + let wav_bytes = match encode_wav_normalized(&audio) { + Ok(wav_bytes) => wav_bytes, + Err(err) => { + error!("failed to encode wav: {err}"); + tx.send(AppEvent::TranscriptionFailed { id, error: err }); + return; + } + }; + + let runtime = match tokio::runtime::Runtime::new() { + Ok(runtime) => runtime, + Err(err) => { + error!("failed to create tokio runtime: {err}"); + tx.send(AppEvent::TranscriptionFailed { + id, + error: err.to_string(), + }); + return; + } + }; + + let retry_tx = tx.clone(); + let retry_id = id.clone(); + let on_retry = move |notice: TranscriptionRetryNotice| { + retry_tx.send(AppEvent::TranscriptionRetrying { + id: retry_id.clone(), + attempt: notice.next_attempt, + max_attempts: notice.max_attempts, + }); + }; + + match runtime.block_on(transcribe_bytes( + wav_bytes, + context, + duration_seconds, + on_retry, + )) { + Ok(text) => { + tx.send(AppEvent::TranscriptionComplete { id, text }); + info!("voice transcription succeeded"); + } + Err(err) => { + error!("voice transcription error: {err}"); + 
tx.send(AppEvent::TranscriptionFailed { id, error: err }); + } + } + }); +} + // ------------------------- // Voice input helpers // ------------------------- +fn select_default_input_device_and_config() +-> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { + let host = cpal::default_host(); + let device = host + .default_input_device() + .ok_or_else(|| "no input audio device available".to_string())?; + let config = preferred_input_config(&device)?; + Ok((device, config)) +} + fn select_realtime_input_device_and_config( config: &Config, ) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { crate::audio_device::select_configured_input_device_and_config(config) } +fn build_input_stream( + device: &cpal::Device, + config: &cpal::SupportedStreamConfig, + data: Arc>>, + last_peak: Arc, +) -> Result { + match config.sample_format() { + cpal::SampleFormat::F32 => device + .build_input_stream( + &config.clone().into(), + move |input: &[f32], _| { + let peak = peak_f32(input); + last_peak.store(peak, Ordering::Relaxed); + if let Ok(mut buffer) = data.lock() { + for &sample in input { + buffer.push(f32_to_i16(sample)); + } + } + }, + move |err| error!("audio input error: {err}"), + None, + ) + .map_err(|e| format!("failed to build input stream: {e}")), + cpal::SampleFormat::I16 => device + .build_input_stream( + &config.clone().into(), + move |input: &[i16], _| { + let peak = peak_i16(input); + last_peak.store(peak, Ordering::Relaxed); + if let Ok(mut buffer) = data.lock() { + buffer.extend_from_slice(input); + } + }, + move |err| error!("audio input error: {err}"), + None, + ) + .map_err(|e| format!("failed to build input stream: {e}")), + cpal::SampleFormat::U16 => device + .build_input_stream( + &config.clone().into(), + move |input: &[u16], _| { + if let Ok(mut buffer) = data.lock() { + let peak = convert_u16_to_i16_and_peak(input, &mut buffer); + last_peak.store(peak, Ordering::Relaxed); + } + }, + move |err| error!("audio input error: {err}"), 
+ None, + ) + .map_err(|e| format!("failed to build input stream: {e}")), + _ => Err("unsupported input sample format".to_string()), + } +} + fn build_realtime_input_stream( device: &cpal::Device, config: &cpal::SupportedStreamConfig, @@ -472,10 +747,467 @@ fn convert_pcm16( out } +// ------------------------- +// Transcription helpers +// ------------------------- + +fn clip_duration_seconds(audio: &RecordedAudio) -> f32 { + let total_samples = audio.data.len() as f32; + let samples_per_second = (audio.sample_rate as f32) * (audio.channels as f32); + if samples_per_second > 0.0 { + total_samples / samples_per_second + } else { + 0.0 + } +} + +fn encode_wav_normalized(audio: &RecordedAudio) -> Result, String> { + let converted; + let (channels, sample_rate, segment) = + if audio.channels == MODEL_AUDIO_CHANNELS && audio.sample_rate == MODEL_AUDIO_SAMPLE_RATE { + (audio.channels, audio.sample_rate, audio.data.as_slice()) + } else { + converted = convert_pcm16( + &audio.data, + audio.sample_rate, + audio.channels, + MODEL_AUDIO_SAMPLE_RATE, + MODEL_AUDIO_CHANNELS, + ); + ( + MODEL_AUDIO_CHANNELS, + MODEL_AUDIO_SAMPLE_RATE, + converted.as_slice(), + ) + }; + + let spec = WavSpec { + channels, + sample_rate, + bits_per_sample: 16, + sample_format: SampleFormat::Int, + }; + let mut wav_bytes = Vec::new(); + let mut cursor = Cursor::new(&mut wav_bytes); + let mut writer = + WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?; + + let peak_abs = segment + .iter() + .map(|sample| (i32::from(*sample)).unsigned_abs() as i32) + .max() + .unwrap_or(0); + let target = (i16::MAX as f32) * 0.9; + let gain = if peak_abs > 0 { + target / (peak_abs as f32) + } else { + 1.0 + }; + + for &sample in segment { + let normalized = ((sample as f32) * gain) + .round() + .clamp(i16::MIN as f32, i16::MAX as f32) as i16; + writer + .write_sample(normalized) + .map_err(|_| "failed writing wav sample".to_string())?; + } + writer + .finalize() + .map_err(|_| 
"failed to finalize wav".to_string())?; + Ok(wav_bytes) +} + +fn normalize_chatgpt_base_url(input: &str) -> String { + let mut base_url = input.to_string(); + while base_url.ends_with('/') { + base_url.pop(); + } + if (base_url.starts_with("https://chatgpt.com") + || base_url.starts_with("https://chat.openai.com")) + && !base_url.contains("/backend-api") + { + base_url = format!("{base_url}/backend-api"); + } + base_url +} + +async fn resolve_auth() -> Result { + let codex_home = find_codex_home().map_err(|e| format!("failed to find codex home: {e}"))?; + let auth = CodexAuth::from_auth_storage(&codex_home, AuthCredentialsStoreMode::Auto) + .await + .map_err(|e| format!("failed to read auth.json: {e}"))? + .ok_or_else(|| "No Codex auth is configured; please run `codex login`".to_string())?; + + let chatgpt_account_id = auth.get_account_id(); + let bearer_token = auth + .get_token() + .map_err(|e| format!("failed to get auth token: {e}"))?; + let config = Config::load_with_cli_overrides(Vec::new()) + .await + .map_err(|e| format!("failed to load config: {e}"))?; + Ok(TranscriptionAuthContext { + mode: auth.api_auth_mode(), + bearer_token, + chatgpt_account_id, + chatgpt_base_url: normalize_chatgpt_base_url(&config.chatgpt_base_url), + }) +} + +async fn transcribe_bytes( + wav_bytes: Vec, + context: Option, + duration_seconds: f32, + on_retry: impl Fn(TranscriptionRetryNotice), +) -> Result { + let started_at = Instant::now(); + let auth = resolve_auth().await?; + let auth_elapsed = started_at.elapsed(); + let client = build_reqwest_client_with_custom_ca(reqwest::Client::builder()) + .map_err(|error| format!("failed to build transcription HTTP client: {error}"))?; + let audio_bytes = wav_bytes.len(); + let prompt_for_log = context.as_deref().unwrap_or("").to_string(); + let audio_kib = audio_bytes as f32 / 1024.0; + let mode = auth.mode; + trace!( + "preparing transcription request: mode={mode:?} duration={duration_seconds:.2}s audio={audio_kib:.1}KiB 
prompt={prompt_for_log}" + ); + let value = send_transcription_request_with_retries( + &client, + &auth, + &wav_bytes, + context.as_deref(), + TranscriptionRequestMetrics { + mode, + duration_seconds, + audio_kib, + auth_elapsed, + started_at, + }, + on_retry, + ) + .await + .map_err(|error| error.to_string())?; + + let text = value + .get("text") + .and_then(|text| text.as_str()) + .unwrap_or("") + .to_string(); + + if text.is_empty() { + Err("empty transcription result".to_string()) + } else { + Ok(text) + } +} + +#[derive(Clone, Copy)] +struct TranscriptionRequestMetrics { + mode: AuthMode, + duration_seconds: f32, + audio_kib: f32, + auth_elapsed: Duration, + started_at: Instant, +} + +fn transcription_request_attempts(duration_seconds: f32) -> [TranscriptionAttempt; 3] { + [ + TranscriptionAttempt { + number: 1, + timeout: scaled_transcription_request_timeout( + duration_seconds, + FIRST_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT, + FIRST_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT, + FIRST_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND, + ), + }, + TranscriptionAttempt { + number: 2, + timeout: scaled_transcription_request_timeout( + duration_seconds, + SECOND_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT, + SECOND_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT, + SECOND_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND, + ), + }, + TranscriptionAttempt { + number: 3, + timeout: FINAL_TRANSCRIPTION_ATTEMPT_TIMEOUT, + }, + ] +} + +fn scaled_transcription_request_timeout( + duration_seconds: f32, + min_timeout: Duration, + max_timeout: Duration, + timeout_per_audio_second: f32, +) -> Duration { + let scaled_timeout = if duration_seconds.is_finite() && duration_seconds > 0.0 { + Duration::from_secs_f32(duration_seconds * timeout_per_audio_second) + } else { + min_timeout + }; + + scaled_timeout.clamp(min_timeout, max_timeout) +} + +fn build_transcription_request( + client: &reqwest::Client, + auth: &TranscriptionAuthContext, + wav_bytes: &[u8], + context: Option<&str>, +) -> Result<(String, 
reqwest::RequestBuilder), TranscriptionRequestError> { + if matches!(auth.mode, AuthMode::Chatgpt | AuthMode::ChatgptAuthTokens) { + let part = reqwest::multipart::Part::bytes(wav_bytes.to_vec()) + .file_name("audio.wav") + .mime_str("audio/wav") + .map_err(|error| { + TranscriptionRequestError::Build(format!("failed to set mime: {error}")) + })?; + let form = reqwest::multipart::Form::new().part("file", part); + let endpoint = format!("{}/transcribe", auth.chatgpt_base_url); + let request = if let Some(account_id) = &auth.chatgpt_account_id { + client + .post(&endpoint) + .bearer_auth(&auth.bearer_token) + .multipart(form) + .header("User-Agent", get_codex_user_agent()) + .header("ChatGPT-Account-Id", account_id.as_str()) + } else { + client + .post(&endpoint) + .bearer_auth(&auth.bearer_token) + .multipart(form) + .header("User-Agent", get_codex_user_agent()) + }; + Ok((endpoint, request)) + } else { + let part = reqwest::multipart::Part::bytes(wav_bytes.to_vec()) + .file_name("audio.wav") + .mime_str("audio/wav") + .map_err(|error| { + TranscriptionRequestError::Build(format!("failed to set mime: {error}")) + })?; + let form = if let Some(context) = context { + reqwest::multipart::Form::new() + .text("model", AUDIO_MODEL) + .part("file", part) + .text("prompt", context.to_string()) + } else { + reqwest::multipart::Form::new() + .text("model", AUDIO_MODEL) + .part("file", part) + }; + let endpoint = "https://api.openai.com/v1/audio/transcriptions".to_string(); + Ok(( + endpoint.clone(), + client + .post(&endpoint) + .bearer_auth(&auth.bearer_token) + .multipart(form) + .header("User-Agent", get_codex_user_agent()), + )) + } +} + +async fn send_transcription_request_with_retries( + client: &reqwest::Client, + auth: &TranscriptionAuthContext, + wav_bytes: &[u8], + context: Option<&str>, + metrics: TranscriptionRequestMetrics, + on_retry: impl Fn(TranscriptionRetryNotice), +) -> Result { + let attempts = transcription_request_attempts(metrics.duration_seconds); + 
let mut last_error = None; + + for attempt_index in 0..attempts.len() { + let attempt = attempts[attempt_index]; + let next_attempt = attempts.get(attempt_index + 1).copied(); + let (endpoint, request) = build_transcription_request(client, auth, wav_bytes, context)?; + info!( + "sending voice transcription request: mode={:?} endpoint={endpoint} attempt={}/{} duration={:.2}s audio={:.1}KiB timeout={:.2}s auth_config_elapsed_ms={}", + metrics.mode, + attempt.number, + TRANSCRIPTION_ATTEMPT_COUNT, + metrics.duration_seconds, + metrics.audio_kib, + attempt.timeout.as_secs_f32(), + metrics.auth_elapsed.as_millis() + ); + + let request_started_at = Instant::now(); + match send_transcription_request_with_timeout(request, attempt.timeout).await { + Ok(value) => { + let request_elapsed = request_started_at.elapsed(); + info!( + "voice transcription response parsed: attempt={}/{} request_elapsed_ms={} total_elapsed_ms={}", + attempt.number, + TRANSCRIPTION_ATTEMPT_COUNT, + request_elapsed.as_millis(), + metrics.started_at.elapsed().as_millis() + ); + return Ok(value); + } + Err(error) => { + let request_elapsed = request_started_at.elapsed(); + match transcription_retry_decision(&error, next_attempt) { + TranscriptionRetryDecision::Retry { delay } => { + warn!( + "voice transcription attempt failed; retrying: attempt={}/{} request_elapsed_ms={} total_elapsed_ms={} error={error}", + attempt.number, + TRANSCRIPTION_ATTEMPT_COUNT, + request_elapsed.as_millis(), + metrics.started_at.elapsed().as_millis() + ); + on_retry(TranscriptionRetryNotice { + next_attempt: attempt.number + 1, + max_attempts: TRANSCRIPTION_ATTEMPT_COUNT, + }); + if let Some(delay) = delay { + info!( + "waiting before voice transcription retry: retry_after_ms={}", + delay.as_millis() + ); + tokio::time::sleep(delay).await; + } + last_error = Some(error); + } + TranscriptionRetryDecision::Stop => { + warn!( + "voice transcription attempt failed; giving up: attempt={}/{} request_elapsed_ms={} 
total_elapsed_ms={} error={error}", + attempt.number, + TRANSCRIPTION_ATTEMPT_COUNT, + request_elapsed.as_millis(), + metrics.started_at.elapsed().as_millis() + ); + return Err(error); + } + } + } + } + } + + Err(last_error.unwrap_or_else(|| { + TranscriptionRequestError::Build("no transcription attempts configured".to_string()) + })) +} + +fn transcription_retry_decision( + error: &TranscriptionRequestError, + next_attempt: Option, +) -> TranscriptionRetryDecision { + let Some(next_attempt) = next_attempt else { + return TranscriptionRetryDecision::Stop; + }; + + match error { + TranscriptionRequestError::Timeout(_) | TranscriptionRequestError::Send(_) => { + TranscriptionRetryDecision::Retry { delay: None } + } + TranscriptionRequestError::Status { status, .. } + if matches!( + *status, + reqwest::StatusCode::BAD_GATEWAY + | reqwest::StatusCode::SERVICE_UNAVAILABLE + | reqwest::StatusCode::GATEWAY_TIMEOUT + ) => + { + TranscriptionRetryDecision::Retry { delay: None } + } + TranscriptionRequestError::Status { + status, + retry_after, + .. + } if *status == reqwest::StatusCode::TOO_MANY_REQUESTS => match retry_after { + Some(delay) if *delay <= next_attempt.timeout => TranscriptionRetryDecision::Retry { + delay: Some(*delay), + }, + Some(_) => TranscriptionRetryDecision::Stop, + None => TranscriptionRetryDecision::Retry { delay: None }, + }, + TranscriptionRequestError::Build(_) + | TranscriptionRequestError::Status { .. 
} + | TranscriptionRequestError::Json(_) => TranscriptionRetryDecision::Stop, + } +} + +fn retry_after_duration(headers: &reqwest::header::HeaderMap) -> Option { + headers + .get(reqwest::header::RETRY_AFTER) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .map(Duration::from_secs) +} + +async fn send_transcription_request_with_timeout( + request: reqwest::RequestBuilder, + timeout: Duration, +) -> Result { + // Use an explicit async deadline because reqwest otherwise has no end-to-end request timeout + // on this client builder. + with_transcription_timeout(send_transcription_request(request), timeout).await +} + +async fn with_transcription_timeout( + future: F, + timeout: Duration, +) -> Result +where + F: Future>, +{ + match tokio::time::timeout(timeout, future).await { + Ok(result) => result, + Err(_) => Err(TranscriptionRequestError::Timeout(timeout)), + } +} + +async fn send_transcription_request( + request: reqwest::RequestBuilder, +) -> Result { + let response = request + .send() + .await + .map_err(|error| TranscriptionRequestError::Send(error.to_string()))?; + if !response.status().is_success() { + let status = response.status(); + let retry_after = retry_after_duration(response.headers()); + let body = response + .text() + .await + .unwrap_or_else(|_| "".to_string()); + return Err(TranscriptionRequestError::Status { + status, + body, + retry_after, + }); + } + + let value: serde_json::Value = response + .json() + .await + .map_err(|error| TranscriptionRequestError::Json(error.to_string()))?; + Ok(value) +} + #[cfg(test)] mod tests { + use super::RecordedAudio; + use super::TranscriptionAttempt; + use super::TranscriptionRequestError; + use super::TranscriptionRetryDecision; use super::convert_pcm16; + use super::encode_wav_normalized; + use super::send_transcription_request_with_timeout; + use super::transcription_request_attempts; + use super::transcription_retry_decision; use pretty_assertions::assert_eq; + use 
std::io::Cursor; + use std::net::Ipv4Addr; + use std::time::Duration; + use tokio::time; #[test] fn convert_pcm16_downmixes_and_resamples_for_model_input() { @@ -486,4 +1218,182 @@ mod tests { ); assert_eq!(converted, vec![200, 700]); } + + #[test] + fn encode_wav_normalized_outputs_24khz_mono_audio() { + let audio = RecordedAudio { + data: vec![100, -100, 200, -200], + sample_rate: 48_000, + channels: 2, + }; + + let bytes = encode_wav_normalized(&audio).unwrap(); + let reader = hound::WavReader::new(Cursor::new(bytes)).unwrap(); + let spec = reader.spec(); + + assert_eq!(spec.channels, 1); + assert_eq!(spec.sample_rate, 24_000); + assert_eq!(spec.bits_per_sample, 16); + } + + #[test] + fn transcription_request_attempts_scale_with_audio_duration() { + let actual = [0.0, f32::NAN, 1.0, 5.0, 10.0, 20.0] + .into_iter() + .map(|duration_seconds| { + transcription_request_attempts(duration_seconds).map(|attempt| attempt.timeout) + }) + .collect::>(); + + assert_eq!( + actual, + vec![ + [ + Duration::from_secs(2), + Duration::from_secs(4), + Duration::from_secs(60), + ], + [ + Duration::from_secs(2), + Duration::from_secs(4), + Duration::from_secs(60), + ], + [ + Duration::from_secs(2), + Duration::from_secs(4), + Duration::from_secs(60), + ], + [ + Duration::from_secs(10), + Duration::from_secs(15), + Duration::from_secs(60), + ], + [ + Duration::from_secs(15), + Duration::from_secs(30), + Duration::from_secs(60), + ], + [ + Duration::from_secs(15), + Duration::from_secs(30), + Duration::from_secs(60), + ], + ] + ); + } + + #[test] + fn transcription_retry_decision_retries_only_transient_failures() { + let next_attempt = Some(TranscriptionAttempt { + number: 2, + timeout: Duration::from_secs(4), + }); + + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Timeout(Duration::from_secs(2)), + next_attempt + ), + TranscriptionRetryDecision::Retry { delay: None } + ); + assert_eq!( + transcription_retry_decision( + 
&TranscriptionRequestError::Send("connection reset".to_string()), + next_attempt + ), + TranscriptionRetryDecision::Retry { delay: None } + ); + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Status { + status: reqwest::StatusCode::BAD_GATEWAY, + body: "bad gateway".to_string(), + retry_after: None, + }, + next_attempt + ), + TranscriptionRetryDecision::Retry { delay: None } + ); + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Status { + status: reqwest::StatusCode::TOO_MANY_REQUESTS, + body: "slow down".to_string(), + retry_after: Some(Duration::from_secs(3)), + }, + next_attempt + ), + TranscriptionRetryDecision::Retry { + delay: Some(Duration::from_secs(3)) + } + ); + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Status { + status: reqwest::StatusCode::TOO_MANY_REQUESTS, + body: "slow down".to_string(), + retry_after: Some(Duration::from_secs(5)), + }, + next_attempt + ), + TranscriptionRetryDecision::Stop + ); + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Status { + status: reqwest::StatusCode::UNAUTHORIZED, + body: "no".to_string(), + retry_after: None, + }, + next_attempt + ), + TranscriptionRetryDecision::Stop + ); + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Json("invalid".to_string()), + next_attempt + ), + TranscriptionRetryDecision::Stop + ); + assert_eq!( + transcription_retry_decision( + &TranscriptionRequestError::Timeout(Duration::from_secs(60)), + None + ), + TranscriptionRetryDecision::Stop + ); + } + + #[tokio::test(flavor = "current_thread", start_paused = true)] + async fn transcription_request_times_out_unresponsive_endpoint() { + let listener = tokio::net::TcpListener::bind((Ipv4Addr::LOCALHOST, 0)) + .await + .unwrap(); + let url = format!("http://{}/transcribe", listener.local_addr().unwrap()); + let server = tokio::spawn(async move { + let (_socket, _) = listener.accept().await.unwrap(); + 
std::future::pending::<()>().await; + }); + + let timeout = Duration::from_secs(10); + let task = tokio::spawn(send_transcription_request_with_timeout( + reqwest::Client::new().get(url), + timeout, + )); + tokio::task::yield_now().await; + time::advance(timeout).await; + + let err = time::timeout(Duration::from_millis(1), task) + .await + .unwrap() + .unwrap() + .unwrap_err(); + server.abort(); + + assert_eq!( + err.to_string(), + "transcription request timed out after 10.00s" + ); + } } diff --git a/docs/tui-chat-composer.md b/docs/tui-chat-composer.md index 0ad5c693b3..f4b02cc7e6 100644 --- a/docs/tui-chat-composer.md +++ b/docs/tui-chat-composer.md @@ -84,6 +84,7 @@ Flags: - `popups_enabled` - `slash_commands_enabled` - `image_paste_enabled` +- `voice_transcription_space_hold_delay_ms` Key effects when disabled: @@ -91,6 +92,8 @@ Key effects when disabled: - When `slash_commands_enabled` is `false`, the composer does not treat `/...` input as commands. - When `slash_commands_enabled` is `false`, slash-context paste-burst exceptions are disabled. - When `image_paste_enabled` is `false`, file-path paste image attachment is skipped. +- `voice_transcription_space_hold_delay_ms` only affects the non-empty-composer Space hold path; + empty composers still start voice capture immediately. - `ChatWidget` may toggle `image_paste_enabled` at runtime based on the selected model's `input_modalities`; attach and submit paths also re-check support and emit a warning instead of dropping the draft. @@ -120,6 +123,14 @@ the input starts with `!` (shell command). The same preparation path is reused for slash commands with arguments (for example `/plan` and `/review`) so pasted content and text elements are preserved when extracting args. +If a voice transcription placeholder is still resolving, `handle_submission` records whether the +user pressed Enter (submit) or Tab (queue while a task is running, submit otherwise) and leaves the +draft visible. 
While that pending transcription submission exists, key edits are ignored so the +submitted draft cannot drift from what the user committed. When transcription completes, the +placeholder is replaced with the produced text and the normal `prepare_submission_text` path runs. +If transcription fails, the placeholder is removed, the draft remains editable, and the UI renders +a transcription error instead of submitting anything. + The composer also treats the textarea kill buffer as separate editing state from the visible draft. After submit or slash-command dispatch clears the textarea, the most recent `Ctrl+K` payload is still available for `Ctrl+Y`. This supports flows where a user kills part of a draft, runs a