diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock
index e079e3af0e..aca09dedb8 100644
--- a/MODULE.bazel.lock
+++ b/MODULE.bazel.lock
@@ -1060,6 +1060,7 @@
       "home_0.5.12": "{\"dependencies\":[{\"features\":[\"Win32_Foundation\",\"Win32_UI_Shell\",\"Win32_System_Com\"],\"name\":\"windows-sys\",\"req\":\"^0.61\",\"target\":\"cfg(windows)\"}],\"features\":{}}",
       "home_0.5.9": "{\"dependencies\":[{\"features\":[\"Win32_Foundation\",\"Win32_UI_Shell\",\"Win32_System_Com\"],\"name\":\"windows-sys\",\"req\":\"^0.52\",\"target\":\"cfg(windows)\"}],\"features\":{}}",
       "hostname_0.4.2": "{\"dependencies\":[{\"name\":\"cfg-if\",\"req\":\"^1\"},{\"name\":\"libc\",\"req\":\"^0.2\",\"target\":\"cfg(any(unix, target_os = \\\"redox\\\"))\"},{\"kind\":\"dev\",\"name\":\"similar-asserts\",\"req\":\"^1.6.1\"},{\"kind\":\"dev\",\"name\":\"version-sync\",\"req\":\"^0.9\"},{\"kind\":\"dev\",\"name\":\"windows-bindgen\",\"req\":\"^0.65\"},{\"name\":\"windows-link\",\"req\":\"^0.2\",\"target\":\"cfg(target_os = \\\"windows\\\")\"}],\"features\":{\"default\":[],\"set\":[]}}",
+      "hound_3.5.1": "{\"dependencies\":[{\"kind\":\"dev\",\"name\":\"cpal\",\"req\":\"^0.2.12\"}],\"features\":{}}",
       "http-body-util_0.1.3": "{\"dependencies\":[{\"name\":\"bytes\",\"req\":\"^1\"},{\"default_features\":false,\"name\":\"futures-core\",\"req\":\"^0.3\"},{\"default_features\":false,\"kind\":\"dev\",\"name\":\"futures-util\",\"req\":\"^0.3\"},{\"name\":\"http\",\"req\":\"^1\"},{\"name\":\"http-body\",\"req\":\"^1\"},{\"name\":\"pin-project-lite\",\"req\":\"^0.2\"},{\"features\":[\"sync\"],\"name\":\"tokio\",\"optional\":true,\"req\":\"^1\"},{\"features\":[\"macros\",\"rt\",\"sync\",\"rt-multi-thread\"],\"kind\":\"dev\",\"name\":\"tokio\",\"req\":\"^1\"}],\"features\":{\"channel\":[\"dep:tokio\"],\"default\":[],\"full\":[\"channel\"]}}",
       "http-body_0.4.6": "{\"dependencies\":[{\"name\":\"bytes\",\"req\":\"^1\"},{\"name\":\"http\",\"req\":\"^0.2\"},{\"name\":\"pin-project-lite\",\"req\":\"^0.2\"},{\"features\":[\"macros\",\"rt\"],\"kind\":\"dev\",\"name\":\"tokio\",\"req\":\"^1\"}],\"features\":{}}",
       "http-body_1.0.1": "{\"dependencies\":[{\"name\":\"bytes\",\"req\":\"^1\"},{\"name\":\"http\",\"req\":\"^1\"}],\"features\":{}}",
diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
index 2bfcbf88c7..ad79780465 100644
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -3563,6 +3563,7 @@ dependencies = [
  "codex-arg0",
  "codex-chatgpt",
  "codex-cli",
+ "codex-client",
  "codex-cloud-requirements",
  "codex-config",
  "codex-connectors",
@@ -3607,6 +3608,7 @@ dependencies = [
  "diffy",
  "dirs",
  "dunce",
+ "hound",
  "image",
  "insta",
  "itertools 0.14.0",
@@ -7022,6 +7024,12 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "hound"
+version = "3.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
+
 [[package]]
 name = "http"
 version = "0.2.12"
@@ -10642,6 +10650,7 @@ dependencies = [
  "js-sys",
  "log",
  "mime",
+ "mime_guess",
  "native-tls",
  "percent-encoding",
  "pin-project-lite",
diff --git a/codex-rs/config/src/config_toml.rs b/codex-rs/config/src/config_toml.rs
index b8702072b2..6bf18fb45a 100644
--- a/codex-rs/config/src/config_toml.rs
+++ b/codex-rs/config/src/config_toml.rs
@@ -292,6 +292,10 @@ pub struct ConfigToml {
     #[serde(default)]
     pub audio: Option<RealtimeAudioToml>,
 
+    /// Delay before holding Space on a non-empty composer switches into voice
+    /// transcription instead of inserting a literal space.
+    pub voice_transcription_space_hold_delay_ms: Option<u64>,
+
     /// Experimental / do not use. Overrides only the realtime conversation
     /// websocket transport base URL (the `Op::RealtimeConversation`
     /// `/v1/realtime`
diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json
index 7bab6c6bf2..6648451162 100644
--- a/codex-rs/core/config.schema.json
+++ b/codex-rs/core/config.schema.json
@@ -590,6 +590,9 @@
             "use_linux_sandbox_bwrap": {
               "type": "boolean"
             },
+            "voice_transcription": {
+              "type": "boolean"
+            },
             "web_search": {
               "type": "boolean"
             },
@@ -3633,6 +3636,9 @@
         "use_linux_sandbox_bwrap": {
           "type": "boolean"
         },
+        "voice_transcription": {
+          "type": "boolean"
+        },
         "web_search": {
           "type": "boolean"
         },
@@ -4043,6 +4049,12 @@
       ],
       "description": "Collection of settings that are specific to the TUI."
     },
+    "voice_transcription_space_hold_delay_ms": {
+      "description": "Delay before holding Space on a non-empty composer switches into voice transcription instead of inserting a literal space.",
+      "format": "uint64",
+      "minimum": 0.0,
+      "type": "integer"
+    },
     "watchdog_interval_s": {
       "description": "Watchdog polling interval in seconds.",
       "format": "int64",
diff --git a/codex-rs/core/src/config/config_tests.rs b/codex-rs/core/src/config/config_tests.rs
index c002921043..a9c949b757 100644
--- a/codex-rs/core/src/config/config_tests.rs
+++ b/codex-rs/core/src/config/config_tests.rs
@@ -6045,6 +6045,8 @@ async fn test_precedence_fixture_with_o3_profile() -> std::io::Result<()> {
             personality: Some(Personality::Pragmatic),
             chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
             realtime_audio: RealtimeAudioConfig::default(),
+            voice_transcription_space_hold_delay_ms:
+                DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
             experimental_realtime_start_instructions: None,
             experimental_realtime_ws_base_url: None,
             experimental_realtime_ws_model: None,
@@ -6244,6 +6246,7 @@ async fn test_precedence_fixture_with_gpt3_profile() -> std::io::Result<()> {
         personality: Some(Personality::Pragmatic),
         chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
         realtime_audio: RealtimeAudioConfig::default(),
+        voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
         experimental_realtime_start_instructions: None,
         experimental_realtime_ws_base_url: None,
         experimental_realtime_ws_model: None,
@@ -6397,6 +6400,7 @@ async fn test_precedence_fixture_with_zdr_profile() -> std::io::Result<()> {
         personality: Some(Personality::Pragmatic),
         chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
         realtime_audio: RealtimeAudioConfig::default(),
+        voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
         experimental_realtime_start_instructions: None,
         experimental_realtime_ws_base_url: None,
         experimental_realtime_ws_model: None,
@@ -6535,6 +6539,7 @@ async fn test_precedence_fixture_with_gpt5_profile() -> std::io::Result<()> {
         personality: Some(Personality::Pragmatic),
         chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
         realtime_audio: RealtimeAudioConfig::default(),
+        voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
         experimental_realtime_start_instructions: None,
         experimental_realtime_ws_base_url: None,
         experimental_realtime_ws_model: None,
@@ -8677,6 +8682,29 @@ speaker = "Desk Speakers"
     Ok(())
 }
 
+#[tokio::test]
+async fn voice_transcription_space_hold_delay_loads_from_config_toml() -> std::io::Result<()> {
+    let parsed: ConfigToml = toml::from_str(
+        r#"
+voice_transcription_space_hold_delay_ms = 250
+"#,
+    )
+    .expect("TOML deserialization should succeed");
+
+    // The raw TOML layer keeps the setting optional.
+    assert_eq!(parsed.voice_transcription_space_hold_delay_ms, Some(250));
+
+    let home_dir = TempDir::new()?;
+    let overrides = ConfigOverrides::default();
+    let loaded =
+        Config::load_from_base_config_with_overrides(parsed, overrides, home_dir.abs())
+            .await?;
+
+    // The resolved config carries the explicit value instead of the built-in default.
+    assert_eq!(loaded.voice_transcription_space_hold_delay_ms, 250);
+    Ok(())
+}
+
 #[derive(Deserialize, Debug, PartialEq)]
 struct TuiTomlTest {
     #[serde(default, flatten)]
diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs
index dee8b09d30..5abef4fb52 100644
--- a/codex-rs/core/src/config/mod.rs
+++ b/codex-rs/core/src/config/mod.rs
@@ -168,6 +168,8 @@ pub(crate) const DEFAULT_AGENT_MAX_DEPTH: i32 = 1;
 pub(crate) const DEFAULT_AGENT_JOB_MAX_RUNTIME_SECONDS: Option<u64> = None;
 const LOCAL_DEV_BUILD_VERSION: &str = "0.0.0";
 pub(crate) const DEFAULT_WATCHDOG_INTERVAL_S: i64 = 10;
+/// Default delay before holding Space on a non-empty composer switches into voice transcription.
+pub const DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS: u64 = 1_000;
 
 pub const CONFIG_TOML_FILE: &str = "config.toml";
 
@@ -672,6 +674,10 @@ pub struct Config {
     /// Machine-local realtime audio device preferences used by realtime voice.
     pub realtime_audio: RealtimeAudioConfig,
 
+    /// Delay before holding Space on a non-empty composer switches into voice
+    /// transcription instead of inserting a literal space.
+    pub voice_transcription_space_hold_delay_ms: u64,
+
     /// Experimental / do not use. Overrides only the realtime conversation
     /// websocket transport base URL (the `Op::RealtimeConversation`
     /// `/v1/realtime`
@@ -2804,6 +2810,9 @@ impl Config {
                     microphone: audio.microphone,
                     speaker: audio.speaker,
                 }),
+            voice_transcription_space_hold_delay_ms: cfg
+                .voice_transcription_space_hold_delay_ms
+                .unwrap_or(DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS),
             experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url,
             experimental_realtime_ws_model: cfg.experimental_realtime_ws_model,
             realtime: cfg
diff --git a/codex-rs/features/src/lib.rs b/codex-rs/features/src/lib.rs
index 9fc287e8fb..ee9bb0310e 100644
--- a/codex-rs/features/src/lib.rs
+++ b/codex-rs/features/src/lib.rs
@@ -209,6 +209,8 @@ pub enum Feature {
     Artifact,
     /// Enable Fast mode selection in the TUI and request layer.
     FastMode,
+    /// Enable voice transcription in the TUI composer.
+    VoiceTranscription,
     /// Enable experimental realtime voice conversation mode in the TUI.
     RealtimeConversation,
     /// Connect app-server to the ChatGPT remote control service.
@@ -1014,6 +1016,12 @@ pub const FEATURES: &[FeatureSpec] = &[
         stage: Stage::Stable,
         default_enabled: true,
     },
+    FeatureSpec {
+        id: Feature::VoiceTranscription,
+        key: "voice_transcription",
+        stage: Stage::UnderDevelopment,
+        default_enabled: false,
+    },
     FeatureSpec {
         id: Feature::RealtimeConversation,
         key: "realtime_conversation",
diff --git a/codex-rs/tui/Cargo.toml b/codex-rs/tui/Cargo.toml
index 300449a414..8fc4e04b4e 100644
--- a/codex-rs/tui/Cargo.toml
+++ b/codex-rs/tui/Cargo.toml
@@ -31,6 +31,7 @@ codex-app-server-protocol = { workspace = true }
 codex-arg0 = { workspace = true }
 codex-install-context = { workspace = true }
 codex-chatgpt = { workspace = true }
+codex-client = { workspace = true }
 codex-cloud-requirements = { workspace = true }
 codex-config = { workspace = true }
 codex-connectors = { workspace = true }
@@ -83,7 +84,7 @@ ratatui = { workspace = true, features = [
 ] }
 ratatui-macros = { workspace = true }
 regex-lite = { workspace = true }
-reqwest = { workspace = true, features = ["json"] }
+reqwest = { workspace = true, features = ["json", "multipart"] }
 rmcp = { workspace = true }
 serde = { workspace = true, features = ["derive"] }
 serde_json = { workspace = true, features = ["preserve_order"] }
@@ -122,6 +123,7 @@ tokio-util = { workspace = true, features = ["time"] }
 
 [target.'cfg(not(target_os = "linux"))'.dependencies]
 cpal = "0.15"
+hound = "3.5"
 
 [target.'cfg(unix)'.dependencies]
 libc = { workspace = true }
diff --git a/codex-rs/tui/src/app/event_dispatch.rs b/codex-rs/tui/src/app/event_dispatch.rs
index d98888ba04..8c551e71d5 100644
--- a/codex-rs/tui/src/app/event_dispatch.rs
+++ b/codex-rs/tui/src/app/event_dispatch.rs
@@ -1723,6 +1723,25 @@ impl App {
                 }
             },
             #[cfg(not(target_os = "linux"))]
+            AppEvent::TranscriptionComplete { id, text } => {
+                self.chat_widget.replace_transcription(&id, &text);
+                tui.frame_requester().schedule_frame();
+            }
+            #[cfg(not(target_os = "linux"))]
+            AppEvent::TranscriptionFailed { id, error } => {
+                self.chat_widget.fail_transcription(&id, &error);
+            }
+            #[cfg(not(target_os = "linux"))]
+            AppEvent::TranscriptionRetrying {
+                id,
+                attempt,
+                max_attempts,
+            } => {
+                self.chat_widget
+                    .show_transcription_retrying(&id, attempt, max_attempts);
+                tui.frame_requester().schedule_frame();
+            }
+            #[cfg(not(target_os = "linux"))]
             AppEvent::UpdateRecordingMeter { id, text } => {
                 // Update in place to preserve the element id for subsequent frames.
                 let updated = self.chat_widget.update_recording_meter_in_place(&id, &text);
diff --git a/codex-rs/tui/src/app_event.rs b/codex-rs/tui/src/app_event.rs
index 97fb26b7fc..edbcab7d40 100644
--- a/codex-rs/tui/src/app_event.rs
+++ b/codex-rs/tui/src/app_event.rs
@@ -729,6 +729,29 @@ pub(crate) enum AppEvent {
         text: String,
     },
 
+    /// Voice transcription finished for the given placeholder id.
+    #[cfg(not(target_os = "linux"))]
+    TranscriptionComplete {
+        id: String,
+        text: String,
+    },
+
+    /// Voice transcription failed; remove the placeholder identified by `id`.
+    #[cfg(not(target_os = "linux"))]
+    TranscriptionFailed {
+        id: String,
+        #[allow(dead_code)]
+        error: String,
+    },
+
+    /// Voice transcription timed out or hit a transient failure and is retrying.
+    #[cfg(not(target_os = "linux"))]
+    TranscriptionRetrying {
+        id: String,
+        attempt: usize,
+        max_attempts: usize,
+    },
+
     /// Open the branch picker option from the review popup.
     OpenReviewBranchPicker(PathBuf),
 
diff --git a/codex-rs/tui/src/bottom_pane/chat_composer.rs b/codex-rs/tui/src/bottom_pane/chat_composer.rs
index c879743425..6897988d4a 100644
--- a/codex-rs/tui/src/bottom_pane/chat_composer.rs
+++ b/codex-rs/tui/src/bottom_pane/chat_composer.rs
@@ -41,6 +41,10 @@
 //! `Enter` submits immediately. `Tab` requests queuing while a task is running; if no task is
 //! running, `Tab` submits just like Enter so input is never dropped.
 //! `Tab` does not submit when entering a `!` shell command.
+//! When a voice transcription placeholder is still resolving, `Enter`/`Tab` records the submit or
+//! queue intent and waits for the transcription result before running the normal submission path.
+//! While that intent is pending, the draft is treated as committed and further key edits are
+//! ignored.
 //!
 //! On submit/queue paths, the composer:
 //!
@@ -121,11 +125,22 @@
 //! overall state machine, since it affects which transitions are even possible from a given UI
 //! state.
 //!
+//! # Voice Hold-To-Talk Without Key Release
+//!
+//! On terminals that do not report `KeyEventKind::Release`, space hold-to-talk uses repeated
+//! space key events as "still held" evidence:
+//!
+//! - For pending holds (non-empty composer), if the configured timeout elapses without any
+//!   repeated space event, we treat the key as a normal typed space.
+//! - If repeated space events are seen before timeout, we proceed with hold-to-talk.
+//! - While recording, repeated space events keep the recording alive; if they stop for a short
+//!   window, we stop and transcribe.
 use crate::bottom_pane::footer::goal_status_indicator_line;
 use crate::bottom_pane::footer::mode_indicator_line;
 use crate::key_hint;
 use crate::key_hint::KeyBinding;
 use crate::key_hint::has_ctrl_or_alt;
+use crate::legacy_core::config::DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS;
 use crate::line_truncation::truncate_line_with_ellipsis_if_overflow;
 use crate::ui_consts::FOOTER_INDENT_COLS;
 use crossterm::event::KeyCode;
@@ -230,12 +245,21 @@ use std::collections::HashSet;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::path::PathBuf;
+use std::sync::Arc;
+#[cfg(not(target_os = "linux"))]
+use std::sync::Mutex;
+use std::sync::atomic::AtomicBool;
+#[cfg(not(target_os = "linux"))]
+use std::sync::atomic::Ordering;
+#[cfg(not(target_os = "linux"))]
+use std::thread;
 use std::time::Duration;
 use std::time::Instant;
 
 #[cfg(test)]
 use ratatui::style::Color;
-
+#[cfg(not(target_os = "linux"))]
+use tokio::runtime::Handle;
 /// If the pasted content exceeds this number of characters, replace it with a
 /// placeholder in the UI.
 const LARGE_PASTE_CHAR_THRESHOLD: usize = 1000;
@@ -297,6 +321,8 @@ pub(crate) struct ChatComposerConfig {
     pub(crate) slash_commands_enabled: bool,
     /// Whether pasting a file path can attach local images.
     pub(crate) image_paste_enabled: bool,
+    /// Delay before holding Space on a non-empty draft switches into voice capture.
+    pub(crate) voice_transcription_space_hold_delay_ms: u64,
 }
 
 impl Default for ChatComposerConfig {
@@ -305,6 +331,8 @@ impl Default for ChatComposerConfig {
             popups_enabled: true,
             slash_commands_enabled: true,
             image_paste_enabled: true,
+            voice_transcription_space_hold_delay_ms:
+                DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
         }
     }
 }
@@ -319,6 +347,63 @@ impl ChatComposerConfig {
             popups_enabled: false,
             slash_commands_enabled: false,
             image_paste_enabled: false,
+            voice_transcription_space_hold_delay_ms:
+                DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
+        }
+    }
+}
+
+#[derive(Default)]
+struct VoiceState {
+    transcription_enabled: bool, // toggled via `set_voice_transcription_enabled`
+    space_hold_started_at: Option<Instant>, // when the pending Space hold began
+    space_hold_element_id: Option<String>, // named " " element inserted for the pending hold
+    space_hold_trigger: Option<Arc<AtomicBool>>, // set to true by the hold-delay timer
+    key_release_supported: bool, // seeded by `new`; set once a Release event is observed
+    space_hold_repeat_seen: bool, // Space repeated during a pending hold (no-release terminals)
+    #[cfg(not(target_os = "linux"))]
+    voice: Option<crate::voice::VoiceCapture>, // `Some` is treated as "currently recording"
+    #[cfg(not(target_os = "linux"))]
+    recording_placeholder_id: Option<String>,
+    #[cfg(not(target_os = "linux"))]
+    space_recording_started_at: Option<Instant>,
+    #[cfg(not(target_os = "linux"))]
+    space_recording_last_repeat_at: Option<Instant>, // last Space press/repeat while recording
+}
+
+#[cfg(not(target_os = "linux"))]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum TranscriptionSubmissionDisposition {
+    /// Submit as the next user turn once transcription completes.
+    Submit,
+    /// Queue for the end of the active turn once transcription completes.
+    Queue,
+}
+
+#[cfg(not(target_os = "linux"))]
+#[derive(Clone, Debug, PartialEq, Eq)]
+struct PendingTranscriptionSubmission {
+    /// Placeholder whose final text must arrive before submission can continue.
+    placeholder_id: String,
+    disposition: TranscriptionSubmissionDisposition,
+}
+
+#[cfg(not(target_os = "linux"))]
+#[derive(Clone, Debug, PartialEq, Eq)]
+enum RecordingStopOutcome {
+    /// No active recording was available to stop.
+    NoRecording,
+    /// Recording ended without starting transcription, for example because it was too short.
+    NoTranscription,
+    /// Transcription is now running for the retained placeholder.
+    Transcribing { placeholder_id: String },
+}
+
+impl VoiceState {
+    fn new(key_release_supported: bool) -> Self {
+        Self {
+            key_release_supported,
+            ..Default::default()
         }
     }
 }
@@ -344,6 +429,12 @@ pub(crate) struct ChatComposer {
     /// `[Image #M+1]..[Image #N]`, where `M` is the number of remote images.
     attached_images: Vec<AttachedImage>,
     placeholder_text: String,
+    voice_state: VoiceState,
+    #[cfg(not(target_os = "linux"))]
+    pending_transcription_submission: Option<PendingTranscriptionSubmission>,
+    // Spinner control flags keyed by placeholder id; set to true to stop.
+    #[cfg(not(target_os = "linux"))]
+    spinner_stop_flags: HashMap<String, Arc<AtomicBool>>,
     is_task_running: bool,
     /// When false, the composer is temporarily read-only (e.g. during sandbox setup).
     input_enabled: bool,
@@ -545,6 +636,11 @@ impl ChatComposer {
             frame_requester: None,
             attached_images: Vec::new(),
             placeholder_text,
+            voice_state: VoiceState::new(enhanced_keys_supported),
+            #[cfg(not(target_os = "linux"))]
+            pending_transcription_submission: None,
+            #[cfg(not(target_os = "linux"))]
+            spinner_stop_flags: HashMap::new(),
             is_task_running: false,
             input_enabled: true,
             input_disabled_placeholder: None,
@@ -747,6 +843,32 @@ impl ChatComposer {
     /// Compatibility shim for tests that still toggle the removed steer mode flag.
     #[cfg(test)]
     pub fn set_steer_enabled(&mut self, _enabled: bool) {}
+
+    pub fn set_voice_transcription_enabled(&mut self, enabled: bool) {
+        self.voice_state.transcription_enabled = enabled;
+        if !enabled {
+            self.voice_state.space_hold_started_at = None;
+            if let Some(id) = self.voice_state.space_hold_element_id.take() {
+                let _ = self.textarea.replace_element_by_id(&id, " ");
+            }
+            self.voice_state.space_hold_trigger = None;
+            self.voice_state.space_hold_repeat_seen = false;
+            #[cfg(not(target_os = "linux"))]
+            {
+                self.pending_transcription_submission = None;
+            }
+        }
+    }
+
+    pub(crate) fn set_voice_transcription_space_hold_delay_ms(&mut self, delay_ms: u64) {
+        self.config.voice_transcription_space_hold_delay_ms = delay_ms;
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    fn voice_transcription_enabled(&self) -> bool {
+        self.voice_state.transcription_enabled // fn is already cfg-gated to non-Linux targets
+    }
+
     /// Centralized feature gating keeps config checks out of call sites.
     fn popups_enabled(&self) -> bool {
         self.config.popups_enabled
@@ -969,6 +1091,10 @@ impl ChatComposer {
     /// remote images). Cursor is placed at the end after rebuilding elements.
     pub(crate) fn apply_external_edit(&mut self, text: String) {
         self.pending_pastes.clear();
+        #[cfg(not(target_os = "linux"))]
+        {
+            self.pending_transcription_submission = None;
+        }
         let (text, _) = self.imported_text_for_textarea(text, Vec::new());
 
         // Count placeholder occurrences in the new text.
@@ -1158,6 +1284,13 @@ impl ChatComposer {
         local_image_paths: Vec<PathBuf>,
         mention_bindings: Vec<MentionBinding>,
     ) {
+        #[cfg(not(target_os = "linux"))]
+        self.stop_all_transcription_spinners();
+        #[cfg(not(target_os = "linux"))]
+        {
+            self.pending_transcription_submission = None;
+        }
+
         // Clear any existing content, placeholders, and attachments first.
         self.textarea.set_text_clearing_elements("");
         self.is_bash_mode = false;
@@ -1546,11 +1679,20 @@ impl ChatComposer {
 
     /// Handle a key event coming from the main UI.
     pub fn handle_key_event(&mut self, key_event: KeyEvent) -> (InputResult, bool) {
+        if matches!(key_event.kind, KeyEventKind::Release) {
+            self.voice_state.key_release_supported = true;
+        }
+
+        if let Some(result) = self.handle_key_event_while_recording(key_event) {
+            return result;
+        }
+
         if !self.input_enabled {
             return (InputResult::None, false);
         }
 
-        if matches!(key_event.kind, KeyEventKind::Release) {
+        #[cfg(not(target_os = "linux"))]
+        if self.pending_transcription_submission.is_some() {
             return (InputResult::None, false);
         }
 
@@ -1562,6 +1704,29 @@ impl ChatComposer {
             return self.begin_history_search();
         }
 
+        // Outside of recording, ignore all key releases globally except for Space,
+        // which is handled explicitly for hold-to-talk behavior below.
+        if matches!(key_event.kind, KeyEventKind::Release)
+            && !matches!(key_event.code, KeyCode::Char(' '))
+        {
+            return (InputResult::None, false);
+        }
+
+        if self.voice_state.space_hold_started_at.is_some()
+            && !matches!(key_event.code, KeyCode::Char(' '))
+        {
+            self.voice_state.space_hold_started_at = None;
+            if let Some(id) = self.voice_state.space_hold_element_id.take() {
+                let _ = self.textarea.replace_element_by_id(&id, " ");
+            }
+            self.voice_state.space_hold_trigger = None;
+            self.voice_state.space_hold_repeat_seen = false;
+        }
+
+        if let Some(result) = self.handle_voice_space_key_event(&key_event) {
+            return result;
+        }
+
         let result = match &mut self.active_popup {
             ActivePopup::Command(_) => self.handle_key_event_with_slash_popup(key_event),
             ActivePopup::File(_) => self.handle_key_event_with_file_popup(key_event),
@@ -2574,6 +2739,12 @@ impl ChatComposer {
         should_queue: bool,
         now: Instant,
     ) -> (InputResult, bool) {
+        #[cfg(not(target_os = "linux"))]
+        if self.defer_active_transcription_submission(Self::transcription_disposition(should_queue))
+        {
+            return (InputResult::None, true);
+        }
+
         if should_queue {
             let raw_text = self.textarea.text();
             let defer_slash_validation =
@@ -2596,7 +2767,6 @@ impl ChatComposer {
                     true,
                 );
             }
-            return (InputResult::None, true);
         }
 
         // If the first line is a bare built-in slash command (no args),
@@ -3046,6 +3216,150 @@ impl ChatComposer {
         }
     }
 
+    #[cfg(target_os = "linux")]
+    fn handle_voice_space_key_event(
+        &mut self,
+        _key_event: &KeyEvent,
+    ) -> Option<(InputResult, bool)> {
+        None // Linux stub: never consumes the key, so regular key handling continues
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    fn handle_voice_space_key_event(
+        &mut self,
+        key_event: &KeyEvent,
+    ) -> Option<(InputResult, bool)> {
+        if !self.voice_transcription_enabled() || !matches!(key_event.code, KeyCode::Char(' ')) {
+            return None;
+        }
+        match key_event.kind {
+            KeyEventKind::Press => {
+                if self.paste_burst.is_active() {
+                    return None;
+                }
+
+                // If textarea is empty, start recording immediately without inserting a space.
+                if self.textarea.text().is_empty() {
+                    if self.start_recording_with_placeholder() {
+                        return Some((InputResult::None, true));
+                    }
+                    return None;
+                }
+
+                // If a hold is already pending, swallow further press events to
+                // avoid inserting multiple spaces and resetting the timer on key repeat.
+                if self.voice_state.space_hold_started_at.is_some() {
+                    if !self.voice_state.key_release_supported {
+                        self.voice_state.space_hold_repeat_seen = true;
+                    }
+                    return Some((InputResult::None, false));
+                }
+
+                // Insert a named element that renders as a space so we can later
+                // remove it on timeout or convert it to a plain space on release.
+                let elem_id = self.next_id();
+                self.textarea.insert_named_element(" ", elem_id.clone());
+
+                // Record pending hold metadata.
+                self.voice_state.space_hold_started_at = Some(Instant::now());
+                self.voice_state.space_hold_element_id = Some(elem_id);
+                self.voice_state.space_hold_repeat_seen = false;
+
+                // Spawn a delayed task to flip an atomic flag; we check it on next key event.
+                let flag = Arc::new(AtomicBool::new(false));
+                let frame = self.frame_requester.clone();
+                Self::schedule_space_hold_timer(
+                    flag.clone(),
+                    frame,
+                    self.config.voice_transcription_space_hold_delay_ms,
+                );
+                self.voice_state.space_hold_trigger = Some(flag);
+
+                Some((InputResult::None, true))
+            }
+            // Repeats during a pending hold signal "still held" when releases are unreported.
+            KeyEventKind::Repeat => {
+                // Swallow repeats while a hold is pending to avoid extra spaces.
+                if self.voice_state.space_hold_started_at.is_some() {
+                    if !self.voice_state.key_release_supported {
+                        self.voice_state.space_hold_repeat_seen = true;
+                    }
+                    return Some((InputResult::None, false));
+                }
+                // Fallback: if no pending hold, treat as normal input.
+                None
+            }
+            // Space release: settle any pending hold; the release event itself is always consumed.
+            KeyEventKind::Release => {
+                // If a hold is pending, convert the element to a plain space and clear state.
+                self.voice_state.space_hold_started_at = None;
+                if let Some(id) = self.voice_state.space_hold_element_id.take() {
+                    let _ = self.textarea.replace_element_by_id(&id, " ");
+                }
+                self.voice_state.space_hold_trigger = None;
+                self.voice_state.space_hold_repeat_seen = false;
+                Some((InputResult::None, true))
+            }
+        }
+    }
+
+    #[cfg(target_os = "linux")]
+    fn handle_key_event_while_recording(
+        &mut self,
+        _key_event: KeyEvent,
+    ) -> Option<(InputResult, bool)> {
+        None // Linux stub: never consumes the key, so regular key handling continues
+    }
+
+    /// Routes key events while a voice recording is active.
+    ///
+    /// Returns `None` when nothing is being recorded; otherwise the event is consumed and the
+    /// returned flag says whether a redraw is needed.
+    #[cfg(not(target_os = "linux"))]
+    fn handle_key_event_while_recording(
+        &mut self,
+        key_event: KeyEvent,
+    ) -> Option<(InputResult, bool)> {
+        // Not recording: let the regular key handling run.
+        if self.voice_state.voice.is_none() {
+            return None;
+        }
+
+        let is_space = matches!(key_event.code, KeyCode::Char(' '));
+        let should_stop = match key_event.kind {
+            // Letting go of Space ends the recording; other key releases do not.
+            KeyEventKind::Release => is_space,
+            KeyEventKind::Press | KeyEventKind::Repeat => {
+                // Without release reporting, each Space press/repeat is timestamped as
+                // evidence that the key is still held.
+                if is_space && !self.voice_state.key_release_supported {
+                    self.voice_state.space_recording_last_repeat_at = Some(Instant::now());
+                }
+                // Pressing any key other than Space stops the recording.
+                !is_space
+            }
+        };
+
+        // Swallow non-stopping keys while recording.
+        if !should_stop {
+            return Some((InputResult::None, false));
+        }
+
+        // Resolve the key's submit/queue disposition before the recording is torn down.
+        let disposition = self.transcription_disposition_for_key(&key_event);
+        let outcome = self.stop_recording_and_start_transcription();
+        let needs_redraw = !matches!(&outcome, RecordingStopOutcome::NoRecording);
+        if let (RecordingStopOutcome::Transcribing { placeholder_id }, Some(disposition)) =
+            (outcome, disposition)
+        {
+            self.pending_transcription_submission = Some(PendingTranscriptionSubmission {
+                placeholder_id,
+                disposition,
+            });
+        }
+        Some((InputResult::None, needs_redraw))
+    }
+
     fn is_bang_shell_command(&self) -> bool {
         self.current_text().trim_start().starts_with('!')
     }
@@ -3877,6 +4191,40 @@ impl ChatComposer {
         }
     }
 
+    #[cfg(not(target_os = "linux"))]
+    pub(crate) fn is_recording(&self) -> bool {
+        self.voice_state.voice.is_some() // `voice` is set while a capture is in progress
+    }
+
+    /// Arms a one-shot timer that calls `complete_space_hold_timer` after `hold_delay_millis`.
+    #[cfg(not(target_os = "linux"))]
+    fn schedule_space_hold_timer(
+        flag: Arc<AtomicBool>,
+        frame: Option<FrameRequester>,
+        hold_delay_millis: u64,
+    ) {
+        let delay = Duration::from_millis(hold_delay_millis);
+        if let Ok(runtime) = Handle::try_current() {
+            runtime.spawn(async move {
+                tokio::time::sleep(delay).await;
+                Self::complete_space_hold_timer(flag, frame);
+            });
+        } else {
+            thread::spawn(move || {
+                thread::sleep(delay);
+                Self::complete_space_hold_timer(flag, frame);
+            });
+        }
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    fn complete_space_hold_timer(flag: Arc<AtomicBool>, frame: Option<FrameRequester>) {
+        flag.store(true, Ordering::Relaxed); // mark the hold delay as elapsed
+        if let Some(frame) = frame {
+            frame.schedule_frame(); // the flag is read from `process_space_hold_trigger`
+        }
+    }
+
     pub(crate) fn set_status_line(&mut self, status_line: Option<Line<'static>>) -> bool {
         if self.status_line_value == status_line {
             return false;
@@ -3934,10 +4282,364 @@ fn footer_insert_newline_key(
 
 #[cfg(not(target_os = "linux"))]
 impl ChatComposer {
+    /// Maps the `should_queue` flag onto the disposition used for a deferred submission.
+    fn transcription_disposition(should_queue: bool) -> TranscriptionSubmissionDisposition {
+        match should_queue {
+            true => TranscriptionSubmissionDisposition::Queue,
+            false => TranscriptionSubmissionDisposition::Submit,
+        }
+    }
+
+    /// Returns the deferred-submission disposition implied by `key_event`, if any.
+    ///
+    /// Unmodified Enter (press or repeat) maps to `Submit`. Unmodified Tab (press only)
+    /// maps to `Queue` while a task is running and `Submit` otherwise, but is ignored
+    /// when the draft is a `!` shell command. All other events yield `None`.
+    fn transcription_disposition_for_key(
+        &self,
+        key_event: &KeyEvent,
+    ) -> Option<TranscriptionSubmissionDisposition> {
+        if key_event.modifiers != KeyModifiers::NONE {
+            return None;
+        }
+        match (key_event.code, key_event.kind) {
+            (KeyCode::Enter, KeyEventKind::Press | KeyEventKind::Repeat) => {
+                Some(TranscriptionSubmissionDisposition::Submit)
+            }
+            (KeyCode::Tab, KeyEventKind::Press) if !self.is_bang_shell_command() => {
+                Some(Self::transcription_disposition(self.is_task_running))
+            }
+            _ => None,
+        }
+    }
+
+    /// Finds a placeholder id that has a registered spinner and still exists in the textarea.
+    fn active_transcription_placeholder_id(&self) -> Option<String> {
+        // Ids whose element is no longer in the textarea are skipped.
+        let is_live = |id: &&String| self.textarea.named_element_range(id).is_some();
+        self.spinner_stop_flags.keys().find(is_live).cloned()
+    }
+
+    /// Remembers `disposition` for the active transcription placeholder; an existing deferral
+    /// is kept as-is. Returns `false` only when no transcription placeholder is active.
+    fn defer_active_transcription_submission(
+        &mut self,
+        disposition: TranscriptionSubmissionDisposition,
+    ) -> bool {
+        if self.pending_transcription_submission.is_some() {
+            return true;
+        }
+        let Some(placeholder_id) = self.active_transcription_placeholder_id() else {
+            return false;
+        };
+        self.pending_transcription_submission = Some(PendingTranscriptionSubmission {
+            placeholder_id,
+            disposition,
+        });
+        true
+    }
+
+    /// Runs the submission that was deferred until placeholder `id` resolved.
+    ///
+    /// The pending entry is consumed only when it was registered for `id`; it is then
+    /// submitted or queued according to its disposition. A deferral for a different
+    /// placeholder stays pending, and that case, like having no deferral at all,
+    /// yields `InputResult::None`.
+    fn complete_pending_transcription_submission(&mut self, id: &str) -> InputResult {
+        let due = self
+            .pending_transcription_submission
+            .take_if(|pending| pending.placeholder_id == id);
+        match due.map(|pending| pending.disposition) {
+            Some(TranscriptionSubmissionDisposition::Submit) => {
+                self.handle_submission(/*should_queue*/ false).0
+            }
+            Some(TranscriptionSubmissionDisposition::Queue) => {
+                self.handle_submission(/*should_queue*/ true).0
+            }
+            None => InputResult::None,
+        }
+    }
+
+    /// Polls the space-hold state: fires the hold timeout once its timer flag is set and,
+    /// when key releases are not reported, stops an active recording after Space repeats
+    /// have gone quiet.
+    pub(crate) fn process_space_hold_trigger(&mut self) {
+        const SPACE_REPEAT_INITIAL_GRACE: Duration = Duration::from_millis(700);
+        const SPACE_REPEAT_IDLE_TIMEOUT: Duration = Duration::from_millis(250);
+
+        if self.voice_transcription_enabled()
+            && let Some(flag) = self.voice_state.space_hold_trigger.as_ref()
+            && flag.load(Ordering::Relaxed)
+            && self.voice_state.space_hold_started_at.is_some()
+            && self.voice_state.voice.is_none()
+        {
+            let _ = self.on_space_hold_timeout();
+        }
+
+        if self.voice_state.key_release_supported || self.voice_state.voice.is_none() {
+            return;
+        }
+        let now = Instant::now();
+        let elapsed_since = |mark: Instant| now.saturating_duration_since(mark);
+        if let Some(started_at) = self.voice_state.space_recording_started_at
+            && elapsed_since(started_at) >= SPACE_REPEAT_INITIAL_GRACE
+            && self
+                .voice_state
+                .space_recording_last_repeat_at
+                .is_none_or(|last| elapsed_since(last) >= SPACE_REPEAT_IDLE_TIMEOUT)
+        {
+            let _ = self.stop_recording_and_start_transcription();
+        }
+    }
+
+    /// Called when the configured non-empty-composer space hold timeout elapses.
+    ///
+    /// On terminals without key-release reporting, this only transitions into voice capture if
+    /// repeated Space events were observed while the hold was pending; otherwise the keypress
+    /// is treated as a typed space.
+    ///
+    /// Returns `true` when the keypress was committed as a typed space or recording began.
+    ///
+    /// Returns `false` when voice transcription is disabled, a recording is already active,
+    /// no hold is pending, or voice capture could not be started.
+    pub(crate) fn on_space_hold_timeout(&mut self) -> bool {
+        // Nothing to do unless a hold is pending and no recording is running.
+        if !self.voice_transcription_enabled()
+            || self.voice_state.voice.is_some()
+            || self.voice_state.space_hold_started_at.is_none()
+        {
+            return false;
+        }
+
+        // Without release events, only observed key repeats distinguish a hold from a tap.
+        let start_capture =
+            self.voice_state.key_release_supported || self.voice_state.space_hold_repeat_seen;
+
+        // Convert/remove the temporary named element before any recording placeholder is
+        // inserted. The typed space is preserved, except that starting capture right after an
+        // existing space drops it so the draft does not end up with two.
+        if let Some(id) = self.voice_state.space_hold_element_id.take() {
+            // Only the capture path inspects the character preceding the element.
+            let drop_space = start_capture
+                && self
+                    .textarea
+                    .named_element_range(&id)
+                    .and_then(|range| self.textarea.text()[..range.start].chars().next_back())
+                    .is_some_and(|ch| ch == ' ');
+            let replacement = if drop_space { "" } else { " " };
+            let _ = self.textarea.replace_element_by_id(&id, replacement);
+        }
+
+        // Clear the pending hold state before starting capture (if any)
+        self.voice_state.space_hold_started_at = None;
+        self.voice_state.space_hold_trigger = None;
+        self.voice_state.space_hold_repeat_seen = false;
+
+        if start_capture {
+            // Start voice capture
+            self.start_recording_with_placeholder()
+        } else {
+            // The keypress stays in the draft as an ordinary typed space.
+            true
+        }
+    }
+
+    /// Stop recording if active, update the placeholder, and spawn background transcription.
+    /// A failed stop or a recording under one second removes the placeholder and yields
+    /// `NoTranscription`; with no active recording the result is `NoRecording`.
+    fn stop_recording_and_start_transcription(&mut self) -> RecordingStopOutcome {
+        let Some(vc) = self.voice_state.voice.take() else {
+            return RecordingStopOutcome::NoRecording;
+        };
+        self.voice_state.space_recording_started_at = None;
+        self.voice_state.space_recording_last_repeat_at = None;
+        // Claim the placeholder on every path so a failed stop cannot strand it in the draft.
+        let recording_placeholder = self.voice_state.recording_placeholder_id.take();
+        let audio = match vc.stop() {
+            Ok(audio) => audio,
+            Err(e) => {
+                tracing::error!("failed to stop voice capture: {e}");
+                if let Some(id) = recording_placeholder {
+                    let _ = self.textarea.replace_element_by_id(&id, "");
+                }
+                return RecordingStopOutcome::NoTranscription;
+            }
+        };
+
+        // If the recording is too short, remove the placeholder immediately
+        // and skip the transcribing state entirely.
+        const MIN_DURATION_SECONDS: f32 = 1.0;
+        let samples_per_second = (audio.sample_rate as f32) * (audio.channels as f32);
+        let duration_seconds = if samples_per_second > 0.0 {
+            audio.data.len() as f32 / samples_per_second
+        } else {
+            0.0
+        };
+        if duration_seconds < MIN_DURATION_SECONDS {
+            if let Some(id) = recording_placeholder {
+                let _ = self.textarea.replace_element_by_id(&id, "");
+            }
+            return RecordingStopOutcome::NoTranscription;
+        }
+
+        // Otherwise, update the placeholder to show a spinner and proceed.
+        let id = recording_placeholder.unwrap_or_else(|| self.next_id());
+        let prompt_source = match self.textarea.named_element_range(&id) {
+            Some(range) => self.textarea.text()[..range.start].to_string(),
+            None => self.textarea.text().to_string(),
+        };
+
+        // Initialize with first spinner frame immediately.
+        let _ = self.textarea.update_named_element_by_id(&id, "⠋");
+        // Spawn animated braille spinner until transcription finishes (or times out).
+        self.spawn_transcribing_spinner(id.clone());
+        let tx = self.app_event_tx.clone();
+        crate::voice::transcribe_async(id.clone(), audio, Some(prompt_source), tx);
+        RecordingStopOutcome::Transcribing { placeholder_id: id }
+    }
+
+    /// Start voice capture and insert a placeholder element for the live meter.
+    /// Returns true if recording began and UI should redraw; false on failure.
+    ///
+    /// When key releases are not reported, the start time is recorded as well; a failed
+    /// start is logged and clears both recording timestamps.
+    fn start_recording_with_placeholder(&mut self) -> bool {
+        let vc = match crate::voice::VoiceCapture::start() {
+            Ok(vc) => vc,
+            Err(e) => {
+                self.voice_state.space_recording_started_at = None;
+                self.voice_state.space_recording_last_repeat_at = None;
+                tracing::error!("failed to start voice capture: {e}");
+                return false;
+            }
+        };
+
+        // Grab what the meter task needs before the capture moves into `voice_state`.
+        let data = vc.data_arc();
+        let stop = vc.stopped_flag();
+        let sample_rate = vc.sample_rate();
+        let channels = vc.channels();
+        let peak = vc.last_peak_arc();
+
+        self.voice_state.voice = Some(vc);
+        self.voice_state.space_recording_started_at =
+            (!self.voice_state.key_release_supported).then(Instant::now);
+        self.voice_state.space_recording_last_repeat_at = None;
+
+        // Insert visible placeholder for the meter (no label)
+        let id = self.next_id();
+        self.textarea.insert_named_element("", id.clone());
+        self.voice_state.recording_placeholder_id = Some(id.clone());
+
+        // Spawn metering animation
+        self.spawn_recording_meter(id, sample_rate, channels, data, peak, stop);
+        true
+    }
+
+    /// Starts a background loop that refreshes the recording meter placeholder `id`.
+    ///
+    /// Every 100 ms, until `stop` is set, the latest peak is rendered through
+    /// `RecordingMeterState` and sent as an `UpdateRecordingMeter` event. The sample rate,
+    /// channel count and sample buffer are accepted but not used by the loop.
+    fn spawn_recording_meter(
+        &self,
+        id: String,
+        _sample_rate: u32,
+        _channels: u16,
+        _data: Arc<Mutex<Vec<i16>>>,
+        last_peak: Arc<std::sync::atomic::AtomicU16>,
+        stop: Arc<std::sync::atomic::AtomicBool>,
+    ) {
+        let tx = self.app_event_tx.clone();
+        let tick = std::time::Duration::from_millis(100);
+        let run_meter = move || {
+            let mut meter = crate::voice::RecordingMeterState::new();
+            while !stop.load(Ordering::Relaxed) {
+                let text = meter.next_text(last_peak.load(Ordering::Relaxed));
+                let id = id.clone();
+                tx.send(crate::app_event::AppEvent::UpdateRecordingMeter { id, text });
+                thread::sleep(tick);
+            }
+        };
+
+        // Use the runtime's `spawn_blocking` when one is current; otherwise a plain thread.
+        if let Ok(runtime) = Handle::try_current() {
+            runtime.spawn_blocking(run_meter);
+        } else {
+            thread::spawn(run_meter);
+        }
+    }
+
+    /// Animates placeholder `id` with a braille spinner, stopping any spinner already
+    /// registered for it. The stop flag is kept in `spinner_stop_flags`; the loop also
+    /// ends on its own after `MAX_TICKS` frames.
+    fn spawn_transcribing_spinner(&mut self, id: String) {
+        const FRAMES: [&str; 10] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
+        // Safety stop after ~60s to avoid a runaway task if events are lost.
+        const MAX_TICKS: usize = 600; // 600 * 100ms = 60s
+
+        self.stop_transcription_spinner(&id);
+        let stop = Arc::new(AtomicBool::new(false));
+        self.spinner_stop_flags
+            .insert(id.clone(), Arc::clone(&stop));
+
+        let tx = self.app_event_tx.clone();
+        let tick = std::time::Duration::from_millis(100);
+        let animate = move || {
+            for frame in FRAMES.iter().cycle().take(MAX_TICKS) {
+                if stop.load(Ordering::Relaxed) {
+                    break;
+                }
+                let text = frame.to_string();
+                let id = id.clone();
+                tx.send(crate::app_event::AppEvent::UpdateRecordingMeter { id, text });
+                thread::sleep(tick);
+            }
+        };
+
+        if let Ok(runtime) = Handle::try_current() {
+            runtime.spawn_blocking(animate);
+        } else {
+            thread::spawn(animate);
+        }
+    }
+
+    fn stop_transcription_spinner(&mut self, id: &str) {
+        if let Some(flag) = self.spinner_stop_flags.remove(id) {
+            flag.store(true, Ordering::Relaxed); // tells the spinner loop for `id` to exit
+        }
+    }
+
+    fn stop_all_transcription_spinners(&mut self) {
+        for (_id, flag) in self.spinner_stop_flags.drain() {
+            flag.store(true, Ordering::Relaxed); // every registered spinner is told to exit
+        }
+    }
+
+    /// Replaces placeholder `id` with `text`, then runs any submission deferred on it.
+    pub fn replace_transcription(&mut self, id: &str, text: &str) -> InputResult {
+        self.stop_transcription_spinner(id);
+        if !self.textarea.replace_element_by_id(id, text) {
+            return InputResult::None;
+        }
+        self.complete_pending_transcription_submission(id)
+    }
+
     pub fn update_recording_meter_in_place(&mut self, id: &str, text: &str) -> bool {
         self.textarea.update_named_element_by_id(id, text)
     }
 
+    pub fn show_transcription_retrying(
+        &mut self,
+        id: &str,
+        attempt: usize,
+        max_attempts: usize,
+    ) -> bool {
+        self.stop_transcription_spinner(id); // the static retry label replaces the animation
+        let label = format!("retrying {attempt}/{max_attempts}");
+        self.textarea.update_named_element_by_id(id, &label)
+    }
+
     pub fn insert_recording_meter_placeholder(&mut self, text: &str) -> String {
         let id = self.next_id();
         self.textarea.insert_named_element(text, id.clone());
@@ -3947,6 +4649,18 @@ impl ChatComposer {
     pub fn remove_recording_meter_placeholder(&mut self, id: &str) {
         let _ = self.textarea.replace_element_by_id(id, "");
     }
+
+    /// Discards the transcription placeholder `id` without submitting anything.
+    ///
+    /// Its spinner is stopped, a submission deferred on this placeholder is cancelled,
+    /// and the element is replaced with an empty string.
+    pub fn remove_transcription_placeholder(&mut self, id: &str) {
+        self.stop_transcription_spinner(id);
+        let _ = self
+            .pending_transcription_submission
+            .take_if(|pending| pending.placeholder_id == id);
+        let _ = self.textarea.replace_element_by_id(id, "");
+    }
 }
 
 fn skill_description(skill: &SkillMetadata) -> Option<String> {
@@ -7357,6 +8071,366 @@ mod tests {
         assert_queued_shell(" !echo hi", "!echo hi");
     }
 
+    #[test]
+    fn voice_transcription_disabled_treats_space_as_normal_input() {
+        use crossterm::event::KeyCode;
+        use crossterm::event::KeyEvent;
+        use crossterm::event::KeyEventKind;
+        use crossterm::event::KeyModifiers;
+
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ true,
+        );
+        composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
+        composer.move_cursor_to_end();
+        // Voice transcription is never enabled here; send a Space press and its release.
+        let _ = composer.handle_key_event(KeyEvent::new(KeyCode::Char(' '), KeyModifiers::NONE));
+        let _ = composer.handle_key_event(KeyEvent::new_with_kind(
+            KeyCode::Char(' '),
+            KeyModifiers::NONE,
+            KeyEventKind::Release,
+        ));
+        // The space lands in the draft and none of the space-hold state is populated.
+        assert_eq!("x ", composer.textarea.text());
+        assert!(composer.voice_state.space_hold_started_at.is_none());
+        assert!(composer.voice_state.space_hold_element_id.is_none());
+        assert!(composer.voice_state.space_hold_trigger.is_none());
+        assert!(!composer.voice_state.space_hold_repeat_seen);
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn space_hold_timeout_without_release_or_repeat_keeps_typed_space() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        composer.set_voice_transcription_enabled(/*enabled*/ true);
+        // Set up a pending hold whose timer flag is already set: no release support, no repeat.
+        composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
+        composer.move_cursor_to_end();
+        let elem_id = "space-hold".to_string();
+        composer.textarea.insert_named_element(" ", elem_id.clone());
+        composer.voice_state.space_hold_started_at = Some(Instant::now());
+        composer.voice_state.space_hold_element_id = Some(elem_id);
+        composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
+        composer.voice_state.key_release_supported = false;
+        composer.voice_state.space_hold_repeat_seen = false;
+        assert_eq!("x ", composer.textarea.text());
+
+        composer.process_space_hold_trigger();
+        // The draft still reads "x " and the pending hold state has been cleared.
+        assert_eq!("x ", composer.textarea.text());
+        assert!(composer.voice_state.space_hold_started_at.is_none());
+        assert!(!composer.voice_state.space_hold_repeat_seen);
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn space_hold_timeout_with_repeat_uses_hold_path_without_release() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        composer.set_voice_transcription_enabled(/*enabled*/ true);
+        // Pending hold with a repeat already observed and no key-release support.
+        composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
+        composer.move_cursor_to_end();
+        let elem_id = "space-hold".to_string();
+        composer.textarea.insert_named_element(" ", elem_id.clone());
+        composer.voice_state.space_hold_started_at = Some(Instant::now());
+        composer.voice_state.space_hold_element_id = Some(elem_id);
+        composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
+        composer.voice_state.key_release_supported = false;
+        composer.voice_state.space_hold_repeat_seen = true;
+
+        composer.process_space_hold_trigger();
+        // These assertions do not depend on whether capture actually started.
+        assert_eq!("x ", composer.textarea.text());
+        assert!(composer.voice_state.space_hold_started_at.is_none());
+        assert!(!composer.voice_state.space_hold_repeat_seen);
+        if composer.is_recording() {
+            let _ = composer.stop_recording_and_start_transcription();
+        }
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn space_hold_timeout_with_repeat_does_not_duplicate_existing_space() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        composer.set_voice_transcription_enabled(/*enabled*/ true);
+        // The draft already ends with a space before the pending hold element is inserted.
+        composer.set_text_content("x ".to_string(), Vec::new(), Vec::new());
+        composer.move_cursor_to_end();
+        let elem_id = "space-hold".to_string();
+        composer.textarea.insert_named_element(" ", elem_id.clone());
+        composer.voice_state.space_hold_started_at = Some(Instant::now());
+        composer.voice_state.space_hold_element_id = Some(elem_id);
+        composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
+        composer.voice_state.key_release_supported = false;
+        composer.voice_state.space_hold_repeat_seen = true;
+
+        composer.process_space_hold_trigger();
+        // Only the original trailing space remains; no second one is added.
+        assert_eq!("x ", composer.textarea.text());
+        assert!(composer.voice_state.space_hold_started_at.is_none());
+        assert!(!composer.voice_state.space_hold_repeat_seen);
+        if composer.is_recording() {
+            let _ = composer.stop_recording_and_start_transcription();
+        }
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn configurable_space_hold_delay_is_used_for_non_empty_drafts() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        composer.set_voice_transcription_enabled(/*enabled*/ true);
+        composer.set_voice_transcription_space_hold_delay_ms(/*delay_ms*/ 1);
+        composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
+        composer.move_cursor_to_end();
+        // With a 1 ms delay configured, the 50 ms wait should let the hold timer fire first.
+        let _ = composer.handle_key_event(KeyEvent::new(KeyCode::Char(' '), KeyModifiers::NONE));
+        std::thread::sleep(Duration::from_millis(50));
+        composer.process_space_hold_trigger();
+
+        assert!(composer.voice_state.space_hold_started_at.is_none());
+        assert!(composer.voice_state.space_hold_trigger.is_none());
+        assert_eq!("x ", composer.textarea.text());
+        if composer.is_recording() {
+            let _ = composer.stop_recording_and_start_transcription();
+        }
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn replace_transcription_stops_spinner_for_placeholder() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        // Register a spinner stop flag for an existing, empty placeholder element.
+        let id = "voice-placeholder".to_string();
+        composer.textarea.insert_named_element("", id.clone());
+        let flag = Arc::new(AtomicBool::new(false));
+        composer
+            .spinner_stop_flags
+            .insert(id.clone(), Arc::clone(&flag));
+
+        composer.replace_transcription(&id, "transcribed text");
+        // The flag is set and unregistered, and the placeholder now holds the text.
+        assert!(flag.load(Ordering::Relaxed));
+        assert!(!composer.spinner_stop_flags.contains_key(&id));
+        assert_eq!(composer.textarea.text(), "transcribed text");
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn transcription_retrying_stops_spinner_and_updates_placeholder() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        // Register a spinner stop flag for an existing, empty placeholder element.
+        let id = "voice-placeholder".to_string();
+        composer.textarea.insert_named_element("", id.clone());
+        let flag = Arc::new(AtomicBool::new(false));
+        composer
+            .spinner_stop_flags
+            .insert(id.clone(), Arc::clone(&flag));
+
+        let updated =
+            composer.show_transcription_retrying(&id, /*attempt*/ 2, /*max_attempts*/ 3);
+        // The spinner is stopped and the placeholder shows the retry label.
+        assert!(updated);
+        assert!(flag.load(Ordering::Relaxed));
+        assert!(!composer.spinner_stop_flags.contains_key(&id));
+        assert_eq!(composer.textarea.text(), "retrying 2/3");
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn enter_defers_submission_until_transcription_completes() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        // A placeholder with a registered spinner stands in for an in-flight transcription.
+        let id = composer.insert_recording_meter_placeholder("⠋");
+        composer
+            .spinner_stop_flags
+            .insert(id.clone(), Arc::new(AtomicBool::new(false)));
+
+        let (result, _needs_redraw) =
+            composer.handle_key_event(KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE));
+
+        assert_eq!(InputResult::None, result);
+        assert_eq!("⠋", composer.textarea.text());
+        // A typed character after Enter leaves the draft unchanged.
+        let (result, _needs_redraw) =
+            composer.handle_key_event(KeyEvent::new(KeyCode::Char('x'), KeyModifiers::NONE));
+
+        assert_eq!(InputResult::None, result);
+        assert_eq!("⠋", composer.textarea.text());
+        // Delivering the transcription performs the deferred submit.
+        let result = composer.replace_transcription(&id, "transcribed text");
+
+        assert_eq!(
+            InputResult::Submitted {
+                text: "transcribed text".to_string(),
+                text_elements: Vec::new()
+            },
+            result
+        );
+        assert_eq!("", composer.textarea.text());
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn tab_defers_queue_until_transcription_completes() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        composer.set_task_running(/*running*/ true);
+        // With a task running, the deferred Tab is expected to queue rather than submit.
+        let id = composer.insert_recording_meter_placeholder("⠋");
+        composer
+            .spinner_stop_flags
+            .insert(id.clone(), Arc::new(AtomicBool::new(false)));
+
+        let (result, _needs_redraw) =
+            composer.handle_key_event(KeyEvent::new(KeyCode::Tab, KeyModifiers::NONE));
+
+        assert_eq!(InputResult::None, result);
+        assert_eq!("⠋", composer.textarea.text());
+        // Delivering the transcription performs the deferred queue action.
+        let result = composer.replace_transcription(&id, "queued voice text");
+
+        assert_eq!(
+            InputResult::Queued {
+                text: "queued voice text".to_string(),
+                text_elements: Vec::new(),
+                action: QueuedInputAction::Plain,
+            },
+            result
+        );
+        assert_eq!("", composer.textarea.text());
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn failed_transcription_keeps_draft_and_allows_editing_again() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        composer.textarea.set_text_clearing_elements("draft ");
+        composer.move_cursor_to_end();
+
+        let id = composer.insert_recording_meter_placeholder("⠋");
+        composer
+            .spinner_stop_flags
+            .insert(id.clone(), Arc::new(AtomicBool::new(false)));
+
+        let (result, _needs_redraw) =
+            composer.handle_key_event(KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE));
+
+        assert_eq!(InputResult::None, result);
+        // Removing the placeholder stands in for a transcription that failed.
+        composer.remove_transcription_placeholder(&id);
+
+        assert_eq!("draft ", composer.textarea.text());
+        // Typing reaches the draft again once the placeholder is gone.
+        type_chars_humanlike(&mut composer, &['x']);
+        assert_eq!("draft x", composer.textarea.text());
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    #[test]
+    fn set_text_content_stops_all_transcription_spinners() {
+        let (tx, _rx) = unbounded_channel::<AppEvent>();
+        let sender = AppEventSender::new(tx);
+        let mut composer = ChatComposer::new(
+            /*has_input_focus*/ true,
+            sender,
+            /*enhanced_keys_supported*/ false,
+            "Ask Codex to do anything".to_string(),
+            /*disable_paste_burst*/ false,
+        );
+        // Two spinner stop flags registered under different ids.
+        let flag_one = Arc::new(AtomicBool::new(false));
+        let flag_two = Arc::new(AtomicBool::new(false));
+        composer
+            .spinner_stop_flags
+            .insert("voice-1".to_string(), Arc::clone(&flag_one));
+        composer
+            .spinner_stop_flags
+            .insert("voice-2".to_string(), Arc::clone(&flag_two));
+
+        composer.set_text_content("draft".to_string(), Vec::new(), Vec::new());
+        // Both flags are set and the registry is emptied.
+        assert!(flag_one.load(Ordering::Relaxed));
+        assert!(flag_two.load(Ordering::Relaxed));
+        assert!(composer.spinner_stop_flags.is_empty());
+    }
+
     #[test]
     fn slash_tab_completion_moves_cursor_to_end() {
         use crossterm::event::KeyCode;
diff --git a/codex-rs/tui/src/bottom_pane/mod.rs b/codex-rs/tui/src/bottom_pane/mod.rs
index 02275755b7..e1e6638905 100644
--- a/codex-rs/tui/src/bottom_pane/mod.rs
+++ b/codex-rs/tui/src/bottom_pane/mod.rs
@@ -411,6 +411,17 @@ impl BottomPane {
         self.request_redraw();
     }
 
+    pub fn set_voice_transcription_enabled(&mut self, enabled: bool) {
+        self.composer.set_voice_transcription_enabled(enabled);
+        self.request_redraw(); // Unconditional, even if the flag did not change.
+    }
+
+    pub fn set_voice_transcription_space_hold_delay_ms(&mut self, delay_ms: u64) {
+        self.composer
+            .set_voice_transcription_space_hold_delay_ms(delay_ms); // forwarded as-is, in ms
+        self.request_redraw();
+    }
+
     /// Update the key hint shown next to queued messages so it matches the
     /// binding that `ChatWidget` actually listens for.
     pub(crate) fn set_queued_message_edit_binding(&mut self, binding: Option<KeyBinding>) {
@@ -527,6 +538,15 @@ impl BottomPane {
 
     /// Forward a key event to the active view or the composer.
     pub fn handle_key_event(&mut self, key_event: KeyEvent) -> InputResult {
+        #[cfg(not(target_os = "linux"))]
+        if self.composer.is_recording() {
+            let (_result, needs_redraw) = self.composer.handle_key_event(key_event);
+            if needs_redraw {
+                self.request_redraw();
+            }
+            return InputResult::None;
+        }
+
         // If a modal/view is active, handle it here; otherwise forward to composer.
         if !self.view_stack.is_empty() {
             if key_event.kind == KeyEventKind::Release {
@@ -691,6 +711,8 @@ impl BottomPane {
     }
 
     fn pre_draw_tick_at(&mut self, now: Instant) {
+        #[cfg(not(target_os = "linux"))]
+        self.composer.process_space_hold_trigger();
         self.composer.sync_popups();
         self.maybe_show_delayed_approval_requests_at(now);
     }
@@ -1544,11 +1566,40 @@ impl BottomPane {
         updated
     }
 
+    /// Relays a retry notice to the composer; on a change, syncs popups and requests a redraw.
+    pub(crate) fn show_transcription_retrying(
+        &mut self,
+        id: &str,
+        attempt: usize,
+        max_attempts: usize,
+    ) -> bool {
+        let composer = &mut self.composer;
+        if !composer.show_transcription_retrying(id, attempt, max_attempts) {
+            return false;
+        }
+        composer.sync_popups();
+        self.request_redraw();
+        true
+    }
+
+    pub(crate) fn replace_transcription(&mut self, id: &str, text: &str) -> InputResult {
+        let result = self.composer.replace_transcription(id, text);
+        self.composer.sync_popups(); // Popup sync and redraw happen regardless of `result`.
+        self.request_redraw();
+        result
+    }
+
     pub(crate) fn remove_recording_meter_placeholder(&mut self, id: &str) {
         self.composer.remove_recording_meter_placeholder(id);
         self.composer.sync_popups();
         self.request_redraw();
     }
+
+    pub(crate) fn remove_transcription_placeholder(&mut self, id: &str) {
+        self.composer.remove_transcription_placeholder(id);
+        self.composer.sync_popups(); // Same follow-up as `remove_recording_meter_placeholder`.
+        self.request_redraw();
+    }
 }
 
 impl Renderable for BottomPane {
diff --git a/codex-rs/tui/src/bottom_pane/textarea.rs b/codex-rs/tui/src/bottom_pane/textarea.rs
index 7ab6d38fca..fa053c5a43 100644
--- a/codex-rs/tui/src/bottom_pane/textarea.rs
+++ b/codex-rs/tui/src/bottom_pane/textarea.rs
@@ -862,7 +862,6 @@ impl TextArea {
         self.set_cursor(end);
     }
 
-    #[cfg(not(target_os = "linux"))]
     pub fn replace_element_by_id(&mut self, id: &str, text: &str) -> bool {
         if let Some(idx) = self
             .elements
diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs
index e47591a275..eaba0aea57 100644
--- a/codex-rs/tui/src/chatwidget.rs
+++ b/codex-rs/tui/src/chatwidget.rs
@@ -5539,6 +5539,14 @@ impl ChatWidget {
         if let Some(keymap) = runtime_keymap {
             widget.bottom_pane.set_keymap_bindings(&keymap);
         }
+        widget.bottom_pane.set_voice_transcription_enabled(
+            widget.config.features.enabled(Feature::VoiceTranscription),
+        );
+        widget
+            .bottom_pane
+            .set_voice_transcription_space_hold_delay_ms(
+                widget.config.voice_transcription_space_hold_delay_ms,
+            );
         widget
             .bottom_pane
             .set_realtime_conversation_enabled(widget.realtime_conversation_enabled());
@@ -5720,77 +5728,8 @@ impl ChatWidget {
             }
             _ => {
                 let had_modal_or_popup = !self.bottom_pane.no_modal_or_popup_active();
-                match self.bottom_pane.handle_key_event(key_event) {
-                    InputResult::Submitted {
-                        text,
-                        text_elements,
-                    } => {
-                        let local_images = self
-                            .bottom_pane
-                            .take_recent_submission_images_with_placeholders();
-                        let remote_image_urls = self.take_remote_image_urls();
-                        let user_message = UserMessage {
-                            text,
-                            local_images,
-                            remote_image_urls,
-                            text_elements,
-                            mention_bindings: self
-                                .bottom_pane
-                                .take_recent_submission_mention_bindings(),
-                        };
-                        if user_message.text.is_empty()
-                            && user_message.local_images.is_empty()
-                            && user_message.remote_image_urls.is_empty()
-                        {
-                            return;
-                        }
-                        let should_submit_now =
-                            self.is_session_configured() && !self.is_plan_streaming_in_tui();
-                        if should_submit_now {
-                            if self.only_user_shell_commands_running()
-                                && !user_message.text.starts_with('!')
-                            {
-                                self.queue_user_message(user_message);
-                                return;
-                            }
-                            // Submitted is emitted when user submits.
-                            // Reset any reasoning header only when we are actually submitting a turn.
-                            self.reasoning_buffer.clear();
-                            self.full_reasoning_buffer.clear();
-                            self.set_status_header(String::from("Working"));
-                            self.submit_user_message(user_message);
-                        } else {
-                            self.queue_user_message(user_message);
-                        }
-                    }
-                    InputResult::Queued {
-                        text,
-                        text_elements,
-                        action,
-                    } => {
-                        let local_images = self
-                            .bottom_pane
-                            .take_recent_submission_images_with_placeholders();
-                        let remote_image_urls = self.take_remote_image_urls();
-                        let user_message = UserMessage {
-                            text,
-                            local_images,
-                            remote_image_urls,
-                            text_elements,
-                            mention_bindings: self
-                                .bottom_pane
-                                .take_recent_submission_mention_bindings(),
-                        };
-                        self.queue_user_message_with_options(user_message, action);
-                    }
-                    InputResult::Command(cmd) => {
-                        self.handle_slash_command_dispatch(cmd);
-                    }
-                    InputResult::CommandWithArgs(cmd, args, text_elements) => {
-                        self.handle_slash_command_with_args_dispatch(cmd, args, text_elements);
-                    }
-                    InputResult::None => {}
-                }
+                let input_result = self.bottom_pane.handle_key_event(key_event);
+                self.handle_bottom_pane_input_result(input_result);
                 if had_modal_or_popup && self.bottom_pane.no_modal_or_popup_active() {
                     self.maybe_send_next_queued_input();
                 }
@@ -5800,6 +5739,74 @@ impl ChatWidget {
         self.maybe_signal_watchdog_owner_activity_if_draft_changed(&composer_before);
     }
 
+    fn user_message_from_composer_submission(
+        &mut self,
+        text: String,
+        text_elements: Vec<TextElement>,
+    ) -> UserMessage {
+        UserMessage {
+            text, // The `take_*` calls below run in field order as the struct is built.
+            local_images: self
+                .bottom_pane
+                .take_recent_submission_images_with_placeholders(),
+            remote_image_urls: self.take_remote_image_urls(),
+            text_elements,
+            mention_bindings: self.bottom_pane.take_recent_submission_mention_bindings(),
+        }
+    }
+
+    fn handle_bottom_pane_input_result(&mut self, input_result: InputResult) {
+        let composer_before = self.bottom_pane.composer_text_with_pending();
+        match input_result {
+            InputResult::Submitted {
+                text,
+                text_elements,
+            } => {
+                let user_message = self.user_message_from_composer_submission(text, text_elements);
+                if user_message.text.is_empty()
+                    && user_message.local_images.is_empty()
+                    && user_message.remote_image_urls.is_empty()
+                {
+                    return; // Nothing to send. This exit bypasses the watchdog check below.
+                }
+                let should_submit_now =
+                    self.is_session_configured() && !self.is_plan_streaming_in_tui();
+                if should_submit_now {
+                    if self.only_user_shell_commands_running()
+                        && !user_message.text.starts_with('!')
+                    {
+                        self.queue_user_message(user_message);
+                        return; // Queued behind shell commands; also bypasses the check below.
+                    }
+                    // Submitting a turn right away (not queueing): clear both reasoning
+                    // buffers and show the "Working" status header first.
+                    self.reasoning_buffer.clear();
+                    self.full_reasoning_buffer.clear();
+                    self.set_status_header(String::from("Working"));
+                    self.submit_user_message(user_message);
+                } else {
+                    self.queue_user_message(user_message);
+                }
+            }
+            InputResult::Queued {
+                text,
+                text_elements,
+                action,
+            } => {
+                let user_message = self.user_message_from_composer_submission(text, text_elements);
+                self.queue_user_message_with_options(user_message, action);
+            }
+            InputResult::Command(cmd) => {
+                self.handle_slash_command_dispatch(cmd);
+            }
+            InputResult::CommandWithArgs(cmd, args, text_elements) => {
+                self.handle_slash_command_with_args_dispatch(cmd, args, text_elements);
+            }
+            InputResult::None => {}
+        }
+        self.maybe_signal_watchdog_owner_activity_if_draft_changed(&composer_before);
+    }
+
     /// Attach a local image to the composer when the active model supports image inputs.
     ///
     /// When the model does not advertise image support, we keep the draft unchanged and surface a
@@ -12137,6 +12144,13 @@ impl ChatWidget {
 
 #[cfg(not(target_os = "linux"))]
 impl ChatWidget {
+    pub(crate) fn replace_transcription(&mut self, id: &str, text: &str) {
+        let input_result = self.bottom_pane.replace_transcription(id, text);
+        self.handle_bottom_pane_input_result(input_result); // its draft snapshot postdates the swap
+        // NOTE(review): so the watchdog draft-change check cannot see the inserted text; confirm.
+        self.request_redraw();
+    }
+
     pub(crate) fn update_recording_meter_in_place(&mut self, id: &str, text: &str) -> bool {
         let updated = self.bottom_pane.update_recording_meter_in_place(id, text);
         if updated {
@@ -12145,11 +12159,33 @@ impl ChatWidget {
         updated
     }
 
+    /// Relays a transcription retry notice for `id` to the bottom pane.
+    /// A redraw is requested only when the pane reports that something changed.
+    pub(crate) fn show_transcription_retrying(
+        &mut self,
+        id: &str,
+        attempt: usize,
+        max_attempts: usize,
+    ) {
+        let pane = &mut self.bottom_pane;
+        if pane.show_transcription_retrying(id, attempt, max_attempts) {
+            self.request_redraw();
+        }
+    }
+
     pub(crate) fn remove_recording_meter_placeholder(&mut self, id: &str) {
         self.bottom_pane.remove_recording_meter_placeholder(id);
         // Ensure the UI redraws to reflect placeholder removal.
         self.request_redraw();
     }
+
+    pub(crate) fn fail_transcription(&mut self, id: &str, error: &str) {
+        self.bottom_pane.remove_transcription_placeholder(id); // Only the placeholder is removed.
+        self.add_to_history(history_cell::new_error_event(format!(
+            "Voice transcription failed: {error}"
+        )));
+        self.request_redraw();
+    }
 }
 
 fn has_websocket_timing_metrics(summary: RuntimeMetricsSummary) -> bool {
diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs
index bfeaff2eae..3a41796be4 100644
--- a/codex-rs/tui/src/chatwidget/realtime.rs
+++ b/codex-rs/tui/src/chatwidget/realtime.rs
@@ -607,7 +607,7 @@ impl ChatWidget {
             flag.store(true, Ordering::Relaxed);
         }
         if let Some(capture) = self.realtime_conversation.capture.take() {
-            capture.stop();
+            let _ = capture.stop();
         }
         if let Some(id) = self.realtime_conversation.meter_placeholder_id.take() {
             self.remove_recording_meter_placeholder(&id);
diff --git a/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs b/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs
index 135d8dbaa4..015ae0f77c 100644
--- a/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs
+++ b/codex-rs/tui/src/chatwidget/tests/popups_and_settings.rs
@@ -42,6 +42,27 @@ async fn deleted_realtime_meter_uses_shared_stop_path() {
     );
 }
 
+#[cfg(not(target_os = "linux"))]
+#[tokio::test]
+async fn transcription_failure_keeps_draft_and_renders_error() {
+    let (mut chat, mut rx, _op_rx) = make_chatwidget_manual(/*model_override*/ None).await;
+    chat.bottom_pane
+        .set_composer_text("draft ".to_string(), Vec::new(), Vec::new());
+    let placeholder_id = chat.bottom_pane.insert_recording_meter_placeholder("⠋");
+    // Fail the transcription tied to the placeholder that was just inserted.
+    chat.fail_transcription(&placeholder_id, "boom");
+    // The typed draft must survive, and the error must be rendered into history.
+    assert_eq!("draft ", chat.bottom_pane.composer_text());
+    let rendered = drain_insert_history(&mut rx)
+        .into_iter()
+        .map(|lines| lines_to_single_string(&lines))
+        .collect::<Vec<_>>();
+    insta::assert_snapshot!(
+        rendered.join("\n\n"),
+        @"■ Voice transcription failed: boom"
+    );
+}
+
 #[tokio::test]
 async fn experimental_mode_plan_is_ignored_on_startup() {
     let codex_home = tempdir().expect("tempdir");
diff --git a/codex-rs/tui/src/voice.rs b/codex-rs/tui/src/voice.rs
index 229d0a8db5..1ee14b1705 100644
--- a/codex-rs/tui/src/voice.rs
+++ b/codex-rs/tui/src/voice.rs
@@ -1,33 +1,152 @@
+use crate::app_event::AppEvent;
 use crate::app_event_sender::AppEventSender;
+use crate::audio_device::preferred_input_config;
 use crate::legacy_core::config::Config;
+use crate::legacy_core::config::find_codex_home;
 use base64::Engine;
+use codex_app_server_protocol::AuthMode;
+use codex_client::build_reqwest_client_with_custom_ca;
+use codex_config::types::AuthCredentialsStoreMode;
+use codex_login::CodexAuth;
+use codex_login::default_client::get_codex_user_agent;
 use codex_protocol::protocol::ConversationAudioParams;
 use codex_protocol::protocol::RealtimeAudioFrame;
 use cpal::traits::DeviceTrait;
+use cpal::traits::HostTrait;
 use cpal::traits::StreamTrait;
+use hound::SampleFormat;
+use hound::WavSpec;
+use hound::WavWriter;
 use std::collections::VecDeque;
+use std::fmt;
+use std::future::Future;
+use std::io::Cursor;
 use std::sync::Arc;
 use std::sync::Mutex;
 use std::sync::atomic::AtomicBool;
 use std::sync::atomic::AtomicU16;
 use std::sync::atomic::Ordering;
+use std::time::Duration;
+use std::time::Instant;
 use tracing::error;
+use tracing::info;
+use tracing::trace;
+use tracing::warn;
 
+const AUDIO_MODEL: &str = "gpt-4o-mini-transcribe";
 const MODEL_AUDIO_SAMPLE_RATE: u32 = 24_000;
 const MODEL_AUDIO_CHANNELS: u16 = 1;
+const FIRST_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT: Duration = Duration::from_secs(2);
+const FIRST_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT: Duration = Duration::from_secs(15);
+const SECOND_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT: Duration = Duration::from_secs(4);
+const SECOND_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT: Duration = Duration::from_secs(30);
+const FINAL_TRANSCRIPTION_ATTEMPT_TIMEOUT: Duration = Duration::from_secs(60);
+const FIRST_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND: f32 = 2.0;
+const SECOND_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND: f32 = 3.0;
+const TRANSCRIPTION_ATTEMPT_COUNT: usize = 3;
+
+struct TranscriptionAuthContext {
+    mode: AuthMode,
+    bearer_token: String,
+    chatgpt_account_id: Option<String>,
+    chatgpt_base_url: String,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+struct TranscriptionAttempt {
+    number: usize,
+    timeout: Duration,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+struct TranscriptionRetryNotice {
+    next_attempt: usize,
+    max_attempts: usize,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum TranscriptionRetryDecision {
+    Retry { delay: Option<Duration> },
+    Stop,
+}
+
+#[derive(Debug)]
+enum TranscriptionRequestError {
+    Build(String),
+    Timeout(Duration),
+    Send(String),
+    Status {
+        status: reqwest::StatusCode,
+        body: String,
+        retry_after: Option<Duration>,
+    },
+    Json(String),
+}
+
+impl fmt::Display for TranscriptionRequestError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Build(error) => write!(f, "{error}"), // printed verbatim, without a prefix
+            Self::Timeout(timeout) => write!(
+                f,
+                "transcription request timed out after {:.2}s",
+                timeout.as_secs_f32()
+            ),
+            Self::Send(error) => write!(f, "transcription request failed: {error}"),
+            Self::Status { status, body, .. } => {
+                write!(f, "transcription failed: {status} {body}") // `retry_after` is not shown
+            }
+            Self::Json(error) => write!(f, "failed to parse json: {error}"),
+        }
+    }
+}
+
+pub struct RecordedAudio {
+    pub data: Vec<i16>, // 16-bit samples; all channels share this one buffer
+    pub sample_rate: u32, // rate reported by the input config the capture was opened with
+    pub channels: u16, // clip duration = data.len() / (sample_rate * channels)
+}
 
 pub struct VoiceCapture {
     stream: Option<cpal::Stream>,
+    sample_rate: u32, // read from the selected input config when capture starts
+    channels: u16, // read from the selected input config when capture starts
+    data: Arc<Mutex<Vec<i16>>>, // samples; `start` shares this with the stream callback
     stopped: Arc<AtomicBool>,
     last_peak: Arc<AtomicU16>,
 }
 
 impl VoiceCapture {
+    pub fn start() -> Result<Self, String> {
+        let (device, config) = select_default_input_device_and_config()?;
+        // Remember the capture format; `stop` hands it back together with the samples.
+        let sample_rate = config.sample_rate().0;
+        let channels = config.channels();
+        let data: Arc<Mutex<Vec<i16>>> = Arc::new(Mutex::new(Vec::new()));
+        let stopped = Arc::new(AtomicBool::new(false));
+        let last_peak = Arc::new(AtomicU16::new(0));
+        // The stream callback appends into `data` and publishes peaks through `last_peak`.
+        let stream = build_input_stream(&device, &config, data.clone(), last_peak.clone())?;
+        stream
+            .play()
+            .map_err(|e| format!("failed to start input stream: {e}"))?;
+        // Keep the stream inside `Self`: `stop` ends capture by dropping it.
+        Ok(Self {
+            stream: Some(stream),
+            sample_rate,
+            channels,
+            data,
+            stopped,
+            last_peak,
+        })
+    }
+
     pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result<Self, String> {
         let (device, config) = select_realtime_input_device_and_config(config)?;
 
         let sample_rate = config.sample_rate().0;
         let channels = config.channels();
+        let data: Arc<Mutex<Vec<i16>>> = Arc::new(Mutex::new(Vec::new()));
         let stopped = Arc::new(AtomicBool::new(false));
         let last_peak = Arc::new(AtomicU16::new(0));
 
@@ -45,22 +164,47 @@ impl VoiceCapture {
 
         Ok(Self {
             stream: Some(stream),
+            sample_rate,
+            channels,
+            data,
             stopped,
             last_peak,
         })
     }
 
-    pub fn stop(mut self) {
+    pub fn stop(mut self) -> Result<RecordedAudio, String> {
         // Mark stopped so any metering task can exit cleanly.
         self.stopped.store(true, Ordering::SeqCst);
         // Dropping the stream stops capture.
         self.stream.take();
+        let data = self
+            .data
+            .lock()
+            .map(|mut samples| std::mem::take(&mut *samples)) // move out; no copy of the clip
+            .map_err(|_| "failed to lock audio buffer".to_string())?;
+        Ok(RecordedAudio {
+            data,
+            sample_rate: self.sample_rate,
+            channels: self.channels,
+        })
+    }
+
+    pub fn data_arc(&self) -> Arc<Mutex<Vec<i16>>> {
+        Arc::clone(&self.data) // another handle to the same buffer; samples are not copied
+    }
 
     pub fn stopped_flag(&self) -> Arc<AtomicBool> {
         self.stopped.clone()
     }
 
+    pub fn sample_rate(&self) -> u32 {
+        self.sample_rate
+    }
+
+    pub fn channels(&self) -> u16 {
+        self.channels
+    }
+
     pub fn last_peak_arc(&self) -> Arc<AtomicU16> {
         self.last_peak.clone()
     }
@@ -125,16 +269,147 @@ impl RecordingMeterState {
     }
 }
 
+pub fn transcribe_async(
+    id: String,
+    audio: RecordedAudio,
+    context: Option<String>,
+    tx: AppEventSender,
+) {
+    std::thread::spawn(move || {
+        const MIN_DURATION_SECONDS: f32 = 1.0;
+        let duration_seconds = clip_duration_seconds(&audio);
+        if duration_seconds < MIN_DURATION_SECONDS {
+            let msg = format!(
+                "recording too short ({duration_seconds:.2}s); minimum is {MIN_DURATION_SECONDS:.2}s"
+            );
+            info!("{msg}");
+            tx.send(AppEvent::TranscriptionFailed { id, error: msg });
+            return;
+        }
+        // Encode the clip as WAV; an encoding failure is reported through `tx` and ends the job.
+        let wav_bytes = match encode_wav_normalized(&audio) {
+            Ok(wav_bytes) => wav_bytes,
+            Err(err) => {
+                error!("failed to encode wav: {err}");
+                tx.send(AppEvent::TranscriptionFailed { id, error: err });
+                return;
+            }
+        };
+        // This is a plain OS thread, so a runtime is built here to drive the async request.
+        let runtime = match tokio::runtime::Runtime::new() {
+            Ok(runtime) => runtime,
+            Err(err) => {
+                error!("failed to create tokio runtime: {err}");
+                tx.send(AppEvent::TranscriptionFailed {
+                    id,
+                    error: err.to_string(),
+                });
+                return;
+            }
+        };
+        // The retry callback owns clones of the sender and id; the originals are used below.
+        let retry_tx = tx.clone();
+        let retry_id = id.clone();
+        let on_retry = move |notice: TranscriptionRetryNotice| {
+            retry_tx.send(AppEvent::TranscriptionRetrying {
+                id: retry_id.clone(),
+                attempt: notice.next_attempt,
+                max_attempts: notice.max_attempts,
+            });
+        };
+        // Block this thread until transcription finishes, then report the outcome via `tx`.
+        match runtime.block_on(transcribe_bytes(
+            wav_bytes,
+            context,
+            duration_seconds,
+            on_retry,
+        )) {
+            Ok(text) => {
+                tx.send(AppEvent::TranscriptionComplete { id, text });
+                info!("voice transcription succeeded");
+            }
+            Err(err) => {
+                error!("voice transcription error: {err}");
+                tx.send(AppEvent::TranscriptionFailed { id, error: err });
+            }
+        }
+    });
+}
+
 // -------------------------
 // Voice input helpers
 // -------------------------
 
+/// Picks the host's default input device together with its preferred stream config.
+fn select_default_input_device_and_config()
+-> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
+    let Some(device) = cpal::default_host().default_input_device() else {
+        return Err("no input audio device available".to_string());
+    };
+    let config = preferred_input_config(&device)?;
+    Ok((device, config))
+}
+
 fn select_realtime_input_device_and_config(
     config: &Config,
 ) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
     crate::audio_device::select_configured_input_device_and_config(config)
 }
 
+fn build_input_stream(
+    device: &cpal::Device,
+    config: &cpal::SupportedStreamConfig,
+    data: Arc<Mutex<Vec<i16>>>,
+    last_peak: Arc<AtomicU16>,
+) -> Result<cpal::Stream, String> {
+    match config.sample_format() {
+        cpal::SampleFormat::F32 => device
+            .build_input_stream(
+                &config.clone().into(),
+                move |input: &[f32], _| {
+                    let peak = peak_f32(input);
+                    last_peak.store(peak, Ordering::Relaxed);
+                    if let Ok(mut buffer) = data.lock() {
+                        // One `extend` call sizes the buffer from the slice length up
+                        // front instead of growing it push by push.
+                        buffer.extend(input.iter().map(|&sample| f32_to_i16(sample)));
+                    }
+                },
+                move |err| error!("audio input error: {err}"),
+                None,
+            )
+            .map_err(|e| format!("failed to build input stream: {e}")),
+        cpal::SampleFormat::I16 => device
+            .build_input_stream(
+                &config.clone().into(),
+                move |input: &[i16], _| {
+                    let peak = peak_i16(input);
+                    last_peak.store(peak, Ordering::Relaxed);
+                    if let Ok(mut buffer) = data.lock() {
+                        buffer.extend_from_slice(input);
+                    }
+                },
+                move |err| error!("audio input error: {err}"),
+                None,
+            )
+            .map_err(|e| format!("failed to build input stream: {e}")),
+        cpal::SampleFormat::U16 => device
+            .build_input_stream(
+                &config.clone().into(),
+                move |input: &[u16], _| {
+                    if let Ok(mut buffer) = data.lock() {
+                        let peak = convert_u16_to_i16_and_peak(input, &mut buffer);
+                        last_peak.store(peak, Ordering::Relaxed);
+                    }
+                },
+                move |err| error!("audio input error: {err}"),
+                None,
+            )
+            .map_err(|e| format!("failed to build input stream: {e}")),
+        _ => Err("unsupported input sample format".to_string()),
+    }
+}
+
 fn build_realtime_input_stream(
     device: &cpal::Device,
     config: &cpal::SupportedStreamConfig,
@@ -472,10 +747,467 @@ fn convert_pcm16(
     out
 }
 
+// -------------------------
+// Transcription helpers
+// -------------------------
+
+/// Returns the clip length in seconds, or `0.0` when `sample_rate * channels` is not positive.
+fn clip_duration_seconds(audio: &RecordedAudio) -> f32 {
+    // Number of stored samples that make up one second, counted across all channels.
+    let per_second = (audio.sample_rate as f32) * (audio.channels as f32);
+    if per_second <= 0.0 {
+        return 0.0;
+    }
+    (audio.data.len() as f32) / per_second
+}
+
+fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
+    let converted;
+    let (channels, sample_rate, segment) =
+        if audio.channels == MODEL_AUDIO_CHANNELS && audio.sample_rate == MODEL_AUDIO_SAMPLE_RATE {
+            (audio.channels, audio.sample_rate, audio.data.as_slice())
+        } else {
+            converted = convert_pcm16(
+                &audio.data,
+                audio.sample_rate,
+                audio.channels,
+                MODEL_AUDIO_SAMPLE_RATE,
+                MODEL_AUDIO_CHANNELS,
+            );
+            (
+                MODEL_AUDIO_CHANNELS,
+                MODEL_AUDIO_SAMPLE_RATE,
+                converted.as_slice(),
+            )
+        };
+
+    let spec = WavSpec {
+        channels,
+        sample_rate,
+        bits_per_sample: 16,
+        sample_format: SampleFormat::Int,
+    };
+    let mut wav_bytes = Vec::new();
+    let mut cursor = Cursor::new(&mut wav_bytes);
+    let mut writer =
+        WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?;
+
+    let peak_abs = segment
+        .iter()
+        .map(|sample| (i32::from(*sample)).unsigned_abs() as i32)
+        .max()
+        .unwrap_or(0);
+    let target = (i16::MAX as f32) * 0.9;
+    let gain = if peak_abs > 0 {
+        target / (peak_abs as f32)
+    } else {
+        1.0
+    };
+
+    for &sample in segment {
+        let normalized = ((sample as f32) * gain)
+            .round()
+            .clamp(i16::MIN as f32, i16::MAX as f32) as i16;
+        writer
+            .write_sample(normalized)
+            .map_err(|_| "failed writing wav sample".to_string())?;
+    }
+    writer
+        .finalize()
+        .map_err(|_| "failed to finalize wav".to_string())?;
+    Ok(wav_bytes)
+}
+
+/// Strips trailing slashes and appends `/backend-api` to ChatGPT-hosted bases that lack it.
+fn normalize_chatgpt_base_url(input: &str) -> String {
+    const CHATGPT_ORIGINS: [&str; 2] = ["https://chatgpt.com", "https://chat.openai.com"];
+
+    let trimmed = input.trim_end_matches('/');
+    let is_chatgpt = CHATGPT_ORIGINS.iter().any(|origin| trimmed.starts_with(*origin));
+    // Only prefix matches on the two known ChatGPT origins get the API path appended.
+    if is_chatgpt && !trimmed.contains("/backend-api") {
+        format!("{trimmed}/backend-api")
+    } else {
+        trimmed.to_string()
+    }
+}
+
+async fn resolve_auth() -> Result<TranscriptionAuthContext, String> {
+    let codex_home = find_codex_home().map_err(|e| format!("failed to find codex home: {e}"))?;
+    let auth = CodexAuth::from_auth_storage(&codex_home, AuthCredentialsStoreMode::Auto)
+        .await
+        .map_err(|e| format!("failed to read auth.json: {e}"))?
+        .ok_or_else(|| "No Codex auth is configured; please run `codex login`".to_string())?;
+
+    let chatgpt_account_id = auth.get_account_id();
+    let bearer_token = auth
+        .get_token()
+        .map_err(|e| format!("failed to get auth token: {e}"))?;
+    let config = Config::load_with_cli_overrides(Vec::new())
+        .await
+        .map_err(|e| format!("failed to load config: {e}"))?;
+    Ok(TranscriptionAuthContext {
+        mode: auth.api_auth_mode(),
+        bearer_token,
+        chatgpt_account_id,
+        chatgpt_base_url: normalize_chatgpt_base_url(&config.chatgpt_base_url),
+    })
+}
+
+async fn transcribe_bytes(
+    wav_bytes: Vec<u8>,
+    context: Option<String>,
+    duration_seconds: f32,
+    on_retry: impl Fn(TranscriptionRetryNotice),
+) -> Result<String, String> {
+    let started_at = Instant::now();
+    let auth = resolve_auth().await?;
+    let auth_elapsed = started_at.elapsed();
+    let client = build_reqwest_client_with_custom_ca(reqwest::Client::builder())
+        .map_err(|error| format!("failed to build transcription HTTP client: {error}"))?;
+    let audio_bytes = wav_bytes.len();
+    let prompt_for_log = context.as_deref().unwrap_or("").to_string();
+    let audio_kib = audio_bytes as f32 / 1024.0;
+    let mode = auth.mode;
+    trace!(
+        "preparing transcription request: mode={mode:?} duration={duration_seconds:.2}s audio={audio_kib:.1}KiB prompt={prompt_for_log}"
+    );
+    let value = send_transcription_request_with_retries(
+        &client,
+        &auth,
+        &wav_bytes,
+        context.as_deref(),
+        TranscriptionRequestMetrics {
+            mode,
+            duration_seconds,
+            audio_kib,
+            auth_elapsed,
+            started_at,
+        },
+        on_retry,
+    )
+    .await
+    .map_err(|error| error.to_string())?;
+
+    let text = value
+        .get("text")
+        .and_then(|text| text.as_str())
+        .unwrap_or("")
+        .to_string();
+
+    if text.is_empty() {
+        Err("empty transcription result".to_string())
+    } else {
+        Ok(text)
+    }
+}
+
+#[derive(Clone, Copy)]
+struct TranscriptionRequestMetrics {
+    mode: AuthMode,
+    duration_seconds: f32,
+    audio_kib: f32,
+    auth_elapsed: Duration,
+    started_at: Instant,
+}
+
+fn transcription_request_attempts(duration_seconds: f32) -> [TranscriptionAttempt; 3] {
+    [
+        TranscriptionAttempt {
+            number: 1,
+            timeout: scaled_transcription_request_timeout(
+                duration_seconds,
+                FIRST_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT,
+                FIRST_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT,
+                FIRST_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND,
+            ),
+        },
+        TranscriptionAttempt {
+            number: 2,
+            timeout: scaled_transcription_request_timeout(
+                duration_seconds,
+                SECOND_TRANSCRIPTION_ATTEMPT_MIN_TIMEOUT,
+                SECOND_TRANSCRIPTION_ATTEMPT_MAX_TIMEOUT,
+                SECOND_TRANSCRIPTION_ATTEMPT_TIMEOUT_PER_AUDIO_SECOND,
+            ),
+        },
+        TranscriptionAttempt {
+            number: 3,
+            timeout: FINAL_TRANSCRIPTION_ATTEMPT_TIMEOUT,
+        },
+    ]
+}
+
+/// Scales the timeout with `duration_seconds`, bounded to `[min_timeout, max_timeout]`; an
+/// invalid duration yields `min_timeout` and an unrepresentable product yields `max_timeout`.
+fn scaled_transcription_request_timeout(
+    duration_seconds: f32,
+    min_timeout: Duration,
+    max_timeout: Duration,
+    timeout_per_audio_second: f32,
+) -> Duration {
+    if !duration_seconds.is_finite() || duration_seconds <= 0.0 {
+        return min_timeout;
+    }
+    Duration::try_from_secs_f32(duration_seconds * timeout_per_audio_second)
+        .map_or(max_timeout, |scaled| scaled.clamp(min_timeout, max_timeout))
+}
+
+fn build_transcription_request(
+    client: &reqwest::Client,
+    auth: &TranscriptionAuthContext,
+    wav_bytes: &[u8],
+    context: Option<&str>,
+) -> Result<(String, reqwest::RequestBuilder), TranscriptionRequestError> {
+    if matches!(auth.mode, AuthMode::Chatgpt | AuthMode::ChatgptAuthTokens) {
+        let part = reqwest::multipart::Part::bytes(wav_bytes.to_vec())
+            .file_name("audio.wav")
+            .mime_str("audio/wav")
+            .map_err(|error| {
+                TranscriptionRequestError::Build(format!("failed to set mime: {error}"))
+            })?;
+        let form = reqwest::multipart::Form::new().part("file", part);
+        let endpoint = format!("{}/transcribe", auth.chatgpt_base_url);
+        let request = if let Some(account_id) = &auth.chatgpt_account_id {
+            client
+                .post(&endpoint)
+                .bearer_auth(&auth.bearer_token)
+                .multipart(form)
+                .header("User-Agent", get_codex_user_agent())
+                .header("ChatGPT-Account-Id", account_id.as_str())
+        } else {
+            client
+                .post(&endpoint)
+                .bearer_auth(&auth.bearer_token)
+                .multipart(form)
+                .header("User-Agent", get_codex_user_agent())
+        };
+        Ok((endpoint, request))
+    } else {
+        let part = reqwest::multipart::Part::bytes(wav_bytes.to_vec())
+            .file_name("audio.wav")
+            .mime_str("audio/wav")
+            .map_err(|error| {
+                TranscriptionRequestError::Build(format!("failed to set mime: {error}"))
+            })?;
+        let form = if let Some(context) = context {
+            reqwest::multipart::Form::new()
+                .text("model", AUDIO_MODEL)
+                .part("file", part)
+                .text("prompt", context.to_string())
+        } else {
+            reqwest::multipart::Form::new()
+                .text("model", AUDIO_MODEL)
+                .part("file", part)
+        };
+        let endpoint = "https://api.openai.com/v1/audio/transcriptions".to_string();
+        Ok((
+            endpoint.clone(),
+            client
+                .post(&endpoint)
+                .bearer_auth(&auth.bearer_token)
+                .multipart(form)
+                .header("User-Agent", get_codex_user_agent()),
+        ))
+    }
+}
+
+async fn send_transcription_request_with_retries(
+    client: &reqwest::Client,
+    auth: &TranscriptionAuthContext,
+    wav_bytes: &[u8],
+    context: Option<&str>,
+    metrics: TranscriptionRequestMetrics,
+    on_retry: impl Fn(TranscriptionRetryNotice),
+) -> Result<serde_json::Value, TranscriptionRequestError> {
+    let attempts = transcription_request_attempts(metrics.duration_seconds);
+    let mut last_error = None;
+
+    for attempt_index in 0..attempts.len() {
+        let attempt = attempts[attempt_index];
+        let next_attempt = attempts.get(attempt_index + 1).copied();
+        let (endpoint, request) = build_transcription_request(client, auth, wav_bytes, context)?;
+        info!(
+            "sending voice transcription request: mode={:?} endpoint={endpoint} attempt={}/{} duration={:.2}s audio={:.1}KiB timeout={:.2}s auth_config_elapsed_ms={}",
+            metrics.mode,
+            attempt.number,
+            TRANSCRIPTION_ATTEMPT_COUNT,
+            metrics.duration_seconds,
+            metrics.audio_kib,
+            attempt.timeout.as_secs_f32(),
+            metrics.auth_elapsed.as_millis()
+        );
+
+        let request_started_at = Instant::now();
+        match send_transcription_request_with_timeout(request, attempt.timeout).await {
+            Ok(value) => {
+                let request_elapsed = request_started_at.elapsed();
+                info!(
+                    "voice transcription response parsed: attempt={}/{} request_elapsed_ms={} total_elapsed_ms={}",
+                    attempt.number,
+                    TRANSCRIPTION_ATTEMPT_COUNT,
+                    request_elapsed.as_millis(),
+                    metrics.started_at.elapsed().as_millis()
+                );
+                return Ok(value);
+            }
+            Err(error) => {
+                let request_elapsed = request_started_at.elapsed();
+                match transcription_retry_decision(&error, next_attempt) {
+                    TranscriptionRetryDecision::Retry { delay } => {
+                        warn!(
+                            "voice transcription attempt failed; retrying: attempt={}/{} request_elapsed_ms={} total_elapsed_ms={} error={error}",
+                            attempt.number,
+                            TRANSCRIPTION_ATTEMPT_COUNT,
+                            request_elapsed.as_millis(),
+                            metrics.started_at.elapsed().as_millis()
+                        );
+                        on_retry(TranscriptionRetryNotice {
+                            next_attempt: attempt.number + 1,
+                            max_attempts: TRANSCRIPTION_ATTEMPT_COUNT,
+                        });
+                        if let Some(delay) = delay {
+                            info!(
+                                "waiting before voice transcription retry: retry_after_ms={}",
+                                delay.as_millis()
+                            );
+                            tokio::time::sleep(delay).await;
+                        }
+                        last_error = Some(error);
+                    }
+                    TranscriptionRetryDecision::Stop => {
+                        warn!(
+                            "voice transcription attempt failed; giving up: attempt={}/{} request_elapsed_ms={} total_elapsed_ms={} error={error}",
+                            attempt.number,
+                            TRANSCRIPTION_ATTEMPT_COUNT,
+                            request_elapsed.as_millis(),
+                            metrics.started_at.elapsed().as_millis()
+                        );
+                        return Err(error);
+                    }
+                }
+            }
+        }
+    }
+
+    Err(last_error.unwrap_or_else(|| {
+        TranscriptionRequestError::Build("no transcription attempts configured".to_string())
+    }))
+}
+
+fn transcription_retry_decision(
+    error: &TranscriptionRequestError,
+    next_attempt: Option<TranscriptionAttempt>,
+) -> TranscriptionRetryDecision {
+    let Some(next_attempt) = next_attempt else {
+        return TranscriptionRetryDecision::Stop;
+    };
+
+    match error {
+        TranscriptionRequestError::Timeout(_) | TranscriptionRequestError::Send(_) => {
+            TranscriptionRetryDecision::Retry { delay: None }
+        }
+        TranscriptionRequestError::Status { status, .. }
+            if matches!(
+                *status,
+                reqwest::StatusCode::BAD_GATEWAY
+                    | reqwest::StatusCode::SERVICE_UNAVAILABLE
+                    | reqwest::StatusCode::GATEWAY_TIMEOUT
+            ) =>
+        {
+            TranscriptionRetryDecision::Retry { delay: None }
+        }
+        TranscriptionRequestError::Status {
+            status,
+            retry_after,
+            ..
+        } if *status == reqwest::StatusCode::TOO_MANY_REQUESTS => match retry_after {
+            Some(delay) if *delay <= next_attempt.timeout => TranscriptionRetryDecision::Retry {
+                delay: Some(*delay),
+            },
+            Some(_) => TranscriptionRetryDecision::Stop,
+            None => TranscriptionRetryDecision::Retry { delay: None },
+        },
+        TranscriptionRequestError::Build(_)
+        | TranscriptionRequestError::Status { .. }
+        | TranscriptionRequestError::Json(_) => TranscriptionRetryDecision::Stop,
+    }
+}
+
+/// Reads `Retry-After` as a `u64` number of seconds; unparsable values yield `None`.
+fn retry_after_duration(headers: &reqwest::header::HeaderMap) -> Option<Duration> {
+    let raw = headers.get(reqwest::header::RETRY_AFTER)?;
+    // Values rejected by `to_str` or by integer parsing (such as HTTP dates) count as absent.
+    let seconds = raw.to_str().ok()?.parse::<u64>().ok()?;
+    Some(Duration::from_secs(seconds))
+}
+
+async fn send_transcription_request_with_timeout(
+    request: reqwest::RequestBuilder,
+    timeout: Duration,
+) -> Result<serde_json::Value, TranscriptionRequestError> {
+    // Use an explicit async deadline because reqwest otherwise has no end-to-end request timeout
+    // on this client builder.
+    with_transcription_timeout(send_transcription_request(request), timeout).await
+}
+
+/// Awaits `future`, turning an elapsed `timeout` into `TranscriptionRequestError::Timeout`.
+async fn with_transcription_timeout<F, T>(
+    future: F,
+    timeout: Duration,
+) -> Result<T, TranscriptionRequestError>
+where
+    F: Future<Output = Result<T, TranscriptionRequestError>>,
+{
+    tokio::time::timeout(timeout, future)
+        .await
+        .unwrap_or_else(|_elapsed| Err(TranscriptionRequestError::Timeout(timeout)))
+}
+
+async fn send_transcription_request(
+    request: reqwest::RequestBuilder,
+) -> Result<serde_json::Value, TranscriptionRequestError> {
+    let response = request
+        .send()
+        .await
+        .map_err(|error| TranscriptionRequestError::Send(error.to_string()))?;
+    if !response.status().is_success() {
+        let status = response.status();
+        let retry_after = retry_after_duration(response.headers());
+        let body = response
+            .text()
+            .await
+            .unwrap_or_else(|_| "<failed to read body>".to_string());
+        return Err(TranscriptionRequestError::Status {
+            status,
+            body,
+            retry_after,
+        });
+    }
+
+    let value: serde_json::Value = response
+        .json()
+        .await
+        .map_err(|error| TranscriptionRequestError::Json(error.to_string()))?;
+    Ok(value)
+}
+
 #[cfg(test)]
 mod tests {
+    use super::RecordedAudio;
+    use super::TranscriptionAttempt;
+    use super::TranscriptionRequestError;
+    use super::TranscriptionRetryDecision;
     use super::convert_pcm16;
+    use super::encode_wav_normalized;
+    use super::send_transcription_request_with_timeout;
+    use super::transcription_request_attempts;
+    use super::transcription_retry_decision;
     use pretty_assertions::assert_eq;
+    use std::io::Cursor;
+    use std::net::Ipv4Addr;
+    use std::time::Duration;
+    use tokio::time;
 
     #[test]
     fn convert_pcm16_downmixes_and_resamples_for_model_input() {
@@ -486,4 +1218,182 @@ mod tests {
         );
         assert_eq!(converted, vec![200, 700]);
     }
+
+    #[test]
+    fn encode_wav_normalized_outputs_24khz_mono_audio() {
+        let audio = RecordedAudio {
+            data: vec![100, -100, 200, -200],
+            sample_rate: 48_000,
+            channels: 2,
+        };
+
+        let bytes = encode_wav_normalized(&audio).unwrap();
+        let reader = hound::WavReader::new(Cursor::new(bytes)).unwrap();
+        let spec = reader.spec();
+
+        assert_eq!(spec.channels, 1);
+        assert_eq!(spec.sample_rate, 24_000);
+        assert_eq!(spec.bits_per_sample, 16);
+    }
+
+    #[test]
+    fn transcription_request_attempts_scale_with_audio_duration() {
+        let actual = [0.0, f32::NAN, 1.0, 5.0, 10.0, 20.0]
+            .into_iter()
+            .map(|duration_seconds| {
+                transcription_request_attempts(duration_seconds).map(|attempt| attempt.timeout)
+            })
+            .collect::<Vec<_>>();
+
+        assert_eq!(
+            actual,
+            vec![
+                [
+                    Duration::from_secs(2),
+                    Duration::from_secs(4),
+                    Duration::from_secs(60),
+                ],
+                [
+                    Duration::from_secs(2),
+                    Duration::from_secs(4),
+                    Duration::from_secs(60),
+                ],
+                [
+                    Duration::from_secs(2),
+                    Duration::from_secs(4),
+                    Duration::from_secs(60),
+                ],
+                [
+                    Duration::from_secs(10),
+                    Duration::from_secs(15),
+                    Duration::from_secs(60),
+                ],
+                [
+                    Duration::from_secs(15),
+                    Duration::from_secs(30),
+                    Duration::from_secs(60),
+                ],
+                [
+                    Duration::from_secs(15),
+                    Duration::from_secs(30),
+                    Duration::from_secs(60),
+                ],
+            ]
+        );
+    }
+
+    #[test]
+    fn transcription_retry_decision_retries_only_transient_failures() {
+        let next_attempt = Some(TranscriptionAttempt {
+            number: 2,
+            timeout: Duration::from_secs(4),
+        });
+
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Timeout(Duration::from_secs(2)),
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Retry { delay: None }
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Send("connection reset".to_string()),
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Retry { delay: None }
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Status {
+                    status: reqwest::StatusCode::BAD_GATEWAY,
+                    body: "bad gateway".to_string(),
+                    retry_after: None,
+                },
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Retry { delay: None }
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Status {
+                    status: reqwest::StatusCode::TOO_MANY_REQUESTS,
+                    body: "slow down".to_string(),
+                    retry_after: Some(Duration::from_secs(3)),
+                },
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Retry {
+                delay: Some(Duration::from_secs(3))
+            }
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Status {
+                    status: reqwest::StatusCode::TOO_MANY_REQUESTS,
+                    body: "slow down".to_string(),
+                    retry_after: Some(Duration::from_secs(5)),
+                },
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Stop
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Status {
+                    status: reqwest::StatusCode::UNAUTHORIZED,
+                    body: "no".to_string(),
+                    retry_after: None,
+                },
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Stop
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Json("invalid".to_string()),
+                next_attempt
+            ),
+            TranscriptionRetryDecision::Stop
+        );
+        assert_eq!(
+            transcription_retry_decision(
+                &TranscriptionRequestError::Timeout(Duration::from_secs(60)),
+                None
+            ),
+            TranscriptionRetryDecision::Stop
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn transcription_request_times_out_unresponsive_endpoint() {
+        let listener = tokio::net::TcpListener::bind((Ipv4Addr::LOCALHOST, 0))
+            .await
+            .unwrap();
+        let url = format!("http://{}/transcribe", listener.local_addr().unwrap());
+        let server = tokio::spawn(async move {
+            let (_socket, _) = listener.accept().await.unwrap();
+            std::future::pending::<()>().await;
+        });
+
+        let timeout = Duration::from_secs(10);
+        let task = tokio::spawn(send_transcription_request_with_timeout(
+            reqwest::Client::new().get(url),
+            timeout,
+        ));
+        tokio::task::yield_now().await;
+        time::advance(timeout).await;
+
+        let err = time::timeout(Duration::from_millis(1), task)
+            .await
+            .unwrap()
+            .unwrap()
+            .unwrap_err();
+        server.abort();
+
+        assert_eq!(
+            err.to_string(),
+            "transcription request timed out after 10.00s"
+        );
+    }
 }
diff --git a/docs/tui-chat-composer.md b/docs/tui-chat-composer.md
index 0ad5c693b3..f4b02cc7e6 100644
--- a/docs/tui-chat-composer.md
+++ b/docs/tui-chat-composer.md
@@ -84,6 +84,7 @@ Flags:
 - `popups_enabled`
 - `slash_commands_enabled`
 - `image_paste_enabled`
+- `voice_transcription_space_hold_delay_ms`
 
 Key effects when disabled:
 
@@ -91,6 +92,8 @@ Key effects when disabled:
 - When `slash_commands_enabled` is `false`, the composer does not treat `/...` input as commands.
 - When `slash_commands_enabled` is `false`, slash-context paste-burst exceptions are disabled.
 - When `image_paste_enabled` is `false`, file-path paste image attachment is skipped.
+- `voice_transcription_space_hold_delay_ms` is a delay, not an on/off flag: it only affects the
+  Space hold path when the composer is non-empty; empty composers start voice capture immediately.
 - `ChatWidget` may toggle `image_paste_enabled` at runtime based on the selected model's
   `input_modalities`; attach and submit paths also re-check support and emit a warning instead of
   dropping the draft.
@@ -120,6 +123,14 @@ the input starts with `!` (shell command).
 The same preparation path is reused for slash commands with arguments (for example `/plan` and
 `/review`) so pasted content and text elements are preserved when extracting args.
 
+If a voice transcription placeholder is still resolving, `handle_submission` records whether the
+user pressed Enter (submit) or Tab (queue while a task is running, submit otherwise) and leaves the
+draft visible. While that pending transcription submission exists, key edits are ignored so the
+submitted draft cannot drift from what the user committed. When transcription completes, the
+placeholder is replaced with the produced text and the normal `prepare_submission_text` path runs.
+If transcription fails, the placeholder is removed, the draft remains editable, and the UI renders
+a transcription error instead of submitting anything.
+
 The composer also treats the textarea kill buffer as separate editing state from the visible draft.
 After submit or slash-command dispatch clears the textarea, the most recent `Ctrl+K` payload is
 still available for `Ctrl+Y`. This supports flows where a user kills part of a draft, runs a