tui: make voice hold delay configurable

Add a config value for the Space hold threshold, wire it into both TUI composer implementations, and document the default behavior. Also mirror the voice transcription feature key and Space-hold delay in config-schema, and keep placeholder replacement available on Linux for the voice path. (cherry picked from commit 9fa36434d566153cc69d2fabac095e313d93be51) (cherry picked from commit 749a0f42619c8cb66514c0bce220b775c10916e3)
2026-06-01 19:02:59 +00:00 · 2026-04-23 17:20:10 -04:00
parent a7c5f106cb
commit be5f34a898
18 changed files with 2295 additions and 78 deletions
--- a/codex-rs/core/config.schema.json
+++ b/codex-rs/core/config.schema.json
@@ -590,6 +590,9 @@
            "use_linux_sandbox_bwrap": {
              "type": "boolean"
            },
+            "voice_transcription": {
+              "type": "boolean"
+            },
            "web_search": {
              "type": "boolean"
            },
@@ -3633,6 +3636,9 @@
        "use_linux_sandbox_bwrap": {
          "type": "boolean"
        },
+        "voice_transcription": {
+          "type": "boolean"
+        },
        "web_search": {
          "type": "boolean"
        },
@@ -4043,6 +4049,12 @@
      ],
      "description": "Collection of settings that are specific to the TUI."
    },
+    "voice_transcription_space_hold_delay_ms": {
+      "description": "Delay before holding Space on a non-empty composer switches into voice transcription instead of inserting a literal space.",
+      "format": "uint64",
+      "minimum": 0.0,
+      "type": "integer"
+    },
    "watchdog_interval_s": {
      "description": "Watchdog polling interval in seconds.",
      "format": "int64",
--- a/codex-rs/core/src/config/config_tests.rs
+++ b/codex-rs/core/src/config/config_tests.rs
@@ -6045,6 +6045,8 @@ async fn test_precedence_fixture_with_o3_profile() -> std::io::Result<()> {
            personality: Some(Personality::Pragmatic),
            chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
            realtime_audio: RealtimeAudioConfig::default(),
+            voice_transcription_space_hold_delay_ms:
+                DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
            experimental_realtime_start_instructions: None,
            experimental_realtime_ws_base_url: None,
            experimental_realtime_ws_model: None,
@@ -6244,6 +6246,7 @@ async fn test_precedence_fixture_with_gpt3_profile() -> std::io::Result<()> {
        personality: Some(Personality::Pragmatic),
        chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
        realtime_audio: RealtimeAudioConfig::default(),
+        voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
        experimental_realtime_start_instructions: None,
        experimental_realtime_ws_base_url: None,
        experimental_realtime_ws_model: None,
@@ -6397,6 +6400,7 @@ async fn test_precedence_fixture_with_zdr_profile() -> std::io::Result<()> {
        personality: Some(Personality::Pragmatic),
        chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
        realtime_audio: RealtimeAudioConfig::default(),
+        voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
        experimental_realtime_start_instructions: None,
        experimental_realtime_ws_base_url: None,
        experimental_realtime_ws_model: None,
@@ -6535,6 +6539,7 @@ async fn test_precedence_fixture_with_gpt5_profile() -> std::io::Result<()> {
        personality: Some(Personality::Pragmatic),
        chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
        realtime_audio: RealtimeAudioConfig::default(),
+        voice_transcription_space_hold_delay_ms: DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS,
        experimental_realtime_start_instructions: None,
        experimental_realtime_ws_base_url: None,
        experimental_realtime_ws_model: None,
@@ -8677,6 +8682,29 @@ speaker = "Desk Speakers"
    Ok(())
 }

+#[tokio::test]
+async fn voice_transcription_space_hold_delay_loads_from_config_toml() -> std::io::Result<()> {
+    let cfg: ConfigToml = toml::from_str(
+        r#"
+voice_transcription_space_hold_delay_ms = 250
+"#,
+    )
+    .expect("TOML deserialization should succeed");
+
+    assert_eq!(cfg.voice_transcription_space_hold_delay_ms, Some(250));
+
+    let codex_home = TempDir::new()?;
+    let config = Config::load_from_base_config_with_overrides(
+        cfg,
+        ConfigOverrides::default(),
+        codex_home.abs(),
+    )
+    .await?;
+
+    assert_eq!(config.voice_transcription_space_hold_delay_ms, 250);
+    Ok(())
+}
+
 #[derive(Deserialize, Debug, PartialEq)]
 struct TuiTomlTest {
    #[serde(default, flatten)]
--- a/codex-rs/core/src/config/mod.rs
+++ b/codex-rs/core/src/config/mod.rs
@@ -168,6 +168,8 @@ pub(crate) const DEFAULT_AGENT_MAX_DEPTH: i32 = 1;
 pub(crate) const DEFAULT_AGENT_JOB_MAX_RUNTIME_SECONDS: Option<u64> = None;
 const LOCAL_DEV_BUILD_VERSION: &str = "0.0.0";
 pub(crate) const DEFAULT_WATCHDOG_INTERVAL_S: i64 = 10;
+/// Default hold time (ms) for Space on a non-empty composer to switch into voice transcription.
+pub const DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS: u64 = 1_000;

 pub const CONFIG_TOML_FILE: &str = "config.toml";

@@ -672,6 +674,10 @@ pub struct Config {
    /// Machine-local realtime audio device preferences used by realtime voice.
    pub realtime_audio: RealtimeAudioConfig,

+    /// How long (ms) Space must be held on a non-empty composer before it switches
+    /// into voice transcription instead of inserting a literal space.
+    pub voice_transcription_space_hold_delay_ms: u64,
+
    /// Experimental / do not use. Overrides only the realtime conversation
    /// websocket transport base URL (the `Op::RealtimeConversation`
    /// `/v1/realtime`
@@ -2804,6 +2810,9 @@ impl Config {
                    microphone: audio.microphone,
                    speaker: audio.speaker,
                }),
+            voice_transcription_space_hold_delay_ms: cfg
+                .voice_transcription_space_hold_delay_ms
+                .unwrap_or(DEFAULT_VOICE_TRANSCRIPTION_SPACE_HOLD_DELAY_MS),
            experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url,
            experimental_realtime_ws_model: cfg.experimental_realtime_ws_model,
            realtime: cfg