voice transcription (#3381)

Adds voice transcription on press-and-hold of spacebar.


https://github.com/user-attachments/assets/85039314-26f3-46d1-a83b-8c4a4a1ecc21

---------

Co-authored-by: Codex <199175422+chatgpt-codex-connector[bot]@users.noreply.github.com>
Co-authored-by: David Zbarsky <zbarsky@openai.com>
This commit is contained in:
Jeremy Rose 2026-02-23 14:15:18 -08:00 committed by GitHub
parent 50953ea39a
commit 855e275591
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 2538 additions and 446 deletions

View file

@ -1,4 +1,11 @@
load("@apple_support//xcode:xcode_config.bzl", "xcode_config")
load("@rules_cc//cc:defs.bzl", "cc_shared_library")
cc_shared_library(
name = "clang",
deps = ["@llvm-project//clang:libclang"],
visibility = ["//visibility:public"],
)
xcode_config(name = "disable_xcode")

View file

@ -1,5 +1,7 @@
module(name = "codex")
bazel_dep(name = "platforms", version = "1.0.0")
bazel_dep(name = "toolchains_llvm_bootstrapped", version = "0.5.3")
bazel_dep(name = "toolchains_llvm_bootstrapped", version = "0.5.6")
single_version_override(
module_name = "toolchains_llvm_bootstrapped",
patch_strip = 1,
@ -8,6 +10,8 @@ single_version_override(
],
)
register_toolchains("@toolchains_llvm_bootstrapped//toolchain:all")
osx = use_extension("@toolchains_llvm_bootstrapped//extensions:osx.bzl", "osx")
osx.framework(name = "ApplicationServices")
osx.framework(name = "AppKit")
@ -16,8 +20,12 @@ osx.framework(name = "CoreFoundation")
osx.framework(name = "CoreGraphics")
osx.framework(name = "CoreServices")
osx.framework(name = "CoreText")
osx.framework(name = "AudioToolbox")
osx.framework(name = "CFNetwork")
osx.framework(name = "FontServices")
osx.framework(name = "AudioUnit")
osx.framework(name = "CoreAudio")
osx.framework(name = "CoreAudioTypes")
osx.framework(name = "Foundation")
osx.framework(name = "ImageIO")
osx.framework(name = "IOKit")
@ -25,10 +33,7 @@ osx.framework(name = "Kernel")
osx.framework(name = "OSLog")
osx.framework(name = "Security")
osx.framework(name = "SystemConfiguration")
register_toolchains(
"@toolchains_llvm_bootstrapped//toolchain:all",
)
use_repo(osx, "macosx15.4.sdk")
# Needed to disable xcode...
bazel_dep(name = "apple_support", version = "2.1.0")
@ -39,9 +44,9 @@ bazel_dep(name = "rules_rs", version = "0.0.23")
# Special toolchains branch
archive_override(
module_name = "rules_rs",
integrity = "sha256-YbDRjZos4UmfIPY98znK1BgBWRQ1/ui3CtL6RqxE30I=",
strip_prefix = "rules_rs-6cf3d940fdc48baf3ebd6c37daf8e0be8fc73ecb",
url = "https://github.com/dzbarsky/rules_rs/archive/6cf3d940fdc48baf3ebd6c37daf8e0be8fc73ecb.tar.gz",
integrity = "sha256-O34UF4H7b1Qacu3vlu2Od4ILGVApzg5j1zl952SFL3w=",
strip_prefix = "rules_rs-097123c2aa72672e371e69e7035869f5a45c7b2b",
url = "https://github.com/dzbarsky/rules_rs/archive/097123c2aa72672e371e69e7035869f5a45c7b2b.tar.gz",
)
rules_rust = use_extension("@rules_rs//rs/experimental:rules_rust.bzl", "rules_rust")
@ -134,6 +139,9 @@ crate.annotation(
"OPENSSL_NO_VENDOR": "1",
"OPENSSL_STATIC": "1",
},
crate_features = [
"dep:openssl-src",
],
crate = "openssl-sys",
data = ["@openssl//:gen_dir"],
)
@ -145,6 +153,28 @@ crate.annotation(
workspace_cargo_toml = "rust/runfiles/Cargo.toml",
)
llvm = use_extension("@toolchains_llvm_bootstrapped//extensions:llvm.bzl", "llvm")
use_repo(llvm, "llvm-project")
crate.annotation(
# Provide the hermetic SDK path so the build script doesn't try to invoke an unhermetic `xcrun --show-sdk-path`.
build_script_data = [
"@macosx15.4.sdk//sysroot",
],
build_script_env = {
"BINDGEN_EXTRA_CLANG_ARGS": "-isystem $(location @toolchains_llvm_bootstrapped//:builtin_headers)",
"COREAUDIO_SDK_PATH": "$(location @macosx15.4.sdk//sysroot)",
"LIBCLANG_PATH": "$(location @codex//:clang)",
},
build_script_tools = [
"@codex//:clang",
"@toolchains_llvm_bootstrapped//:builtin_headers",
],
crate = "coreaudio-sys",
)
inject_repo(crate, "codex", "toolchains_llvm_bootstrapped", "macosx15.4.sdk")
# Fix readme inclusions
crate.annotation(
crate = "windows-link",
@ -175,6 +205,17 @@ crate.annotation(
gen_build_script = "off",
deps = [":windows_import_lib"],
)
bazel_dep(name = "alsa_lib", version = "1.2.9.bcr.4")
crate.annotation(
crate = "alsa-sys",
gen_build_script = "off",
deps = ["@alsa_lib"],
)
inject_repo(crate, "alsa_lib")
use_repo(crate, "crates")
rbe_platform_repository = use_repo_rule("//:rbe.bzl", "rbe_platform_repository")

278
MODULE.bazel.lock generated

File diff suppressed because one or more lines are too long

924
codex-rs/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -415,6 +415,9 @@
"use_linux_sandbox_bwrap": {
"type": "boolean"
},
"voice_transcription": {
"type": "boolean"
},
"web_search": {
"type": "boolean"
},
@ -1696,6 +1699,9 @@
"use_linux_sandbox_bwrap": {
"type": "boolean"
},
"voice_transcription": {
"type": "boolean"
},
"web_search": {
"type": "boolean"
},

View file

@ -138,6 +138,8 @@ pub enum Feature {
CollaborationModes,
/// Enable personality selection in the TUI.
Personality,
/// Enable voice transcription in the TUI composer.
VoiceTranscription,
/// Prevent idle system sleep while a turn is actively running.
PreventIdleSleep,
/// Use the Responses API WebSocket transport for OpenAI by default.
@ -627,6 +629,12 @@ pub const FEATURES: &[FeatureSpec] = &[
stage: Stage::Stable,
default_enabled: true,
},
FeatureSpec {
id: Feature::VoiceTranscription,
key: "voice_transcription",
stage: Stage::UnderDevelopment,
default_enabled: false,
},
FeatureSpec {
id: Feature::PreventIdleSleep,
key: "prevent_idle_sleep",

View file

@ -431,7 +431,7 @@ fn build_authorize_url(
("redirect_uri".to_string(), redirect_uri.to_string()),
(
"scope".to_string(),
"openid profile email offline_access".to_string(),
"openid profile email offline_access api.model.audio.request".to_string(),
),
(
"code_challenge".to_string(),

View file

@ -13,10 +13,12 @@ name = "codex_tui"
path = "src/lib.rs"
[features]
default = ["voice-input"]
# Enable vt100-based tests (emulator) when running with `--features vt100-tests`.
vt100-tests = []
# Gate verbose debug logging inside the TUI implementation.
debug-logs = []
voice-input = ["dep:cpal", "dep:hound"]
[lints]
workspace = true
@ -68,7 +70,7 @@ ratatui = { workspace = true, features = [
] }
ratatui-macros = { workspace = true }
regex-lite = { workspace = true }
reqwest = { version = "0.12", features = ["json"] }
reqwest = { workspace = true, features = ["json", "multipart"] }
rmcp = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true, features = ["preserve_order"] }
@ -104,6 +106,10 @@ uuid = { workspace = true }
codex-windows-sandbox = { workspace = true }
tokio-util = { workspace = true, features = ["time"] }
[target.'cfg(not(target_os = "linux"))'.dependencies]
cpal = { version = "0.15", optional = true }
hound = { version = "3.5", optional = true }
[target.'cfg(unix)'.dependencies]
libc = { workspace = true }

View file

@ -1547,6 +1547,8 @@ impl App {
{
return Ok(AppRunControl::Continue);
}
// Allow widgets to process any pending timers before rendering.
self.chat_widget.pre_draw_tick();
tui.draw(
self.chat_widget.desired_height(tui.terminal.size()?.width),
|frame| {
@ -2703,6 +2705,22 @@ impl App {
));
}
},
#[cfg(not(target_os = "linux"))]
AppEvent::TranscriptionComplete { id, text } => {
self.chat_widget.replace_transcription(&id, &text);
}
#[cfg(not(target_os = "linux"))]
AppEvent::TranscriptionFailed { id, error: _ } => {
self.chat_widget.remove_transcription_placeholder(&id);
}
#[cfg(not(target_os = "linux"))]
AppEvent::UpdateRecordingMeter { id, text } => {
// Update in place to preserve the element id for subsequent frames.
let updated = self.chat_widget.update_transcription_in_place(&id, &text);
if updated {
tui.frame_requester().schedule_frame();
}
}
AppEvent::StatusLineSetup { items } => {
let ids = items.iter().map(ToString::to_string).collect::<Vec<_>>();
let edit = codex_core::config::edit::status_line_items_edit(&ids);
@ -3106,7 +3124,7 @@ impl App {
self.chat_widget.handle_key_event(key_event);
}
_ => {
// Forward remaining event kinds (e.g. Release) to the widget; voice
// hold-to-talk relies on observing Space releases.
self.chat_widget.handle_key_event(key_event);
}
};
}

View file

@ -321,6 +321,29 @@ pub(crate) enum AppEvent {
/// Re-open the permissions presets popup.
OpenPermissionsPopup,
/// Live update for the in-progress voice recording placeholder. Carries
/// the placeholder `id` and the text to display (e.g., an ASCII meter).
#[cfg(not(target_os = "linux"))]
UpdateRecordingMeter {
id: String,
text: String,
},
/// Voice transcription finished for the given placeholder id.
#[cfg(not(target_os = "linux"))]
TranscriptionComplete {
id: String,
text: String,
},
/// Voice transcription failed; remove the placeholder identified by `id`.
#[cfg(not(target_os = "linux"))]
TranscriptionFailed {
id: String,
#[allow(dead_code)]
error: String,
},
/// Open the branch picker option from the review popup.
OpenReviewBranchPicker(PathBuf),

View file

@ -109,6 +109,17 @@
//! edits and renders a placeholder prompt instead of the editable textarea. This is part of the
//! overall state machine, since it affects which transitions are even possible from a given UI
//! state.
//!
//! # Voice Hold-To-Talk Without Key Release
//!
//! On terminals that do not report `KeyEventKind::Release`, space hold-to-talk uses repeated
//! space key events as "still held" evidence:
//!
//! - For pending holds (non-empty composer), if timeout elapses without any repeated space event,
//! we treat the key as a normal typed space.
//! - If repeated space events are seen before timeout, we proceed with hold-to-talk.
//! - While recording, repeated space events keep the recording alive; if they stop for a short
//! window, we stop and transcribe.
use crate::bottom_pane::footer::mode_indicator_line;
use crate::key_hint;
use crate::key_hint::KeyBinding;
@ -191,6 +202,7 @@ use crate::bottom_pane::textarea::TextAreaState;
use crate::clipboard_paste::normalize_pasted_path;
use crate::clipboard_paste::pasted_image_format;
use crate::history_cell;
use crate::tui::FrameRequester;
use crate::ui_consts::LIVE_PREFIX_COLS;
use codex_chatgpt::connectors;
use codex_chatgpt::connectors::AppInfo;
@ -202,9 +214,17 @@ use std::collections::HashSet;
use std::collections::VecDeque;
use std::ops::Range;
use std::path::PathBuf;
use std::sync::Arc;
#[cfg(not(target_os = "linux"))]
use std::sync::Mutex;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
#[cfg(not(target_os = "linux"))]
use std::thread;
use std::time::Duration;
use std::time::Instant;
#[cfg(not(target_os = "linux"))]
use tokio::runtime::Handle;
/// If the pasted content exceeds this number of characters, replace it with a
/// placeholder in the UI.
const LARGE_PASTE_CHAR_THRESHOLD: usize = 1000;
@ -284,6 +304,35 @@ impl ChatComposerConfig {
}
}
}
/// Per-composer state for spacebar hold-to-talk and voice capture.
///
/// The `space_hold_*` fields track a *pending* hold (the window after a Space
/// press, before we commit to either a typed space or a recording); the
/// `cfg`-gated fields track an *active* recording.
#[derive(Default)]
struct VoiceState {
    /// Feature flag; when false all hold-to-talk handling is bypassed.
    transcription_enabled: bool,
    // Spacebar hold-to-talk state.
    /// When the pending Space hold began; `None` when no hold is pending.
    space_hold_started_at: Option<Instant>,
    /// Id of the temporary textarea element inserted for the held Space.
    space_hold_element_id: Option<String>,
    /// Flag flipped by the delayed timer task once the hold timeout elapses.
    space_hold_trigger: Option<Arc<AtomicBool>>,
    /// True once any `KeyEventKind::Release` has been observed (or the
    /// terminal advertised enhanced keys at construction time).
    key_release_supported: bool,
    /// On terminals without release events: whether a repeated Space event was
    /// seen while the hold was pending ("still held" evidence).
    space_hold_repeat_seen: bool,
    /// Active capture session, if recording.
    #[cfg(not(target_os = "linux"))]
    voice: Option<crate::voice::VoiceCapture>,
    /// Id of the live-meter placeholder element shown while recording.
    #[cfg(not(target_os = "linux"))]
    recording_placeholder_id: Option<String>,
    /// When the current recording started; only populated when release events
    /// are unsupported and repeat-based keepalive is in effect.
    #[cfg(not(target_os = "linux"))]
    space_recording_started_at: Option<Instant>,
    /// Last time a repeated Space event kept the recording alive.
    #[cfg(not(target_os = "linux"))]
    space_recording_last_repeat_at: Option<Instant>,
}
impl VoiceState {
    /// Build a fresh state, seeding `key_release_supported` from the caller's
    /// knowledge of the terminal's key-reporting capabilities.
    fn new(key_release_supported: bool) -> Self {
        Self {
            key_release_supported,
            ..Default::default()
        }
    }
}
pub(crate) struct ChatComposer {
textarea: TextArea,
textarea_state: RefCell<TextAreaState>,
@ -299,10 +348,14 @@ pub(crate) struct ChatComposer {
pending_pastes: Vec<(String, String)>,
large_paste_counters: HashMap<usize, usize>,
has_focus: bool,
frame_requester: Option<FrameRequester>,
/// Invariant: attached images are labeled in vec order as
/// `[Image #M+1]..[Image #N]`, where `M` is the number of remote images.
attached_images: Vec<AttachedImage>,
placeholder_text: String,
voice_state: VoiceState,
// Spinner control flags keyed by placeholder id; set to true to stop.
spinner_stop_flags: HashMap<String, Arc<AtomicBool>>,
is_task_running: bool,
/// When false, the composer is temporarily read-only (e.g. during sandbox setup).
input_enabled: bool,
@ -320,6 +373,9 @@ pub(crate) struct ChatComposer {
selected_remote_image_index: Option<usize>,
footer_flash: Option<FooterFlash>,
context_window_percent: Option<i64>,
// Monotonically increasing identifier for textarea elements we insert.
#[cfg(not(target_os = "linux"))]
next_element_id: u64,
context_window_used_tokens: Option<i64>,
skills: Option<Vec<SkillMetadata>>,
connectors_snapshot: Option<ConnectorsSnapshot>,
@ -407,8 +463,11 @@ impl ChatComposer {
pending_pastes: Vec::new(),
large_paste_counters: HashMap::new(),
has_focus: has_input_focus,
frame_requester: None,
attached_images: Vec::new(),
placeholder_text,
voice_state: VoiceState::new(enhanced_keys_supported),
spinner_stop_flags: HashMap::new(),
is_task_running: false,
input_enabled: true,
input_disabled_placeholder: None,
@ -421,6 +480,8 @@ impl ChatComposer {
selected_remote_image_index: None,
footer_flash: None,
context_window_percent: None,
#[cfg(not(target_os = "linux"))]
next_element_id: 0,
context_window_used_tokens: None,
skills: None,
connectors_snapshot: None,
@ -442,6 +503,17 @@ impl ChatComposer {
this
}
#[cfg(not(target_os = "linux"))]
/// Mint a fresh textarea-element id. Ids are a monotonically increasing
/// counter rendered as a decimal string, wrapping on overflow.
fn next_id(&mut self) -> String {
    let assigned = self.next_element_id;
    self.next_element_id = assigned.wrapping_add(1);
    format!("{assigned}")
}
/// Store a frame requester so background timers (e.g. the space-hold timeout)
/// can schedule a redraw when they fire.
pub(crate) fn set_frame_requester(&mut self, frame_requester: FrameRequester) {
    self.frame_requester = Some(frame_requester);
}
pub fn set_skill_mentions(&mut self, skills: Option<Vec<SkillMetadata>>) {
self.skills = skills;
}
@ -505,6 +577,23 @@ impl ChatComposer {
/// Toggle availability of the personality command in the composer.
pub fn set_personality_command_enabled(&mut self, enabled: bool) {
    self.personality_command_enabled = enabled;
}
/// Enable or disable voice transcription for this composer.
///
/// Disabling also cancels any pending spacebar hold: the timer state is
/// dropped and the temporary hold element is turned back into a plain space.
pub fn set_voice_transcription_enabled(&mut self, enabled: bool) {
    self.voice_state.transcription_enabled = enabled;
    if enabled {
        return;
    }
    self.voice_state.space_hold_started_at = None;
    if let Some(id) = self.voice_state.space_hold_element_id.take() {
        let _ = self.textarea.replace_element_by_id(&id, " ");
    }
    self.voice_state.space_hold_trigger = None;
    self.voice_state.space_hold_repeat_seen = false;
}
#[cfg(not(target_os = "linux"))]
/// Whether hold-to-talk is currently enabled.
///
/// This accessor is only compiled on non-Linux targets, so the former
/// `&& cfg!(not(target_os = "linux"))` conjunct was a compile-time constant
/// `true`; the runtime feature flag alone decides.
fn voice_transcription_enabled(&self) -> bool {
    self.voice_state.transcription_enabled
}
/// Centralized feature gating keeps config checks out of call sites.
fn popups_enabled(&self) -> bool {
self.config.popups_enabled
@ -568,6 +657,20 @@ impl ChatComposer {
}
}
/// Compute the terminal cursor position for the composer within `area`, or
/// `None` when the cursor should be hidden.
pub fn cursor_pos(&self, area: Rect) -> Option<(u16, u16)> {
    // No cursor while the composer is read-only.
    if !self.input_enabled {
        return None;
    }
    // Hide the cursor while recording voice input.
    #[cfg(not(target_os = "linux"))]
    if self.voice_state.voice.is_some() {
        return None;
    }
    // Third layout slot is the textarea region; delegate to the textarea with
    // its current scroll/cursor state.
    let [_, _, textarea_rect, _] = self.layout_areas(area);
    let state = *self.textarea_state.borrow();
    self.textarea.cursor_pos_with_state(textarea_rect, state)
}
/// Returns true if the composer currently contains no user-entered input.
pub(crate) fn is_empty(&self) -> bool {
self.textarea.is_empty()
@ -621,6 +724,10 @@ impl ChatComposer {
/// In all cases, clears any paste-burst Enter suppression state so a real paste cannot affect
/// the next user Enter key, then syncs popup state.
pub fn handle_paste(&mut self, pasted: String) -> bool {
#[cfg(not(target_os = "linux"))]
if self.voice_state.voice.is_some() {
return false;
}
let pasted = pasted.replace("\r\n", "\n").replace('\r', "\n");
let char_count = pasted.chars().count();
if char_count > LARGE_PASTE_CHAR_THRESHOLD {
@ -633,9 +740,8 @@ impl ChatComposer {
{
self.textarea.insert_str(" ");
} else {
self.textarea.insert_str(&pasted);
self.insert_str(&pasted);
}
// Explicit paste events should not trigger Enter suppression.
self.paste_burst.clear_after_explicit_paste();
self.sync_popups();
true
@ -866,6 +972,9 @@ impl ChatComposer {
local_image_paths: Vec<PathBuf>,
mention_bindings: Vec<MentionBinding>,
) {
#[cfg(not(target_os = "linux"))]
self.stop_all_transcription_spinners();
// Clear any existing content, placeholders, and attachments first.
self.textarea.set_text_clearing_elements("");
self.pending_pastes.clear();
@ -1124,20 +1233,56 @@ impl ChatComposer {
/// Handle a key event coming from the main UI.
///
/// Returns the input result plus whether the UI needs a redraw. Dispatch
/// order matters: recording intercepts everything, then the read-only gate,
/// then release filtering, then hold-to-talk, then the popups/textarea.
pub fn handle_key_event(&mut self, key_event: KeyEvent) -> (InputResult, bool) {
    // Seeing any Release event proves the terminal reports key releases.
    if matches!(key_event.kind, KeyEventKind::Release) {
        self.voice_state.key_release_supported = true;
    }
    // Timer-based conversion is handled in the pre-draw tick.
    // If recording, stop on Space release when supported. On terminals without key-release
    // events, Space repeat events are handled as "still held" and stop is driven by timeout
    // in `process_space_hold_trigger`.
    if let Some(result) = self.handle_key_event_while_recording(key_event) {
        return result;
    }
    if !self.input_enabled {
        return (InputResult::None, false);
    }
    // Outside of recording, ignore all key releases globally except for Space,
    // which is handled explicitly for hold-to-talk behavior below.
    if matches!(key_event.kind, KeyEventKind::Release)
        && !matches!(key_event.code, KeyCode::Char(' '))
    {
        return (InputResult::None, false);
    }
    // If a space hold is pending and another non-space key is pressed, cancel the hold
    // and convert the element into a plain space.
    if self.voice_state.space_hold_started_at.is_some()
        && !matches!(key_event.code, KeyCode::Char(' '))
    {
        self.voice_state.space_hold_started_at = None;
        if let Some(id) = self.voice_state.space_hold_element_id.take() {
            let _ = self.textarea.replace_element_by_id(&id, " ");
        }
        self.voice_state.space_hold_trigger = None;
        self.voice_state.space_hold_repeat_seen = false;
        // fall through to normal handling of this other key
    }
    if let Some(result) = self.handle_voice_space_key_event(&key_event) {
        return result;
    }
    let result = match &mut self.active_popup {
        ActivePopup::Command(_) => self.handle_key_event_with_slash_popup(key_event),
        ActivePopup::File(_) => self.handle_key_event_with_file_popup(key_event),
        ActivePopup::Skill(_) => self.handle_key_event_with_skill_popup(key_event),
        ActivePopup::None => self.handle_key_event_without_popup(key_event),
    };
    // Update (or hide/show) popup after processing the key.
    self.sync_popups();
    result
}
@ -2535,6 +2680,7 @@ impl ChatComposer {
// -------------------------------------------------------------
KeyEvent {
code: KeyCode::Up | KeyCode::Down,
kind: KeyEventKind::Press | KeyEventKind::Repeat,
..
}
| KeyEvent {
@ -2588,6 +2734,136 @@ impl ChatComposer {
}
}
#[cfg(target_os = "linux")]
/// Voice capture is not compiled on Linux (cpal/hound are gated out), so
/// Space is never intercepted; always fall through to normal key handling.
fn handle_voice_space_key_event(
    &mut self,
    _key_event: &KeyEvent,
) -> Option<(InputResult, bool)> {
    None
}
#[cfg(not(target_os = "linux"))]
/// Intercept Space events for hold-to-talk when the feature is enabled.
///
/// Returns `Some((result, needs_redraw))` when the event was consumed, or
/// `None` to fall through to normal key handling.
fn handle_voice_space_key_event(
    &mut self,
    key_event: &KeyEvent,
) -> Option<(InputResult, bool)> {
    if !self.voice_transcription_enabled() || !matches!(key_event.code, KeyCode::Char(' ')) {
        return None;
    }
    match key_event.kind {
        KeyEventKind::Press => {
            // During a paste burst, Space is pasted text, not a hold.
            if self.paste_burst.is_active() {
                return None;
            }
            // If textarea is empty, start recording immediately without inserting a space.
            if self.textarea.text().is_empty() {
                if self.start_recording_with_placeholder() {
                    return Some((InputResult::None, true));
                }
                return None;
            }
            // If a hold is already pending, swallow further press events to
            // avoid inserting multiple spaces and resetting the timer on key repeat.
            if self.voice_state.space_hold_started_at.is_some() {
                if !self.voice_state.key_release_supported {
                    self.voice_state.space_hold_repeat_seen = true;
                }
                return Some((InputResult::None, false));
            }
            // Insert a named element that renders as a space so we can later
            // remove it on timeout or convert it to a plain space on release.
            let elem_id = self.next_id();
            self.textarea.insert_named_element(" ", elem_id.clone());
            // Record pending hold metadata.
            self.voice_state.space_hold_started_at = Some(Instant::now());
            self.voice_state.space_hold_element_id = Some(elem_id);
            self.voice_state.space_hold_repeat_seen = false;
            // Spawn a delayed task to flip an atomic flag; we check it on next key event.
            let flag = Arc::new(AtomicBool::new(false));
            let frame = self.frame_requester.clone();
            Self::schedule_space_hold_timer(flag.clone(), frame);
            self.voice_state.space_hold_trigger = Some(flag);
            Some((InputResult::None, true))
        }
        // If we see a repeat before release, handling occurs in the top-level pending block.
        KeyEventKind::Repeat => {
            // Swallow repeats while a hold is pending to avoid extra spaces.
            if self.voice_state.space_hold_started_at.is_some() {
                if !self.voice_state.key_release_supported {
                    self.voice_state.space_hold_repeat_seen = true;
                }
                return Some((InputResult::None, false));
            }
            // Fallback: if no pending hold, treat as normal input.
            None
        }
        // Space release without pending (fallback): treat as normal input.
        KeyEventKind::Release => {
            // If a hold is pending, convert the element to a plain space and clear state.
            self.voice_state.space_hold_started_at = None;
            if let Some(id) = self.voice_state.space_hold_element_id.take() {
                let _ = self.textarea.replace_element_by_id(&id, " ");
            }
            self.voice_state.space_hold_trigger = None;
            self.voice_state.space_hold_repeat_seen = false;
            Some((InputResult::None, true))
        }
    }
}
#[cfg(target_os = "linux")]
/// No recording state exists on Linux; never intercept keys here.
fn handle_key_event_while_recording(
    &mut self,
    _key_event: KeyEvent,
) -> Option<(InputResult, bool)> {
    None
}
#[cfg(not(target_os = "linux"))]
/// While a recording is active, decide per key event whether to stop it.
///
/// With release reporting: stop on Space release, or on any non-Space
/// press/repeat. Without it: Space repeats are "still held" keepalives; any
/// other press/repeat stops, and idle timeout is handled in the pre-draw tick.
/// Returns `None` when no recording is active.
fn handle_key_event_while_recording(
    &mut self,
    key_event: KeyEvent,
) -> Option<(InputResult, bool)> {
    if self.voice_state.voice.is_some() {
        let should_stop = if self.voice_state.key_release_supported {
            match key_event.kind {
                KeyEventKind::Release => matches!(key_event.code, KeyCode::Char(' ')),
                KeyEventKind::Press | KeyEventKind::Repeat => {
                    !matches!(key_event.code, KeyCode::Char(' '))
                }
            }
        } else {
            match key_event.kind {
                KeyEventKind::Release => matches!(key_event.code, KeyCode::Char(' ')),
                KeyEventKind::Press | KeyEventKind::Repeat => {
                    if matches!(key_event.code, KeyCode::Char(' ')) {
                        // Repeated Space keeps the recording alive.
                        self.voice_state.space_recording_last_repeat_at = Some(Instant::now());
                        false
                    } else {
                        true
                    }
                }
            }
        };
        if should_stop {
            let needs_redraw = self.stop_recording_and_start_transcription();
            return Some((InputResult::None, needs_redraw));
        }
        // Swallow non-stopping keys while recording.
        return Some((InputResult::None, false));
    }
    None
}
/// True when the composer content, after leading whitespace, starts with '!'
/// (the shell-command prefix).
fn is_bang_shell_command(&self) -> bool {
    let content = self.textarea.text();
    content.trim_start().starts_with('!')
}
@ -2607,8 +2883,6 @@ impl ChatComposer {
true
}
FlushResult::Typed(ch) => {
// Mirror insert_str() behavior so popups stay in sync when a
// pending fast char flushes as normal typed input.
self.textarea.insert_str(ch.to_string().as_str());
self.sync_popups();
true
@ -2632,6 +2906,12 @@ impl ChatComposer {
/// otherwise `clear_window_after_non_char()` can leave buffered text waiting without a
/// timestamp to time out against.
fn handle_input_basic(&mut self, input: KeyEvent) -> (InputResult, bool) {
    // Only Press/Repeat events constitute input; releases are dropped so the
    // paste-burst logic never appends the same character twice.
    match input.kind {
        KeyEventKind::Press | KeyEventKind::Repeat => {
            self.handle_input_basic_with_time(input, Instant::now())
        }
        _ => (InputResult::None, false),
    }
}
@ -2897,7 +3177,7 @@ impl ChatComposer {
.map(|items| if items.is_empty() { 0 } else { 1 })
}
fn sync_popups(&mut self) {
pub(crate) fn sync_popups(&mut self) {
self.sync_slash_command_elements();
if !self.popups_enabled() {
self.active_popup = ActivePopup::None;
@ -3311,6 +3591,11 @@ impl ChatComposer {
self.has_focus = has_focus;
}
#[cfg(not(target_os = "linux"))]
/// True while a voice capture session is in progress.
pub(crate) fn is_recording(&self) -> bool {
    self.voice_state.voice.is_some()
}
#[allow(dead_code)]
pub(crate) fn set_input_enabled(&mut self, enabled: bool, placeholder: Option<String>) {
self.input_enabled = enabled;
@ -3344,6 +3629,32 @@ impl ChatComposer {
}
}
#[cfg(not(target_os = "linux"))]
/// Arrange for `flag` to be set (and a frame scheduled) after the hold delay,
/// promoting a pending Space hold into hold-to-talk.
fn schedule_space_hold_timer(flag: Arc<AtomicBool>, frame: Option<FrameRequester>) {
    // Delay before a held Space stops being a "maybe typed space".
    const HOLD_DELAY_MILLIS: u64 = 500;
    let delay = Duration::from_millis(HOLD_DELAY_MILLIS);
    // Prefer the ambient tokio runtime; fall back to a dedicated OS thread so
    // the timer still fires outside of a runtime context.
    match Handle::try_current() {
        Ok(handle) => {
            handle.spawn(async move {
                tokio::time::sleep(delay).await;
                Self::complete_space_hold_timer(flag, frame);
            });
        }
        Err(_) => {
            thread::spawn(move || {
                thread::sleep(delay);
                Self::complete_space_hold_timer(flag, frame);
            });
        }
    }
}
#[cfg(not(target_os = "linux"))]
/// Timer completion: latch the flag that `process_space_hold_trigger` polls,
/// then request a redraw so that tick runs promptly.
fn complete_space_hold_timer(flag: Arc<AtomicBool>, frame: Option<FrameRequester>) {
    flag.store(true, Ordering::Relaxed);
    if let Some(requester) = frame {
        requester.schedule_frame();
    }
}
pub(crate) fn set_status_line(&mut self, status_line: Option<Line<'static>>) -> bool {
if self.status_line_value == status_line {
return false;
@ -3361,6 +3672,280 @@ impl ChatComposer {
}
}
#[cfg(not(target_os = "linux"))]
impl ChatComposer {
pub(crate) fn process_space_hold_trigger(&mut self) {
if self.voice_transcription_enabled()
&& let Some(flag) = self.voice_state.space_hold_trigger.as_ref()
&& flag.load(Ordering::Relaxed)
&& self.voice_state.space_hold_started_at.is_some()
&& self.voice_state.voice.is_none()
{
let _ = self.on_space_hold_timeout();
}
const SPACE_REPEAT_INITIAL_GRACE_MILLIS: u64 = 700;
const SPACE_REPEAT_IDLE_TIMEOUT_MILLIS: u64 = 250;
if !self.voice_state.key_release_supported && self.voice_state.voice.is_some() {
let now = Instant::now();
let initial_grace = Duration::from_millis(SPACE_REPEAT_INITIAL_GRACE_MILLIS);
let repeat_idle_timeout = Duration::from_millis(SPACE_REPEAT_IDLE_TIMEOUT_MILLIS);
if let Some(started_at) = self.voice_state.space_recording_started_at
&& now.saturating_duration_since(started_at) >= initial_grace
{
let should_stop = match self.voice_state.space_recording_last_repeat_at {
Some(last_repeat_at) => {
now.saturating_duration_since(last_repeat_at) >= repeat_idle_timeout
}
None => true,
};
if should_stop {
let _ = self.stop_recording_and_start_transcription();
}
}
}
}
/// Called when the 500ms space hold timeout elapses.
///
/// On terminals without key-release reporting, this only transitions into voice capture if we
/// observed repeated Space events while pending; otherwise the keypress is treated as a typed
/// space.
pub(crate) fn on_space_hold_timeout(&mut self) -> bool {
if !self.voice_transcription_enabled() {
return false;
}
if self.voice_state.voice.is_some() {
return false;
}
if self.voice_state.space_hold_started_at.is_some() {
if !self.voice_state.key_release_supported && !self.voice_state.space_hold_repeat_seen {
if let Some(id) = self.voice_state.space_hold_element_id.take() {
let _ = self.textarea.replace_element_by_id(&id, " ");
}
self.voice_state.space_hold_started_at = None;
self.voice_state.space_hold_trigger = None;
self.voice_state.space_hold_repeat_seen = false;
return true;
}
// Preserve the typed space when transitioning into voice capture, but
// avoid duplicating an existing trailing space. In either case,
// convert/remove the temporary named element before inserting the
// recording/transcribing placeholder.
if let Some(id) = self.voice_state.space_hold_element_id.take() {
let replacement = if self
.textarea
.named_element_range(&id)
.and_then(|range| self.textarea.text()[..range.start].chars().next_back())
.is_some_and(|ch| ch == ' ')
{
""
} else {
" "
};
let _ = self.textarea.replace_element_by_id(&id, replacement);
}
// Clear pending state before starting capture
self.voice_state.space_hold_started_at = None;
self.voice_state.space_hold_trigger = None;
self.voice_state.space_hold_repeat_seen = false;
// Start voice capture
self.start_recording_with_placeholder()
} else {
false
}
}
/// Stop recording if active, update the placeholder, and spawn background transcription.
/// Returns true if the UI should redraw.
fn stop_recording_and_start_transcription(&mut self) -> bool {
let Some(vc) = self.voice_state.voice.take() else {
return false;
};
self.voice_state.space_recording_started_at = None;
self.voice_state.space_recording_last_repeat_at = None;
match vc.stop() {
Ok(audio) => {
// If the recording is too short, remove the placeholder immediately
// and skip the transcribing state entirely.
let total_samples = audio.data.len() as f32;
let samples_per_second = (audio.sample_rate as f32) * (audio.channels as f32);
let duration_seconds = if samples_per_second > 0.0 {
total_samples / samples_per_second
} else {
0.0
};
const MIN_DURATION_SECONDS: f32 = 1.0;
if duration_seconds < MIN_DURATION_SECONDS {
if let Some(id) = self.voice_state.recording_placeholder_id.take() {
let _ = self.textarea.replace_element_by_id(&id, "");
}
return true;
}
// Otherwise, update the placeholder to show a spinner and proceed.
let id = match self.voice_state.recording_placeholder_id.take() {
Some(id) => id,
None => self.next_id(),
};
let placeholder_range = self.textarea.named_element_range(&id);
let prompt_source = if let Some(range) = &placeholder_range {
self.textarea.text()[..range.start].to_string()
} else {
self.textarea.text().to_string()
};
// Initialize with first spinner frame immediately.
let _ = self.textarea.update_named_element_by_id(&id, "");
// Spawn animated braille spinner until transcription finishes (or times out).
self.spawn_transcribing_spinner(id.clone());
let tx = self.app_event_tx.clone();
crate::voice::transcribe_async(id, audio, Some(prompt_source), tx);
true
}
Err(e) => {
tracing::error!("failed to stop voice capture: {e}");
true
}
}
}
/// Start voice capture and insert a placeholder element for the live meter.
/// Returns true if recording began and UI should redraw; false on failure.
fn start_recording_with_placeholder(&mut self) -> bool {
match crate::voice::VoiceCapture::start() {
Ok(vc) => {
self.voice_state.voice = Some(vc);
if self.voice_state.key_release_supported {
self.voice_state.space_recording_started_at = None;
} else {
self.voice_state.space_recording_started_at = Some(Instant::now());
}
self.voice_state.space_recording_last_repeat_at = None;
// Insert visible placeholder for the meter (no label)
let id = self.next_id();
self.textarea.insert_named_element("", id.clone());
self.voice_state.recording_placeholder_id = Some(id);
// Spawn metering animation
if let Some(v) = &self.voice_state.voice {
let data = v.data_arc();
let stop = v.stopped_flag();
let sr = v.sample_rate();
let ch = v.channels();
let peak = v.last_peak_arc();
if let Some(idref) = &self.voice_state.recording_placeholder_id {
self.spawn_recording_meter(idref.clone(), sr, ch, data, peak, stop);
}
}
true
}
Err(e) => {
self.voice_state.space_recording_started_at = None;
self.voice_state.space_recording_last_repeat_at = None;
tracing::error!("failed to start voice capture: {e}");
false
}
}
}
fn spawn_recording_meter(
&self,
id: String,
_sample_rate: u32,
_channels: u16,
_data: Arc<Mutex<Vec<i16>>>,
last_peak: Arc<std::sync::atomic::AtomicU16>,
stop: Arc<std::sync::atomic::AtomicBool>,
) {
let tx = self.app_event_tx.clone();
let task = move || {
use std::time::Duration;
let mut meter = crate::voice::RecordingMeterState::new();
loop {
if stop.load(Ordering::Relaxed) {
break;
}
let text = meter.next_text(last_peak.load(Ordering::Relaxed));
tx.send(crate::app_event::AppEvent::UpdateRecordingMeter {
id: id.clone(),
text,
});
thread::sleep(Duration::from_millis(100));
}
};
if let Ok(handle) = Handle::try_current() {
handle.spawn_blocking(task);
} else {
thread::spawn(task);
}
}
/// Starts a spinner animation on the placeholder element `id` while
/// transcription is in flight.
///
/// Any previous spinner for the same id is stopped first, and the new stop
/// flag is registered in `spinner_stop_flags` so it can be cancelled later
/// by `stop_transcription_spinner` / `stop_all_transcription_spinners`.
fn spawn_transcribing_spinner(&mut self, id: String) {
    self.stop_transcription_spinner(&id);
    let stop = Arc::new(AtomicBool::new(false));
    self.spinner_stop_flags
        .insert(id.clone(), Arc::clone(&stop));
    let tx = self.app_event_tx.clone();
    let task = move || {
        use std::time::Duration;
        // Spinner frames cycled at 10fps.
        let frames: Vec<&'static str> = vec!["", "", "", "", "", "", "", "", "", ""];
        let mut i: usize = 0;
        // Safety stop after ~60s to avoid a runaway task if events are lost.
        let max_ticks = 600usize; // 600 * 100ms = 60s
        for _ in 0..max_ticks {
            if stop.load(Ordering::Relaxed) {
                break;
            }
            let text = frames[i % frames.len()].to_string();
            tx.send(crate::app_event::AppEvent::UpdateRecordingMeter {
                id: id.clone(),
                text,
            });
            i = i.wrapping_add(1);
            thread::sleep(Duration::from_millis(100));
        }
    };
    // Use the tokio blocking pool when available; otherwise a bare thread.
    if let Ok(handle) = Handle::try_current() {
        handle.spawn_blocking(task);
    } else {
        thread::spawn(task);
    }
}
/// Signals the spinner task for `id` (if any) to exit and drops its flag
/// from the registry.
fn stop_transcription_spinner(&mut self, id: &str) {
    if let Some(flag) = self.spinner_stop_flags.remove(id) {
        flag.store(true, Ordering::Relaxed);
    }
}
/// Signals every outstanding spinner task to exit and empties the registry.
fn stop_all_transcription_spinners(&mut self) {
    self.spinner_stop_flags
        .drain()
        .for_each(|(_, flag)| flag.store(true, Ordering::Relaxed));
}
/// Replaces the voice placeholder `id` with the final transcription `text`,
/// stopping its spinner first. The element entry is removed, so the new
/// text becomes plain, editable composer content.
pub fn replace_transcription(&mut self, id: &str, text: &str) {
    self.stop_transcription_spinner(id);
    let _ = self.textarea.replace_element_by_id(id, text);
}
/// Updates the placeholder's text in place while keeping its element id, so
/// later events (e.g. the final transcript) can still address it.
/// Returns `false` when no element with `id` exists.
pub fn update_transcription_in_place(&mut self, id: &str, text: &str) -> bool {
    self.textarea.update_named_element_by_id(id, text)
}
/// Removes the voice placeholder `id` entirely (used when transcription
/// fails or is cancelled), stopping its spinner first.
pub fn remove_transcription_placeholder(&mut self, id: &str) {
    self.stop_transcription_spinner(id);
    let _ = self.textarea.replace_element_by_id(id, "");
}
}
fn skill_display_name(skill: &SkillMetadata) -> &str {
skill
.interface
@ -3787,6 +4372,15 @@ fn prompt_selection_action(
}
}
impl Drop for ChatComposer {
fn drop(&mut self) {
// Stop any running spinner tasks.
for (_id, flag) in self.spinner_stop_flags.drain() {
flag.store(true, Ordering::Relaxed);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
@ -5527,11 +6121,19 @@ mod tests {
/// Types `chars` one at a time with the paste-burst flush delay between
/// keys, mimicking human typing. For a space, also sends a key Release so
/// the composer's hold-to-talk handling sees a full press/release cycle.
fn type_chars_humanlike(composer: &mut ChatComposer, chars: &[char]) {
    use crossterm::event::KeyCode;
    use crossterm::event::KeyEvent;
    use crossterm::event::KeyEventKind;
    use crossterm::event::KeyModifiers;
    for &ch in chars {
        let _ = composer.handle_key_event(KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE));
        std::thread::sleep(ChatComposer::recommended_paste_flush_delay());
        let _ = composer.flush_paste_burst_if_due();
        if ch == ' ' {
            let _ = composer.handle_key_event(KeyEvent::new_with_kind(
                KeyCode::Char(' '),
                KeyModifiers::NONE,
                KeyEventKind::Release,
            ));
        }
    }
}
@ -5621,6 +6223,195 @@ mod tests {
assert!(found_error, "expected error history cell to be sent");
}
// With voice transcription disabled, a space press/release must behave as
// ordinary text input and leave no hold-to-talk state behind.
#[test]
fn voice_transcription_disabled_treats_space_as_normal_input() {
    use crossterm::event::KeyCode;
    use crossterm::event::KeyEvent;
    use crossterm::event::KeyEventKind;
    use crossterm::event::KeyModifiers;
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        true,
        sender,
        false,
        "Ask Codex to do anything".to_string(),
        true,
    );
    composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
    composer.move_cursor_to_end();
    let _ = composer.handle_key_event(KeyEvent::new(KeyCode::Char(' '), KeyModifiers::NONE));
    let _ = composer.handle_key_event(KeyEvent::new_with_kind(
        KeyCode::Char(' '),
        KeyModifiers::NONE,
        KeyEventKind::Release,
    ));
    // The space is inserted literally...
    assert_eq!("x ", composer.textarea.text());
    // ...and no hold-to-talk bookkeeping was started.
    assert!(composer.voice_state.space_hold_started_at.is_none());
    assert!(composer.voice_state.space_hold_element_id.is_none());
    assert!(composer.voice_state.space_hold_trigger.is_none());
    assert!(!composer.voice_state.space_hold_repeat_seen);
}
// Terminal cannot report key releases and the hold timer fired without any
// key repeat: treat the press as a normal tap and keep the typed space.
#[cfg(not(target_os = "linux"))]
#[test]
fn space_hold_timeout_without_release_or_repeat_keeps_typed_space() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        true,
        sender,
        false,
        "Ask Codex to do anything".to_string(),
        false,
    );
    composer.set_voice_transcription_enabled(true);
    composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
    composer.move_cursor_to_end();
    // Simulate the state the composer enters on space-down.
    let elem_id = "space-hold".to_string();
    composer.textarea.insert_named_element(" ", elem_id.clone());
    composer.voice_state.space_hold_started_at = Some(Instant::now());
    composer.voice_state.space_hold_element_id = Some(elem_id);
    composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
    composer.voice_state.key_release_supported = false;
    composer.voice_state.space_hold_repeat_seen = false;
    assert_eq!("x ", composer.textarea.text());
    composer.process_space_hold_trigger();
    // The typed space remains and the hold state is cleared.
    assert_eq!("x ", composer.textarea.text());
    assert!(composer.voice_state.space_hold_started_at.is_none());
    assert!(!composer.voice_state.space_hold_repeat_seen);
}
// A key repeat was observed before the timer fired (space physically held
// down): the timeout should take the hold-to-talk path even though no
// Release event will ever arrive.
#[cfg(not(target_os = "linux"))]
#[test]
fn space_hold_timeout_with_repeat_uses_hold_path_without_release() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        true,
        sender,
        false,
        "Ask Codex to do anything".to_string(),
        false,
    );
    composer.set_voice_transcription_enabled(true);
    composer.set_text_content("x".to_string(), Vec::new(), Vec::new());
    composer.move_cursor_to_end();
    // Simulate the state the composer enters on space-down, with a repeat seen.
    let elem_id = "space-hold".to_string();
    composer.textarea.insert_named_element(" ", elem_id.clone());
    composer.voice_state.space_hold_started_at = Some(Instant::now());
    composer.voice_state.space_hold_element_id = Some(elem_id);
    composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
    composer.voice_state.key_release_supported = false;
    composer.voice_state.space_hold_repeat_seen = true;
    composer.process_space_hold_trigger();
    assert_eq!("x ", composer.textarea.text());
    assert!(composer.voice_state.space_hold_started_at.is_none());
    assert!(!composer.voice_state.space_hold_repeat_seen);
    // Recording may genuinely start if the machine has a mic; clean up.
    if composer.is_recording() {
        let _ = composer.stop_recording_and_start_transcription();
    }
}
// Hold path with the draft already ending in a space: the placeholder space
// must be collapsed rather than producing "x  " (double space).
#[cfg(not(target_os = "linux"))]
#[test]
fn space_hold_timeout_with_repeat_does_not_duplicate_existing_space() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        true,
        sender,
        false,
        "Ask Codex to do anything".to_string(),
        false,
    );
    composer.set_voice_transcription_enabled(true);
    // Draft already ends with a space before the hold begins.
    composer.set_text_content("x ".to_string(), Vec::new(), Vec::new());
    composer.move_cursor_to_end();
    let elem_id = "space-hold".to_string();
    composer.textarea.insert_named_element(" ", elem_id.clone());
    composer.voice_state.space_hold_started_at = Some(Instant::now());
    composer.voice_state.space_hold_element_id = Some(elem_id);
    composer.voice_state.space_hold_trigger = Some(Arc::new(AtomicBool::new(true)));
    composer.voice_state.key_release_supported = false;
    composer.voice_state.space_hold_repeat_seen = true;
    composer.process_space_hold_trigger();
    assert_eq!("x ", composer.textarea.text());
    assert!(composer.voice_state.space_hold_started_at.is_none());
    assert!(!composer.voice_state.space_hold_repeat_seen);
    // Recording may genuinely start if the machine has a mic; clean up.
    if composer.is_recording() {
        let _ = composer.stop_recording_and_start_transcription();
    }
}
// replace_transcription must set the placeholder's spinner stop flag, drop
// it from the registry, and swap the placeholder text for the final text.
#[cfg(not(target_os = "linux"))]
#[test]
fn replace_transcription_stops_spinner_for_placeholder() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        true,
        sender,
        false,
        "Ask Codex to do anything".to_string(),
        false,
    );
    let id = "voice-placeholder".to_string();
    composer.textarea.insert_named_element("", id.clone());
    let flag = Arc::new(AtomicBool::new(false));
    composer
        .spinner_stop_flags
        .insert(id.clone(), Arc::clone(&flag));
    composer.replace_transcription(&id, "transcribed text");
    assert!(flag.load(Ordering::Relaxed));
    assert!(!composer.spinner_stop_flags.contains_key(&id));
    assert_eq!(composer.textarea.text(), "transcribed text");
}
// Replacing the composer draft wholesale must cancel every registered
// transcription spinner, not just one.
#[cfg(not(target_os = "linux"))]
#[test]
fn set_text_content_stops_all_transcription_spinners() {
    let (tx, _rx) = unbounded_channel::<AppEvent>();
    let sender = AppEventSender::new(tx);
    let mut composer = ChatComposer::new(
        true,
        sender,
        false,
        "Ask Codex to do anything".to_string(),
        false,
    );
    let flag_one = Arc::new(AtomicBool::new(false));
    let flag_two = Arc::new(AtomicBool::new(false));
    composer
        .spinner_stop_flags
        .insert("voice-1".to_string(), Arc::clone(&flag_one));
    composer
        .spinner_stop_flags
        .insert("voice-2".to_string(), Arc::clone(&flag_two));
    composer.set_text_content("draft".to_string(), Vec::new(), Vec::new());
    assert!(flag_one.load(Ordering::Relaxed));
    assert!(flag_two.load(Ordering::Relaxed));
    assert!(composer.spinner_stop_flags.is_empty());
}
#[test]
fn extract_args_supports_quoted_paths_single_arg() {
let args = extract_positional_args_for_prompt_line(

View file

@ -33,6 +33,7 @@ use codex_protocol::request_user_input::RequestUserInputEvent;
use codex_protocol::user_input::TextElement;
use crossterm::event::KeyCode;
use crossterm::event::KeyEvent;
use crossterm::event::KeyEventKind;
use ratatui::buffer::Buffer;
use ratatui::layout::Rect;
use ratatui::text::Line;
@ -204,8 +205,8 @@ impl BottomPane {
placeholder_text,
disable_paste_burst,
);
composer.set_frame_requester(frame_requester.clone());
composer.set_skill_mentions(skills);
Self {
composer,
view_stack: Vec::new(),
@ -291,6 +292,11 @@ impl BottomPane {
self.request_redraw();
}
/// Toggles hold-space voice transcription in the composer and repaints.
pub fn set_voice_transcription_enabled(&mut self, enabled: bool) {
    self.composer.set_voice_transcription_enabled(enabled);
    self.request_redraw();
}
/// Update the key hint shown next to queued messages so it matches the
/// binding that `ChatWidget` actually listens for.
pub(crate) fn set_queued_message_edit_binding(&mut self, binding: KeyBinding) {
@ -327,8 +333,23 @@ impl BottomPane {
/// Forward a key event to the active view or the composer.
pub fn handle_key_event(&mut self, key_event: KeyEvent) -> InputResult {
// Do not globally intercept space; only composer handles hold-to-talk.
// While recording, route all keys to the composer so it can stop on release or next key.
#[cfg(not(target_os = "linux"))]
if self.composer.is_recording() {
let (_ir, needs_redraw) = self.composer.handle_key_event(key_event);
if needs_redraw {
self.request_redraw();
}
return InputResult::None;
}
// If a modal/view is active, handle it here; otherwise forward to composer.
if !self.view_stack.is_empty() {
if key_event.kind == KeyEventKind::Release {
return InputResult::None;
}
// We need three pieces of information after routing the key:
// whether Esc completed the view, whether the view finished for any
// reason, and whether a paste-burst timer should be scheduled.
@ -432,6 +453,7 @@ impl BottomPane {
}
} else {
let needs_redraw = self.composer.handle_paste(pasted);
self.composer.sync_popups();
if needs_redraw {
self.request_redraw();
}
@ -440,9 +462,18 @@ impl BottomPane {
/// Inserts `text` at the composer cursor, refreshes any open popups, and
/// requests a repaint.
pub(crate) fn insert_str(&mut self, text: &str) {
    self.composer.insert_str(text);
    self.composer.sync_popups();
    self.request_redraw();
}
// Space hold timeout is handled inside ChatComposer via an internal timer.
/// Runs time-based composer transitions just before drawing a frame, then
/// refreshes popups to match the resulting state.
pub(crate) fn pre_draw_tick(&mut self) {
    // Allow composer to process any time-based transitions before drawing
    // (voice input, and so the space-hold trigger, is not built on Linux).
    #[cfg(not(target_os = "linux"))]
    self.composer.process_space_hold_trigger();
    self.composer.sync_popups();
}
/// Replace the composer text with `text`.
///
/// This is intended for fresh input where mention linkage does not need to
@ -895,6 +926,7 @@ impl BottomPane {
.on_history_entry_response(log_id, offset, entry);
if updated {
self.composer.sync_popups();
self.request_redraw();
}
}
@ -973,6 +1005,30 @@ impl BottomPane {
}
}
// Voice-transcription plumbing: thin wrappers that forward placeholder
// updates to the composer and keep popups/redraws in sync. Voice input is
// not built on Linux, hence the cfg gate.
#[cfg(not(target_os = "linux"))]
impl BottomPane {
    /// Replaces placeholder `id` with the final transcription text.
    pub(crate) fn replace_transcription(&mut self, id: &str, text: &str) {
        self.composer.replace_transcription(id, text);
        self.composer.sync_popups();
        self.request_redraw();
    }

    /// Updates placeholder `id` in place; returns whether it existed.
    /// Popups/redraw are only refreshed when something actually changed.
    pub(crate) fn update_transcription_in_place(&mut self, id: &str, text: &str) -> bool {
        let updated = self.composer.update_transcription_in_place(id, text);
        if updated {
            self.composer.sync_popups();
            self.request_redraw();
        }
        updated
    }

    /// Removes placeholder `id` (transcription failed or was cancelled).
    pub(crate) fn remove_transcription_placeholder(&mut self, id: &str) {
        self.composer.remove_transcription_placeholder(id);
        self.composer.sync_popups();
        self.request_redraw();
    }
}
impl Renderable for BottomPane {
fn render(&self, area: Rect, buf: &mut Buffer) {
self.as_renderable().render(area, buf);
@ -993,6 +1049,7 @@ mod tests {
use crate::status_indicator_widget::StatusDetailsCapitalization;
use codex_protocol::protocol::Op;
use codex_protocol::protocol::SkillScope;
use crossterm::event::KeyEventKind;
use crossterm::event::KeyModifiers;
use insta::assert_snapshot;
use ratatui::buffer::Buffer;
@ -1571,4 +1628,58 @@ mod tests {
assert_eq!(on_ctrl_c_calls.get(), 0);
assert_eq!(handle_calls.get(), 1);
}
// A modal view must receive key presses but never Release events; handling
// releases would double-trigger actions in terminals that report them.
#[test]
fn release_events_are_ignored_for_active_view() {
    // Minimal view that just counts how many key events reach it.
    #[derive(Default)]
    struct CountingView {
        handle_calls: Rc<Cell<usize>>,
    }
    impl Renderable for CountingView {
        fn render(&self, _area: Rect, _buf: &mut Buffer) {}
        fn desired_height(&self, _width: u16) -> u16 {
            0
        }
    }
    impl BottomPaneView for CountingView {
        fn handle_key_event(&mut self, _key_event: KeyEvent) {
            self.handle_calls
                .set(self.handle_calls.get().saturating_add(1));
        }
    }
    let (tx_raw, _rx) = unbounded_channel::<AppEvent>();
    let tx = AppEventSender::new(tx_raw);
    let mut pane = BottomPane::new(BottomPaneParams {
        app_event_tx: tx,
        frame_requester: FrameRequester::test_dummy(),
        has_input_focus: true,
        enhanced_keys_supported: false,
        placeholder_text: "Ask Codex to do anything".to_string(),
        disable_paste_burst: false,
        animations_enabled: true,
        skills: Some(Vec::new()),
    });
    let handle_calls = Rc::new(Cell::new(0));
    pane.push_view(Box::new(CountingView {
        handle_calls: Rc::clone(&handle_calls),
    }));
    // The Press reaches the view; the Release is swallowed by the pane.
    pane.handle_key_event(KeyEvent::new_with_kind(
        KeyCode::Down,
        KeyModifiers::NONE,
        KeyEventKind::Press,
    ));
    pane.handle_key_event(KeyEvent::new_with_kind(
        KeyCode::Down,
        KeyModifiers::NONE,
        KeyEventKind::Release,
    ));
    assert_eq!(handle_calls.get(), 1);
}
}

View file

@ -3,6 +3,7 @@ use codex_protocol::user_input::ByteRange;
use codex_protocol::user_input::TextElement as UserTextElement;
use crossterm::event::KeyCode;
use crossterm::event::KeyEvent;
use crossterm::event::KeyEventKind;
use crossterm::event::KeyModifiers;
use ratatui::buffer::Buffer;
use ratatui::layout::Rect;
@ -27,6 +28,7 @@ fn is_word_separator(ch: char) -> bool {
struct TextElement {
id: u64,
range: Range<usize>,
name: Option<String>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
@ -101,6 +103,7 @@ impl TextArea {
self.elements.push(TextElement {
id,
range: start..end,
name: None,
});
}
self.elements.sort_by_key(|e| e.range.start);
@ -256,6 +259,11 @@ impl TextArea {
}
pub fn input(&mut self, event: KeyEvent) {
// Only process key presses or repeats; ignore releases to avoid inserting
// characters on key-up events when modifiers are no longer reported.
if !matches!(event.kind, KeyEventKind::Press | KeyEventKind::Repeat) {
return;
}
match event {
// Some terminals (or configurations) send Control key chords as
// C0 control characters without reporting the CONTROL modifier.
@ -886,6 +894,73 @@ impl TextArea {
id
}
/// Inserts `text` at the cursor as an atomic element tagged with the string
/// `id`, so it can later be looked up and replaced by name (used for voice
/// recording/transcription placeholders).
#[cfg(not(target_os = "linux"))]
pub fn insert_named_element(&mut self, text: &str, id: String) {
    let start = self.clamp_pos_for_insertion(self.cursor_pos);
    self.insert_str_at(start, text);
    let end = start + text.len();
    self.add_element_with_id(start..end, Some(id));
    // Place cursor at end of inserted element
    self.set_cursor(end);
}
/// Replaces the text of the element named `id` and drops its element entry,
/// turning the replacement into plain text. Returns `false` when no element
/// with that name exists.
///
/// NOTE(review): if several elements share a name, only the first one's
/// text is replaced but `retain` removes every entry with that name —
/// confirm names are unique per textarea.
pub fn replace_element_by_id(&mut self, id: &str, text: &str) -> bool {
    if let Some(idx) = self
        .elements
        .iter()
        .position(|e| e.name.as_deref() == Some(id))
    {
        let range = self.elements[idx].range.clone();
        self.replace_range_raw(range, text);
        self.elements.retain(|e| e.name.as_deref() != Some(id));
        true
    } else {
        false
    }
}
/// Update the element's text in place, preserving its id so callers can
/// update it again later (e.g. recording -> transcribing -> final).
/// Returns `false` when no element named `id` exists.
#[allow(dead_code)]
pub fn update_named_element_by_id(&mut self, id: &str, text: &str) -> bool {
    if let Some(elem_idx) = self
        .elements
        .iter()
        .position(|e| e.name.as_deref() == Some(id))
    {
        let old_range = self.elements[elem_idx].range.clone();
        let start = old_range.start;
        self.replace_range_raw(old_range, text);
        // After replace_range_raw, the old element entry was removed if fully overlapped.
        // Re-add an updated element with the same id and new range.
        let new_end = start + text.len();
        self.add_element_with_id(start..new_end, Some(id.to_string()));
        true
    } else {
        false
    }
}
/// Returns the byte range currently occupied by the element named `id`,
/// or `None` when no such element exists.
#[allow(dead_code)]
pub fn named_element_range(&self, id: &str) -> Option<std::ops::Range<usize>> {
    self.elements
        .iter()
        .find_map(|e| (e.name.as_deref() == Some(id)).then(|| e.range.clone()))
}
/// Registers `range` as an atomic element, optionally tagged with a `name`
/// for later lookup by string id. Elements are kept sorted by start offset.
/// Returns the element's numeric id.
fn add_element_with_id(&mut self, range: Range<usize>, name: Option<String>) -> u64 {
    let id = self.next_element_id();
    let elem = TextElement { id, range, name };
    self.elements.push(elem);
    self.elements.sort_by_key(|e| e.range.start);
    id
}
/// Registers an unnamed atomic element covering `range`.
fn add_element(&mut self, range: Range<usize>) -> u64 {
    self.add_element_with_id(range, None)
}
/// Mark an existing text range as an atomic element without changing the text.
///
/// This is used to convert already-typed tokens (like `/plan`) into elements
@ -910,12 +985,7 @@ impl TextArea {
{
return None;
}
let id = self.next_element_id();
self.elements.push(TextElement {
id,
range: start..end,
});
self.elements.sort_by_key(|e| e.range.start);
let id = self.add_element(start..end);
Some(id)
}
@ -931,20 +1001,11 @@ impl TextArea {
len_before != self.elements.len()
}
fn add_element(&mut self, range: Range<usize>) -> u64 {
let id = self.next_element_id();
let elem = TextElement { id, range };
self.elements.push(elem);
self.elements.sort_by_key(|e| e.range.start);
id
}
/// Hands out the next monotonically increasing element id (saturating add,
/// so ids never wrap back to a previously issued value).
fn next_element_id(&mut self) -> u64 {
    let id = self.next_element_id;
    self.next_element_id = self.next_element_id.saturating_add(1);
    id
}
fn find_element_containing(&self, pos: usize) -> Option<usize> {
self.elements
.iter()

View file

@ -2259,6 +2259,10 @@ impl ChatWidget {
);
}
/// Forwarded to the bottom pane before each draw so time-based composer
/// state (e.g. the space-hold timeout) is processed.
pub(crate) fn pre_draw_tick(&mut self) {
    self.bottom_pane.pre_draw_tick();
}
/// Handle completion of an `AgentMessage` turn item.
///
/// Commentary completion sets a deferred restore flag so the status row
@ -2848,6 +2852,9 @@ impl ChatWidget {
widget
.bottom_pane
.set_steer_enabled(widget.config.features.enabled(Feature::Steer));
widget.bottom_pane.set_voice_transcription_enabled(
widget.config.features.enabled(Feature::VoiceTranscription),
);
widget
.bottom_pane
.set_status_line_enabled(!widget.configured_status_line_items().is_empty());
@ -3016,6 +3023,9 @@ impl ChatWidget {
widget
.bottom_pane
.set_steer_enabled(widget.config.features.enabled(Feature::Steer));
widget.bottom_pane.set_voice_transcription_enabled(
widget.config.features.enabled(Feature::VoiceTranscription),
);
widget
.bottom_pane
.set_status_line_enabled(!widget.configured_status_line_items().is_empty());
@ -3173,6 +3183,9 @@ impl ChatWidget {
widget
.bottom_pane
.set_steer_enabled(widget.config.features.enabled(Feature::Steer));
widget.bottom_pane.set_voice_transcription_enabled(
widget.config.features.enabled(Feature::VoiceTranscription),
);
widget
.bottom_pane
.set_status_line_enabled(!widget.configured_status_line_items().is_empty());
@ -6370,6 +6383,9 @@ impl ChatWidget {
if feature == Feature::Steer {
self.bottom_pane.set_steer_enabled(enabled);
}
if feature == Feature::VoiceTranscription {
self.bottom_pane.set_voice_transcription_enabled(enabled);
}
if feature == Feature::Personality {
self.sync_personality_command_enabled();
}
@ -7521,6 +7537,29 @@ impl ChatWidget {
}
}
// Voice-transcription forwarding (voice input is not built on Linux).
#[cfg(not(target_os = "linux"))]
impl ChatWidget {
    /// Replaces placeholder `id` with the final transcription text.
    pub(crate) fn replace_transcription(&mut self, id: &str, text: &str) {
        self.bottom_pane.replace_transcription(id, text);
        // Ensure the UI redraws to reflect the updated transcription.
        self.request_redraw();
    }

    /// Updates placeholder `id` in place; redraws only when it existed.
    pub(crate) fn update_transcription_in_place(&mut self, id: &str, text: &str) -> bool {
        let updated = self.bottom_pane.update_transcription_in_place(id, text);
        if updated {
            self.request_redraw();
        }
        updated
    }

    /// Removes placeholder `id` after a failed or cancelled transcription.
    pub(crate) fn remove_transcription_placeholder(&mut self, id: &str) {
        self.bottom_pane.remove_transcription_placeholder(id);
        // Ensure the UI redraws to reflect placeholder removal.
        self.request_redraw();
    }
}
fn has_websocket_timing_metrics(summary: RuntimeMetricsSummary) -> bool {
summary.responses_api_overhead_ms > 0
|| summary.responses_api_inference_time_ms > 0

View file

@ -114,7 +114,79 @@ pub mod update_action;
mod update_prompt;
mod updates;
mod version;
#[cfg(all(not(target_os = "linux"), feature = "voice-input"))]
mod voice;
#[cfg(all(not(target_os = "linux"), not(feature = "voice-input")))]
mod voice {
    //! Stub voice module used when the `voice-input` feature is disabled.
    //!
    //! Mirrors the real `voice` module's API so callers compile unchanged,
    //! but every entry point reports that voice input is unavailable.
    use crate::app_event::AppEvent;
    use crate::app_event_sender::AppEventSender;
    use std::sync::Arc;
    use std::sync::Mutex;
    use std::sync::atomic::AtomicBool;
    use std::sync::atomic::AtomicU16;

    /// Mirrors the real `RecordedAudio`; never actually produced here.
    pub struct RecordedAudio {
        pub data: Vec<i16>,
        pub sample_rate: u32,
        pub channels: u16,
    }

    /// Stub capture handle; `start` always fails, so no instance exists.
    pub struct VoiceCapture;

    /// Stub meter state that renders a constant "quiet" meter.
    pub(crate) struct RecordingMeterState;

    impl VoiceCapture {
        /// Always fails: this build has no audio capture support.
        pub fn start() -> Result<Self, String> {
            Err("voice input is unavailable in this build".to_string())
        }
        /// Unreachable in practice (no instance can exist); API parity only.
        pub fn stop(self) -> Result<RecordedAudio, String> {
            Err("voice input is unavailable in this build".to_string())
        }
        /// Returns an empty, disconnected buffer (API parity only).
        pub fn data_arc(&self) -> Arc<Mutex<Vec<i16>>> {
            Arc::new(Mutex::new(Vec::new()))
        }
        /// Reports "already stopped" so any meter task exits immediately.
        pub fn stopped_flag(&self) -> Arc<AtomicBool> {
            Arc::new(AtomicBool::new(true))
        }
        pub fn sample_rate(&self) -> u32 {
            0
        }
        pub fn channels(&self) -> u16 {
            0
        }
        pub fn last_peak_arc(&self) -> Arc<AtomicU16> {
            Arc::new(AtomicU16::new(0))
        }
    }

    impl RecordingMeterState {
        pub(crate) fn new() -> Self {
            Self
        }
        /// Always renders the flat/quiet 4-cell meter.
        pub(crate) fn next_text(&mut self, _peak: u16) -> String {
            "⠤⠤⠤⠤".to_string()
        }
    }

    /// Immediately reports failure instead of transcribing.
    pub fn transcribe_async(
        id: String,
        _audio: RecordedAudio,
        _context: Option<String>,
        tx: AppEventSender,
    ) {
        tx.send(AppEvent::TranscriptionFailed {
            id,
            error: "voice input is unavailable in this build".to_string(),
        });
    }
}
mod wrapping;
#[cfg(test)]

517
codex-rs/tui/src/voice.rs Normal file
View file

@ -0,0 +1,517 @@
use crate::app_event::AppEvent;
use crate::app_event_sender::AppEventSender;
use codex_core::auth::AuthCredentialsStoreMode;
use codex_core::config::Config;
use codex_core::config::find_codex_home;
use codex_core::default_client::get_codex_user_agent;
use codex_login::AuthMode;
use codex_login::CodexAuth;
use cpal::traits::DeviceTrait;
use cpal::traits::HostTrait;
use cpal::traits::StreamTrait;
use hound::SampleFormat;
use hound::WavSpec;
use hound::WavWriter;
use std::collections::VecDeque;
use std::io::Cursor;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU16;
use std::sync::atomic::Ordering;
use tracing::error;
use tracing::info;
use tracing::trace;
/// Resolved credential/endpoint info needed to authenticate a transcription
/// request, derived from the local Codex auth state.
struct TranscriptionAuthContext {
    // Which auth flow is active (ChatGPT vs API key); selects the endpoint.
    mode: AuthMode,
    // Token sent as the `Authorization: Bearer` header.
    bearer_token: String,
    // ChatGPT account id header value, when the ChatGPT flow provides one.
    chatgpt_account_id: Option<String>,
    // Normalized ChatGPT backend base URL (see `normalize_chatgpt_base_url`).
    chatgpt_base_url: String,
}
/// A finished recording: 16-bit PCM samples plus the stream parameters
/// needed to interpret and encode them.
pub struct RecordedAudio {
    // Captured samples (interleaved when `channels > 1`).
    pub data: Vec<i16>,
    pub sample_rate: u32,
    pub channels: u16,
}
/// An active microphone capture session. Dropping (or `stop`ping) the
/// contained cpal stream ends capture.
pub struct VoiceCapture {
    // Input stream; `None` after `stop`. Dropping it stops capture.
    stream: Option<cpal::Stream>,
    sample_rate: u32,
    channels: u16,
    // Samples appended by the audio callback as they arrive.
    data: Arc<Mutex<Vec<i16>>>,
    // Set on `stop` so metering tasks can exit.
    stopped: Arc<AtomicBool>,
    // Most recent per-buffer peak level, for the live UI meter.
    last_peak: Arc<AtomicU16>,
}
impl VoiceCapture {
    /// Opens the host's default input device and begins capturing
    /// immediately. Samples are appended (as i16) to the shared `data`
    /// buffer by the stream callback; `last_peak` tracks each buffer's peak
    /// level for the UI meter.
    pub fn start() -> Result<Self, String> {
        let (device, config) = select_input_device_and_config()?;
        let sample_rate = config.sample_rate().0;
        let channels = config.channels();
        let data: Arc<Mutex<Vec<i16>>> = Arc::new(Mutex::new(Vec::new()));
        let stopped = Arc::new(AtomicBool::new(false));
        let last_peak = Arc::new(AtomicU16::new(0));
        let stream = build_input_stream(&device, &config, data.clone(), last_peak.clone())?;
        stream
            .play()
            .map_err(|e| format!("failed to start input stream: {e}"))?;
        Ok(Self {
            stream: Some(stream),
            sample_rate,
            channels,
            data,
            stopped,
            last_peak,
        })
    }

    /// Ends capture and returns a snapshot of everything recorded so far.
    pub fn stop(mut self) -> Result<RecordedAudio, String> {
        // Mark stopped so any metering task can exit cleanly.
        self.stopped.store(true, Ordering::SeqCst);
        // Dropping the stream stops capture.
        self.stream.take();
        let data = self
            .data
            .lock()
            .map_err(|_| "failed to lock audio buffer".to_string())?
            .clone();
        Ok(RecordedAudio {
            data,
            sample_rate: self.sample_rate,
            channels: self.channels,
        })
    }

    /// Shared handle to the live sample buffer (filled by the callback).
    pub fn data_arc(&self) -> Arc<Mutex<Vec<i16>>> {
        self.data.clone()
    }

    /// Flag that flips to `true` once `stop` runs; metering tasks poll it.
    pub fn stopped_flag(&self) -> Arc<AtomicBool> {
        self.stopped.clone()
    }

    pub fn sample_rate(&self) -> u32 {
        self.sample_rate
    }

    pub fn channels(&self) -> u16 {
        self.channels
    }

    /// Shared handle to the most recent per-buffer peak (for the UI meter).
    pub fn last_peak_arc(&self) -> Arc<AtomicU16> {
        self.last_peak.clone()
    }
}
/// State for the 4-cell braille level meter shown while recording.
pub(crate) struct RecordingMeterState {
    // Last four rendered meter cells (fixed-width scrolling window).
    history: VecDeque<char>,
    // Slow EMA estimate of the ambient noise floor.
    noise_ema: f64,
    // Peak envelope (fast attack, slow release).
    env: f64,
}
impl RecordingMeterState {
    /// Creates a meter primed with four "quiet" cells so the rendered meter
    /// has a stable width from the very first frame.
    pub(crate) fn new() -> Self {
        Self {
            history: VecDeque::from(['⠤'; 4]),
            noise_ema: 0.02,
            env: 0.0,
        }
    }

    /// Converts the most recent peak sample into a 4-character braille
    /// meter string.
    ///
    /// The envelope follows peaks quickly (ATTACK) and decays slowly
    /// (RELEASE); a slow-moving noise-floor estimate (`noise_ema`) keeps the
    /// meter responsive relative to ambient level rather than absolute level.
    pub(crate) fn next_text(&mut self, peak: u16) -> String {
        const SYMBOLS: [char; 7] = ['⠤', '⠴', '⠶', '⠷', '⡷', '⡿', '⣿'];
        const ALPHA_NOISE: f64 = 0.05;
        const ATTACK: f64 = 0.80;
        const RELEASE: f64 = 0.25;
        let latest_peak = peak as f64 / (i16::MAX as f64);
        // Asymmetric envelope follower: fast attack, slow release.
        if latest_peak > self.env {
            self.env = ATTACK * latest_peak + (1.0 - ATTACK) * self.env;
        } else {
            self.env = RELEASE * latest_peak + (1.0 - RELEASE) * self.env;
        }
        let rms_approx = self.env * 0.7;
        // Track the ambient noise floor with a slow EMA; clamp the reference
        // so silence never causes a divide-by-~zero below.
        self.noise_ema = (1.0 - ALPHA_NOISE) * self.noise_ema + ALPHA_NOISE * rms_approx;
        let ref_level = self.noise_ema.max(0.01);
        let fast_signal = 0.8 * latest_peak + 0.2 * self.env;
        let target = 2.0f64;
        let raw = (fast_signal / (ref_level * target)).max(0.0);
        // Logarithmic compression so small signals still move the meter.
        let k = 1.6f64;
        let compressed = (raw.ln_1p() / k.ln_1p()).min(1.0);
        let idx = (compressed * (SYMBOLS.len() as f64 - 1.0))
            .round()
            .clamp(0.0, SYMBOLS.len() as f64 - 1.0) as usize;
        let level_char = SYMBOLS[idx];
        // Keep a fixed 4-cell scrolling window of recent levels.
        if self.history.len() >= 4 {
            self.history.pop_front();
        }
        self.history.push_back(level_char);
        // `collect` sizes the String correctly; braille cells are 3 UTF-8
        // bytes each, so the previous `String::with_capacity(4)` was
        // under-reserved and always reallocated.
        self.history.iter().collect()
    }
}
/// Transcribes `audio` on a dedicated OS thread, reporting the result as an
/// `AppEvent` (`TranscriptionComplete` or `TranscriptionFailed`) tagged with
/// the placeholder `id`.
///
/// `context` is forwarded as a transcription prompt where the API supports
/// it. Clips shorter than one second are rejected up front.
pub fn transcribe_async(
    id: String,
    audio: RecordedAudio,
    context: Option<String>,
    tx: AppEventSender,
) {
    std::thread::spawn(move || {
        // Enforce minimum duration to avoid garbage outputs.
        const MIN_DURATION_SECONDS: f32 = 1.0;
        let duration_seconds = clip_duration_seconds(&audio);
        if duration_seconds < MIN_DURATION_SECONDS {
            let msg = format!(
                "recording too short ({duration_seconds:.2}s); minimum is {MIN_DURATION_SECONDS:.2}s"
            );
            info!("{msg}");
            tx.send(AppEvent::TranscriptionFailed { id, error: msg });
            return;
        }
        // Encode entire clip as normalized WAV.
        let wav_bytes = match encode_wav_normalized(&audio) {
            Ok(b) => b,
            Err(e) => {
                error!("failed to encode wav: {e}");
                tx.send(AppEvent::TranscriptionFailed { id, error: e });
                return;
            }
        };
        // Run the HTTP request on a small, dedicated runtime.
        // NOTE(review): if runtime creation fails we log but send no
        // TranscriptionFailed event, so the placeholder never resolves —
        // confirm this path is acceptable.
        let rt = match tokio::runtime::Runtime::new() {
            Ok(rt) => rt,
            Err(e) => {
                error!("failed to create tokio runtime: {e}");
                return;
            }
        };
        let tx2 = tx.clone();
        let id2 = id.clone();
        let res: Result<String, String> = rt
            .block_on(async move { transcribe_bytes(wav_bytes, context, duration_seconds).await });
        match res {
            Ok(text) => {
                tx2.send(AppEvent::TranscriptionComplete { id: id2, text });
                info!("voice transcription succeeded");
            }
            Err(e) => {
                error!("voice transcription error: {e}");
                tx.send(AppEvent::TranscriptionFailed { id, error: e });
            }
        }
    });
}
// -------------------------
// Voice input helpers
// -------------------------
/// Picks the host's default input device and its default stream config,
/// with user-presentable error strings on failure.
fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
    let host = cpal::default_host();
    let device = host
        .default_input_device()
        .ok_or_else(|| "no input audio device available".to_string())?;
    let config = device
        .default_input_config()
        .map_err(|e| format!("failed to get default input config: {e}"))?;
    Ok((device, config))
}
/// Builds a cpal input stream for the device's sample format (F32/I16/U16),
/// converting every buffer to i16 into `data` and publishing each buffer's
/// peak level into `last_peak`. Other sample formats are rejected.
fn build_input_stream(
    device: &cpal::Device,
    config: &cpal::SupportedStreamConfig,
    data: Arc<Mutex<Vec<i16>>>,
    last_peak: Arc<AtomicU16>,
) -> Result<cpal::Stream, String> {
    match config.sample_format() {
        cpal::SampleFormat::F32 => device
            .build_input_stream(
                &config.clone().into(),
                move |input: &[f32], _| {
                    let peak = peak_f32(input);
                    last_peak.store(peak, Ordering::Relaxed);
                    // `if let Ok` skips a poisoned mutex instead of panicking
                    // inside the audio callback.
                    if let Ok(mut buf) = data.lock() {
                        for &s in input {
                            buf.push(f32_to_i16(s));
                        }
                    }
                },
                move |err| error!("audio input error: {err}"),
                None,
            )
            .map_err(|e| format!("failed to build input stream: {e}")),
        cpal::SampleFormat::I16 => device
            .build_input_stream(
                &config.clone().into(),
                move |input: &[i16], _| {
                    let peak = peak_i16(input);
                    last_peak.store(peak, Ordering::Relaxed);
                    if let Ok(mut buf) = data.lock() {
                        buf.extend_from_slice(input);
                    }
                },
                move |err| error!("audio input error: {err}"),
                None,
            )
            .map_err(|e| format!("failed to build input stream: {e}")),
        cpal::SampleFormat::U16 => device
            .build_input_stream(
                &config.clone().into(),
                move |input: &[u16], _| {
                    // NOTE(review): unlike the F32/I16 arms, the peak here is
                    // only stored when the lock succeeds — confirm that's
                    // intended rather than an oversight.
                    if let Ok(mut buf) = data.lock() {
                        let peak = convert_u16_to_i16_and_peak(input, &mut buf);
                        last_peak.store(peak, Ordering::Relaxed);
                    }
                },
                move |err| error!("audio input error: {err}"),
                None,
            )
            .map_err(|e| format!("failed to build input stream: {e}")),
        _ => Err("unsupported input sample format".to_string()),
    }
}
/// Maps an f32 sample's magnitude (clamped to [0, 1]) onto the u16 peak
/// scale (0..=32767) used by the level meter.
#[inline]
fn f32_abs_to_u16(x: f32) -> u16 {
    let magnitude = x.abs().min(1.0);
    let scaled = (magnitude * i16::MAX as f32) as i32;
    scaled.max(0) as u16
}
/// Converts one f32 sample (clamped to [-1, 1]) to a 16-bit PCM sample.
#[inline]
fn f32_to_i16(s: f32) -> i16 {
    let bounded = s.clamp(-1.0, 1.0);
    (bounded * i16::MAX as f32) as i16
}
/// Returns the largest absolute sample in `input`, scaled to the u16 peak
/// representation used by the level meter (0 for an empty buffer).
fn peak_f32(input: &[f32]) -> u16 {
    let max_abs = input.iter().fold(0.0_f32, |acc, &s| acc.max(s.abs()));
    f32_abs_to_u16(max_abs)
}
/// Largest absolute sample value in `input` (0 for an empty slice).
/// `i16::MIN` maps to 32768, which still fits in the u16 return type.
fn peak_i16(input: &[i16]) -> u16 {
    input
        .iter()
        .map(|&s| (s as i32).abs())
        .max()
        .unwrap_or(0) as u16
}
/// Converts unsigned 16-bit samples (midpoint 32768) to signed PCM,
/// appending them to `out`, and returns the chunk's peak magnitude.
fn convert_u16_to_i16_and_peak(input: &[u16], out: &mut Vec<i16>) -> u16 {
    let mut peak: i32 = 0;
    for &raw in input {
        // Shift the unsigned midpoint down to zero.
        let centered = (raw as i32 - 32768) as i16;
        peak = peak.max((centered as i32).abs());
        out.push(centered);
    }
    peak as u16
}
// -------------------------
// Transcription helpers
// -------------------------
/// Length of the clip in seconds, derived from the interleaved sample
/// count, sample rate, and channel count. Returns 0.0 when rate or channel
/// count is zero (avoids NaN from 0/0).
fn clip_duration_seconds(audio: &RecordedAudio) -> f32 {
    let sample_count = audio.data.len() as f32;
    let interleaved_rate = (audio.sample_rate as f32) * (audio.channels as f32);
    if interleaved_rate > 0.0 {
        sample_count / interleaved_rate
    } else {
        0.0
    }
}
/// Encodes the clip as 16-bit PCM WAV, applying simple peak normalization
/// to 90% of full scale so quiet recordings remain audible to the
/// transcription service. Returns the complete WAV file bytes.
fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
    let mut wav_bytes: Vec<u8> = Vec::new();
    let spec = WavSpec {
        channels: audio.channels,
        sample_rate: audio.sample_rate,
        bits_per_sample: 16,
        sample_format: SampleFormat::Int,
    };
    let mut cursor = Cursor::new(&mut wav_bytes);
    let mut writer =
        WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?;
    // Simple peak normalization with headroom to improve audibility on quiet inputs.
    let segment = &audio.data[..];
    // Track the signed sample with the largest magnitude.
    let mut peak: i16 = 0;
    for &s in segment {
        let a = s.unsigned_abs();
        if a > peak.unsigned_abs() {
            peak = s;
        }
    }
    let peak_abs = (peak as i32).unsigned_abs() as i32;
    let target = (i16::MAX as f32) * 0.9; // leave some headroom
    // A silent clip (peak 0) is written through unscaled.
    let gain: f32 = if peak_abs > 0 {
        target / (peak_abs as f32)
    } else {
        1.0
    };
    for &s in segment {
        let v = ((s as f32) * gain)
            .round()
            .clamp(i16::MIN as f32, i16::MAX as f32) as i16;
        writer
            .write_sample(v)
            .map_err(|_| "failed writing wav sample".to_string())?;
    }
    writer
        .finalize()
        .map_err(|_| "failed to finalize wav".to_string())?;
    Ok(wav_bytes)
}
/// Normalizes a configured ChatGPT base URL: strips trailing slashes and,
/// for known ChatGPT hosts, appends `/backend-api` when it is missing.
/// Non-ChatGPT URLs pass through (minus trailing slashes).
fn normalize_chatgpt_base_url(input: &str) -> String {
    let trimmed = input.trim_end_matches('/');
    let is_chatgpt_host = trimmed.starts_with("https://chatgpt.com")
        || trimmed.starts_with("https://chat.openai.com");
    if is_chatgpt_host && !trimmed.contains("/backend-api") {
        format!("{trimmed}/backend-api")
    } else {
        trimmed.to_string()
    }
}
/// Resolves local Codex credentials plus the (normalized) ChatGPT base URL
/// into a `TranscriptionAuthContext`. Fails with a user-actionable message
/// when no auth is configured.
async fn resolve_auth() -> Result<TranscriptionAuthContext, String> {
    let codex_home = find_codex_home().map_err(|e| format!("failed to find codex home: {e}"))?;
    let auth = CodexAuth::from_auth_storage(&codex_home, AuthCredentialsStoreMode::Auto)
        .map_err(|e| format!("failed to read auth.json: {e}"))?
        .ok_or_else(|| "No Codex auth is configured; please run `codex login`".to_string())?;
    let chatgpt_account_id = auth.get_account_id();
    let token = auth
        .get_token()
        .map_err(|e| format!("failed to get auth token: {e}"))?;
    let config = Config::load_with_cli_overrides(Vec::new())
        .await
        .map_err(|e| format!("failed to load config: {e}"))?;
    Ok(TranscriptionAuthContext {
        mode: auth.api_auth_mode(),
        bearer_token: token,
        chatgpt_account_id,
        chatgpt_base_url: normalize_chatgpt_base_url(&config.chatgpt_base_url),
    })
}
/// Sends recorded WAV audio to a transcription backend and returns the
/// recognized text.
///
/// Routing depends on the resolved auth mode:
/// - ChatGPT auth posts the audio to `{chatgpt_base_url}/transcribe`,
///   attaching the `ChatGPT-Account-Id` header when an account id is
///   known. NOTE(review): `context` is not forwarded on this path —
///   confirm the backend has no prompt parameter.
/// - API-key auth posts to the OpenAI `/v1/audio/transcriptions`
///   endpoint with the `gpt-4o-transcribe` model, passing `context` as
///   the `prompt` field when present.
///
/// `duration_seconds` is used only for trace logging. Errors (auth,
/// network, non-2xx HTTP, JSON parse, or an empty transcription) are
/// returned as human-readable strings.
async fn transcribe_bytes(
    wav_bytes: Vec<u8>,
    context: Option<String>,
    duration_seconds: f32,
) -> Result<String, String> {
    let auth = resolve_auth().await?;
    let client = reqwest::Client::new();
    let audio_bytes = wav_bytes.len();
    let prompt_for_log = context.as_deref().unwrap_or("").to_string();
    // Both backends take the audio as a multipart file part; build it once
    // instead of duplicating the construction in each branch.
    let part = reqwest::multipart::Part::bytes(wav_bytes)
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|e| format!("failed to set mime: {e}"))?;
    let (endpoint, request) =
        if matches!(auth.mode, AuthMode::Chatgpt | AuthMode::ChatgptAuthTokens) {
            let form = reqwest::multipart::Form::new().part("file", part);
            let endpoint = format!("{}/transcribe", auth.chatgpt_base_url);
            let mut req = client
                .post(&endpoint)
                .bearer_auth(&auth.bearer_token)
                .multipart(form)
                .header("User-Agent", get_codex_user_agent());
            if let Some(acc) = auth.chatgpt_account_id {
                req = req.header("ChatGPT-Account-Id", acc);
            }
            (endpoint, req)
        } else {
            let mut form = reqwest::multipart::Form::new()
                .text("model", "gpt-4o-transcribe")
                .part("file", part);
            if let Some(context) = context {
                form = form.text("prompt", context);
            }
            let endpoint = "https://api.openai.com/v1/audio/transcriptions".to_string();
            let req = client
                .post(&endpoint)
                .bearer_auth(&auth.bearer_token)
                .multipart(form)
                .header("User-Agent", get_codex_user_agent());
            (endpoint, req)
        };
    let audio_kib = audio_bytes as f32 / 1024.0;
    let mode = auth.mode;
    trace!(
        "sending transcription request: mode={mode:?} endpoint={endpoint} duration={duration_seconds:.2}s audio={audio_kib:.1}KiB prompt={prompt_for_log}"
    );
    let resp = request
        .send()
        .await
        .map_err(|e| format!("transcription request failed: {e}"))?;
    if !resp.status().is_success() {
        let status = resp.status();
        let body = resp
            .text()
            .await
            .unwrap_or_else(|_| "<failed to read body>".to_string());
        return Err(format!("transcription failed: {status} {body}"));
    }
    let v: serde_json::Value = resp
        .json()
        .await
        .map_err(|e| format!("failed to parse json: {e}"))?;
    let text = v
        .get("text")
        .and_then(|t| t.as_str())
        .unwrap_or("")
        .to_string();
    if text.is_empty() {
        Err("empty transcription result".to_string())
    } else {
        Ok(text)
    }
}

View file

@ -71,15 +71,3 @@ index 6ffc9f7..e02089a 100644
],
}),
)
diff --git a/toolchain/llvm/llvm.bzl b/toolchain/llvm/llvm.bzl
index d068085..c152552 100644
--- a/toolchain/llvm/llvm.bzl
+++ b/toolchain/llvm/llvm.bzl
@@ -7,6 +7,7 @@ def declare_llvm_targets(*, suffix = ""):
name = "builtin_headers",
# Grab whichever version-specific dir is there.
path = native.glob(["lib/clang/*"], exclude_directories = 0)[0] + "/include",
+ visibility = ["//visibility:public"],
)
# Convenient exports