From a0e86c69fe35e8ccfbed7ed87f07914d467ec488 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim <aibrahim@openai.com>
Date: Thu, 26 Feb 2026 15:08:21 -0800
Subject: [PATCH] Add realtime audio device config (#12849)

## Summary
- add a top-level `[audio]` config table for realtime microphone and speaker
selection
- apply configured devices when starting realtime capture and playback
- keep the system-default fallback: a missing configured device logs a warning and uses the default device

## Validation
- just write-config-schema
- cargo test -p codex-core realtime_audio
- cargo test -p codex-tui
- just fix -p codex-core
- just fix -p codex-tui
- just fmt

---------

Co-authored-by: Codex <noreply@openai.com>
---
 codex-rs/core/config.schema.json        |  21 ++++
 codex-rs/core/src/config/edit.rs        |  68 +++++++++++++
 codex-rs/core/src/config/mod.rs         |  63 ++++++++++++
 codex-rs/tui/src/audio_device.rs        | 122 ++++++++++++++++++++++++
 codex-rs/tui/src/chatwidget/realtime.rs |   9 +-
 codex-rs/tui/src/lib.rs                 |   7 +-
 codex-rs/tui/src/voice.rs               |  26 ++---
 7 files changed, 299 insertions(+), 17 deletions(-)
 create mode 100644 codex-rs/tui/src/audio_device.rs

diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json
index b9986e65f..01ac97ef2 100644
--- a/codex-rs/core/config.schema.json
+++ b/codex-rs/core/config.schema.json
@@ -1179,6 +1179,18 @@
       },
       "type": "object"
     },
+    "RealtimeAudioToml": {
+      "additionalProperties": false,
+      "properties": {
+        "microphone": {
+          "type": "string"
+        },
+        "speaker": {
+          "type": "string"
+        }
+      },
+      "type": "object"
+    },
     "ReasoningEffort": {
       "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning",
       "enum": [
@@ -1540,6 +1552,15 @@
       "default": null,
       "description": "Settings for app-specific controls."
     },
+    "audio": {
+      "allOf": [
+        {
+          "$ref": "#/definitions/RealtimeAudioToml"
+        }
+      ],
+      "default": null,
+      "description": "Machine-local realtime audio device preferences used by realtime voice."
+    },
     "background_terminal_max_timeout": {
       "description": "Maximum poll window for background terminal output (`write_stdin`), in milliseconds. Default: `300000` (5 minutes).",
       "format": "uint64",
diff --git a/codex-rs/core/src/config/edit.rs b/codex-rs/core/src/config/edit.rs
index fceb96599..ad404a9f8 100644
--- a/codex-rs/core/src/config/edit.rs
+++ b/codex-rs/core/src/config/edit.rs
@@ -839,6 +839,30 @@ impl ConfigEditsBuilder {
         self
     }
 
+    pub fn set_realtime_microphone(mut self, microphone: Option<&str>) -> Self { // builder step for the `audio.microphone` key
+        let segments = vec!["audio".to_string(), "microphone".to_string()]; // config path: audio -> microphone
+        match microphone {
+            Some(microphone) => self.edits.push(ConfigEdit::SetPath { // Some: queue an edit that stores the device name
+                segments,
+                value: value(microphone),
+            }),
+            None => self.edits.push(ConfigEdit::ClearPath { segments }), // None: queue an edit that clears the key
+        }
+        self // hand the builder back so calls can be chained
+    }
+
+    pub fn set_realtime_speaker(mut self, speaker: Option<&str>) -> Self { // builder step for the `audio.speaker` key
+        let segments = vec!["audio".to_string(), "speaker".to_string()]; // config path: audio -> speaker
+        match speaker {
+            Some(speaker) => self.edits.push(ConfigEdit::SetPath { // Some: queue an edit that stores the device name
+                segments,
+                value: value(speaker),
+            }),
+            None => self.edits.push(ConfigEdit::ClearPath { segments }), // None: queue an edit that clears the key
+        }
+        self // hand the builder back so calls can be chained
+    }
+
     pub fn clear_legacy_windows_sandbox_keys(mut self) -> Self {
         for key in [
             "experimental_windows_sandbox",
@@ -1804,6 +1828,50 @@ model_reasoning_effort = "high"
         assert_eq!(notice, Some(true));
     }
 
+    #[test]
+    fn blocking_builder_set_realtime_audio_persists_and_clears() { // round-trips the audio setters through the config file on disk
+        let tmp = tempdir().expect("tmpdir");
+        let codex_home = tmp.path();
+
+        ConfigEditsBuilder::new(codex_home) // first pass: persist both device names
+            .set_realtime_microphone(Some("USB Mic"))
+            .set_realtime_speaker(Some("Desk Speakers"))
+            .apply_blocking()
+            .expect("persist realtime audio");
+
+        let raw = std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config");
+        let config: TomlValue = toml::from_str(&raw).expect("parse config"); // re-parse the written file as generic TOML
+        let realtime_audio = config
+            .get("audio")
+            .and_then(TomlValue::as_table)
+            .expect("audio table should exist");
+        assert_eq!(
+            realtime_audio.get("microphone").and_then(TomlValue::as_str),
+            Some("USB Mic")
+        );
+        assert_eq!(
+            realtime_audio.get("speaker").and_then(TomlValue::as_str),
+            Some("Desk Speakers")
+        );
+
+        ConfigEditsBuilder::new(codex_home) // second pass: clear only the microphone
+            .set_realtime_microphone(None)
+            .apply_blocking()
+            .expect("clear realtime microphone");
+
+        let raw = std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config");
+        let config: TomlValue = toml::from_str(&raw).expect("parse config");
+        let realtime_audio = config
+            .get("audio")
+            .and_then(TomlValue::as_table)
+            .expect("audio table should exist"); // the table is still expected after the clear
+        assert_eq!(realtime_audio.get("microphone"), None); // microphone key must be gone
+        assert_eq!(
+            realtime_audio.get("speaker").and_then(TomlValue::as_str),
+            Some("Desk Speakers")
+        ); // speaker must be left untouched
+    }
+
     #[test]
     fn replace_mcp_servers_blocking_clears_table_when_empty() {
         let tmp = tempdir().expect("tmpdir");
diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs
index c5b22c829..e9da18b5c 100644
--- a/codex-rs/core/src/config/mod.rs
+++ b/codex-rs/core/src/config/mod.rs
@@ -426,6 +426,9 @@ pub struct Config {
     /// Base URL for requests to ChatGPT (as opposed to the OpenAI API).
     pub chatgpt_base_url: String,
 
+    /// Machine-local realtime audio device preferences used by realtime voice.
+    pub realtime_audio: RealtimeAudioConfig,
+
     /// Experimental / do not use. Overrides only the realtime conversation
     /// websocket transport base URL (the `Op::RealtimeConversation` `/ws`
     /// connection) without changing normal provider HTTP requests.
@@ -1175,6 +1178,10 @@ pub struct ConfigToml {
     /// Base URL for requests to ChatGPT (as opposed to the OpenAI API).
     pub chatgpt_base_url: Option<String>,
 
+    /// Machine-local realtime audio device preferences used by realtime voice.
+    #[serde(default)]
+    pub audio: Option<RealtimeAudioToml>,
+
     /// Experimental / do not use. Overrides only the realtime conversation
     /// websocket transport base URL (the `Op::RealtimeConversation` `/ws`
     /// connection) without changing normal provider HTTP requests.
@@ -1306,6 +1313,19 @@ impl ProjectConfig {
     }
 }
 
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct RealtimeAudioConfig { // resolved audio device preferences stored on `Config::realtime_audio`
+    pub microphone: Option<String>, // configured input device name; None means use the system default input
+    pub speaker: Option<String>, // configured output device name; None means use the system default output
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)]
+#[schemars(deny_unknown_fields)]
+pub struct RealtimeAudioToml { // serialized shape of the `[audio]` table read via `ConfigToml::audio`
+    pub microphone: Option<String>, // optional `microphone = "..."` entry
+    pub speaker: Option<String>, // optional `speaker = "..."` entry
+}
+
 #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, JsonSchema)]
 #[schemars(deny_unknown_fields)]
 pub struct ToolsToml {
@@ -2146,6 +2166,12 @@ impl Config {
                 .chatgpt_base_url
                 .or(cfg.chatgpt_base_url)
                 .unwrap_or("https://chatgpt.com/backend-api/".to_string()),
+            realtime_audio: cfg
+                .audio
+                .map_or_else(RealtimeAudioConfig::default, |audio| RealtimeAudioConfig {
+                    microphone: audio.microphone,
+                    speaker: audio.speaker,
+                }),
             experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url,
             experimental_realtime_ws_backend_prompt: cfg.experimental_realtime_ws_backend_prompt,
             forced_chatgpt_workspace_id,
@@ -4766,6 +4792,7 @@ model_verbosity = "high"
                 model_verbosity: None,
                 personality: Some(Personality::Pragmatic),
                 chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
+                realtime_audio: RealtimeAudioConfig::default(),
                 experimental_realtime_ws_base_url: None,
                 experimental_realtime_ws_backend_prompt: None,
                 base_instructions: None,
@@ -4892,6 +4919,7 @@ model_verbosity = "high"
             model_verbosity: None,
             personality: Some(Personality::Pragmatic),
             chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
+            realtime_audio: RealtimeAudioConfig::default(),
             experimental_realtime_ws_base_url: None,
             experimental_realtime_ws_backend_prompt: None,
             base_instructions: None,
@@ -5016,6 +5044,7 @@ model_verbosity = "high"
             model_verbosity: None,
             personality: Some(Personality::Pragmatic),
             chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
+            realtime_audio: RealtimeAudioConfig::default(),
             experimental_realtime_ws_base_url: None,
             experimental_realtime_ws_backend_prompt: None,
             base_instructions: None,
@@ -5126,6 +5155,7 @@ model_verbosity = "high"
             model_verbosity: Some(Verbosity::High),
             personality: Some(Personality::Pragmatic),
             chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
+            realtime_audio: RealtimeAudioConfig::default(),
             experimental_realtime_ws_base_url: None,
             experimental_realtime_ws_backend_prompt: None,
             base_instructions: None,
@@ -5970,6 +6000,39 @@ experimental_realtime_ws_backend_prompt = "prompt from config"
         );
         Ok(())
     }
+
+    #[test]
+    fn realtime_audio_loads_from_config_toml() -> std::io::Result<()> { // checks `[audio]` parsing and its mapping onto `Config`
+        let cfg: ConfigToml = toml::from_str( // stage 1: deserialize an `[audio]` table into ConfigToml
+            r#"
+[audio]
+microphone = "USB Mic"
+speaker = "Desk Speakers"
+"#,
+        )
+        .expect("TOML deserialization should succeed");
+
+        let realtime_audio = cfg
+            .audio
+            .as_ref()
+            .expect("realtime audio config should be present");
+        assert_eq!(realtime_audio.microphone.as_deref(), Some("USB Mic"));
+        assert_eq!(realtime_audio.speaker.as_deref(), Some("Desk Speakers"));
+
+        let codex_home = TempDir::new()?;
+        let config = Config::load_from_base_config_with_overrides( // stage 2: build the resolved Config from the parsed TOML
+            cfg,
+            ConfigOverrides::default(),
+            codex_home.path().to_path_buf(),
+        )?;
+
+        assert_eq!(config.realtime_audio.microphone.as_deref(), Some("USB Mic")); // values must carry over unchanged
+        assert_eq!(
+            config.realtime_audio.speaker.as_deref(),
+            Some("Desk Speakers")
+        );
+        Ok(())
+    }
 }
 
 #[cfg(test)]
diff --git a/codex-rs/tui/src/audio_device.rs b/codex-rs/tui/src/audio_device.rs
new file mode 100644
index 000000000..ec8a86ff2
--- /dev/null
+++ b/codex-rs/tui/src/audio_device.rs
@@ -0,0 +1,122 @@
+use codex_core::config::Config;
+use cpal::traits::DeviceTrait;
+use cpal::traits::HostTrait;
+use tracing::warn;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum AudioDeviceKind { // direction selector shared by the device lookup helpers in this module
+    Input, // capture side; paired with the configured microphone
+    Output, // playback side; paired with the configured speaker
+}
+
+impl AudioDeviceKind {
+    fn noun(self) -> &'static str { // word interpolated into the fallback warning message
+        match self {
+            Self::Input => "input",
+            Self::Output => "output",
+        }
+    }
+
+    fn configured_name(self, config: &Config) -> Option<&str> { // device name from config for this direction, if one is set
+        match self {
+            Self::Input => config.realtime_audio.microphone.as_deref(),
+            Self::Output => config.realtime_audio.speaker.as_deref(),
+        }
+    }
+}
+
+pub(crate) fn select_configured_input_device_and_config(
+    config: &Config,
+) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { // Err carries a human-readable message
+    select_device_and_config(AudioDeviceKind::Input, config) // input-direction entry point to the shared selection logic
+}
+
+pub(crate) fn select_configured_output_device_and_config(
+    config: &Config,
+) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { // Err carries a human-readable message
+    select_device_and_config(AudioDeviceKind::Output, config) // output-direction entry point to the shared selection logic
+}
+
+fn select_device_and_config(
+    kind: AudioDeviceKind,
+    config: &Config,
+) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { // picks a device for `kind` plus that device's default stream config
+    let host = cpal::default_host();
+    let configured_name = kind.configured_name(config); // None when the user configured nothing for this direction
+    let selected = configured_name
+        .and_then(|name| find_device_by_name(&host, kind, name)) // first choice: device whose name equals the configured one
+        .or_else(|| { // reached when nothing is configured or the configured device was not found
+            let default_device = default_device(&host, kind);
+            if let Some(name) = configured_name
+                && default_device.is_some() // warn only when the fallback actually yields a device
+            {
+                warn!(
+                    "configured {} audio device `{name}` was unavailable; falling back to system default",
+                    kind.noun()
+                );
+            }
+            default_device
+        })
+        .ok_or_else(|| missing_device_error(kind, configured_name))?; // no configured match and no default device
+
+    let stream_config = default_config(&selected, kind)?; // a config failure is returned as-is; no second fallback is attempted
+    Ok((selected, stream_config))
+}
+
+fn find_device_by_name(
+    host: &cpal::Host,
+    kind: AudioDeviceKind,
+    name: &str,
+) -> Option<cpal::Device> { // first device of `kind` whose reported name equals `name`
+    let devices = devices(host, kind).ok()?; // an enumeration error is treated the same as "not found"
+    devices
+        .into_iter()
+        .find(|device| device.name().ok().as_deref() == Some(name)) // exact string match; devices whose name() errors never match
+}
+
+fn devices(host: &cpal::Host, kind: AudioDeviceKind) -> Result<Vec<cpal::Device>, String> { // collects the host's devices for `kind`
+    match kind {
+        AudioDeviceKind::Input => host
+            .input_devices()
+            .map(|devices| devices.collect()) // materialize the iterator into the Vec return type
+            .map_err(|err| format!("failed to enumerate input audio devices: {err}")),
+        AudioDeviceKind::Output => host
+            .output_devices()
+            .map(|devices| devices.collect()) // materialize the iterator into the Vec return type
+            .map_err(|err| format!("failed to enumerate output audio devices: {err}")),
+    }
+}
+
+fn default_device(host: &cpal::Host, kind: AudioDeviceKind) -> Option<cpal::Device> { // host default device for `kind`, if any
+    match kind {
+        AudioDeviceKind::Input => host.default_input_device(),
+        AudioDeviceKind::Output => host.default_output_device(),
+    }
+}
+
+fn default_config(
+    device: &cpal::Device,
+    kind: AudioDeviceKind,
+) -> Result<cpal::SupportedStreamConfig, String> { // the device's default stream config for the requested direction
+    match kind {
+        AudioDeviceKind::Input => device
+            .default_input_config()
+            .map_err(|err| format!("failed to get default input config: {err}")), // error converted to a message string
+        AudioDeviceKind::Output => device
+            .default_output_config()
+            .map_err(|err| format!("failed to get default output config: {err}")), // error converted to a message string
+    }
+}
+
+fn missing_device_error(kind: AudioDeviceKind, configured_name: Option<&str>) -> String { // message used when no device could be selected
+    match (kind, configured_name) {
+        (AudioDeviceKind::Input, Some(name)) => format!( // a name was configured, so the message mentions it
+            "configured input audio device `{name}` was unavailable and no default input audio device was found"
+        ),
+        (AudioDeviceKind::Output, Some(name)) => format!( // a name was configured, so the message mentions it
+            "configured output audio device `{name}` was unavailable and no default output audio device was found"
+        ),
+        (AudioDeviceKind::Input, None) => "no input audio device available".to_string(), // nothing configured: generic message
+        (AudioDeviceKind::Output, None) => "no output audio device available".to_string(), // nothing configured: generic message
+    }
+}
diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs
index 2cf97188b..e4fd2b631 100644
--- a/codex-rs/tui/src/chatwidget/realtime.rs
+++ b/codex-rs/tui/src/chatwidget/realtime.rs
@@ -207,7 +207,7 @@ impl ChatWidget {
         {
             if self.realtime_conversation.audio_player.is_none() {
                 self.realtime_conversation.audio_player =
-                    crate::voice::RealtimeAudioPlayer::start().ok();
+                    crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
             }
             if let Some(player) = &self.realtime_conversation.audio_player
                 && let Err(err) = player.enqueue_frame(frame)
@@ -231,7 +231,10 @@ impl ChatWidget {
         self.realtime_conversation.meter_placeholder_id = Some(placeholder_id.clone());
         self.request_redraw();
 
-        let capture = match crate::voice::VoiceCapture::start_realtime(self.app_event_tx.clone()) {
+        let capture = match crate::voice::VoiceCapture::start_realtime(
+            &self.config,
+            self.app_event_tx.clone(),
+        ) {
             Ok(capture) => capture,
             Err(err) => {
                 self.remove_transcription_placeholder(&placeholder_id);
@@ -250,7 +253,7 @@ impl ChatWidget {
         self.realtime_conversation.capture = Some(capture);
         if self.realtime_conversation.audio_player.is_none() {
             self.realtime_conversation.audio_player =
-                crate::voice::RealtimeAudioPlayer::start().ok();
+                crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
         }
 
         std::thread::spawn(move || {
diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs
index cf805d0b9..5a4c87aef 100644
--- a/codex-rs/tui/src/lib.rs
+++ b/codex-rs/tui/src/lib.rs
@@ -63,6 +63,8 @@ mod app_backtrack;
 mod app_event;
 mod app_event_sender;
 mod ascii_animation;
+#[cfg(all(not(target_os = "linux"), feature = "voice-input"))]
+mod audio_device;
 mod bottom_pane;
 mod chatwidget;
 mod cli;
@@ -123,6 +125,7 @@ mod voice;
 mod voice {
     use crate::app_event::AppEvent;
     use crate::app_event_sender::AppEventSender;
+    use codex_core::config::Config;
     use codex_protocol::protocol::RealtimeAudioFrame;
     use std::sync::Arc;
     use std::sync::Mutex;
@@ -146,7 +149,7 @@ mod voice {
             Err("voice input is unavailable in this build".to_string())
         }
 
-        pub fn start_realtime(_tx: AppEventSender) -> Result<Self, String> {
+        pub fn start_realtime(_config: &Config, _tx: AppEventSender) -> Result<Self, String> {
             Err("voice input is unavailable in this build".to_string())
         }
 
@@ -186,7 +189,7 @@ mod voice {
     }
 
     impl RealtimeAudioPlayer {
-        pub(crate) fn start() -> Result<Self, String> {
+        pub(crate) fn start(_config: &Config) -> Result<Self, String> {
             Err("voice output is unavailable in this build".to_string())
         }
 
diff --git a/codex-rs/tui/src/voice.rs b/codex-rs/tui/src/voice.rs
index 443ccc88a..6c4236ac8 100644
--- a/codex-rs/tui/src/voice.rs
+++ b/codex-rs/tui/src/voice.rs
@@ -51,7 +51,7 @@ pub struct VoiceCapture {
 
 impl VoiceCapture {
     pub fn start() -> Result<Self, String> {
-        let (device, config) = select_input_device_and_config()?;
+        let (device, config) = select_default_input_device_and_config()?;
 
         let sample_rate = config.sample_rate().0;
         let channels = config.channels();
@@ -74,8 +74,8 @@ impl VoiceCapture {
         })
     }
 
-    pub fn start_realtime(tx: AppEventSender) -> Result<Self, String> {
-        let (device, config) = select_input_device_and_config()?;
+    pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result<Self, String> {
+        let (device, config) = select_realtime_input_device_and_config(config)?;
 
         let sample_rate = config.sample_rate().0;
         let channels = config.channels();
@@ -262,7 +262,8 @@ pub fn transcribe_async(
 // Voice input helpers
 // -------------------------
 
-fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
+fn select_default_input_device_and_config()
+-> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
     let host = cpal::default_host();
     let device = host
         .default_input_device()
@@ -273,6 +274,12 @@ fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStre
     Ok((device, config))
 }
 
+fn select_realtime_input_device_and_config(
+    config: &Config,
+) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
+    crate::audio_device::select_configured_input_device_and_config(config) // realtime capture uses the config-aware selector
+}
+
 fn build_input_stream(
     device: &cpal::Device,
     config: &cpal::SupportedStreamConfig,
@@ -466,14 +473,9 @@ pub(crate) struct RealtimeAudioPlayer {
 }
 
 impl RealtimeAudioPlayer {
-    pub(crate) fn start() -> Result<Self, String> {
-        let host = cpal::default_host();
-        let device = host
-            .default_output_device()
-            .ok_or_else(|| "no output audio device available".to_string())?;
-        let config = device
-            .default_output_config()
-            .map_err(|e| format!("failed to get default output config: {e}"))?;
+    pub(crate) fn start(config: &Config) -> Result<Self, String> {
+        let (device, config) =
+            crate::audio_device::select_configured_output_device_and_config(config)?;
         let output_sample_rate = config.sample_rate().0;
         let output_channels = config.channels();
         let queue = Arc::new(Mutex::new(VecDeque::new()));