From a0e86c69fe35e8ccfbed7ed87f07914d467ec488 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Thu, 26 Feb 2026 15:08:21 -0800 Subject: [PATCH] Add realtime audio device config (#12849) ## Summary - add top-level realtime audio config for microphone and speaker selection - apply configured devices when starting realtime capture and playback - keep missing-device behavior on the system default fallback path ## Validation - just write-config-schema - cargo test -p codex-core realtime_audio - cargo test -p codex-tui - just fix -p codex-core - just fix -p codex-tui - just fmt --------- Co-authored-by: Codex --- codex-rs/core/config.schema.json | 21 ++++ codex-rs/core/src/config/edit.rs | 68 +++++++++++++ codex-rs/core/src/config/mod.rs | 63 ++++++++++++ codex-rs/tui/src/audio_device.rs | 122 ++++++++++++++++++++++++ codex-rs/tui/src/chatwidget/realtime.rs | 9 +- codex-rs/tui/src/lib.rs | 7 +- codex-rs/tui/src/voice.rs | 26 ++--- 7 files changed, 299 insertions(+), 17 deletions(-) create mode 100644 codex-rs/tui/src/audio_device.rs diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index b9986e65f..01ac97ef2 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -1179,6 +1179,18 @@ }, "type": "object" }, + "RealtimeAudioToml": { + "additionalProperties": false, + "properties": { + "microphone": { + "type": "string" + }, + "speaker": { + "type": "string" + } + }, + "type": "object" + }, "ReasoningEffort": { "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning", "enum": [ @@ -1540,6 +1552,15 @@ "default": null, "description": "Settings for app-specific controls." }, + "audio": { + "allOf": [ + { + "$ref": "#/definitions/RealtimeAudioToml" + } + ], + "default": null, + "description": "Machine-local realtime audio device preferences used by realtime voice." + }, "background_terminal_max_timeout": { "description": "Maximum poll window for background terminal output (`write_stdin`), in milliseconds. Default: `300000` (5 minutes).", "format": "uint64", diff --git a/codex-rs/core/src/config/edit.rs b/codex-rs/core/src/config/edit.rs index fceb96599..ad404a9f8 100644 --- a/codex-rs/core/src/config/edit.rs +++ b/codex-rs/core/src/config/edit.rs @@ -839,6 +839,30 @@ impl ConfigEditsBuilder { self } + pub fn set_realtime_microphone(mut self, microphone: Option<&str>) -> Self { + let segments = vec!["audio".to_string(), "microphone".to_string()]; + match microphone { + Some(microphone) => self.edits.push(ConfigEdit::SetPath { + segments, + value: value(microphone), + }), + None => self.edits.push(ConfigEdit::ClearPath { segments }), + } + self + } + + pub fn set_realtime_speaker(mut self, speaker: Option<&str>) -> Self { + let segments = vec!["audio".to_string(), "speaker".to_string()]; + match speaker { + Some(speaker) => self.edits.push(ConfigEdit::SetPath { + segments, + value: value(speaker), + }), + None => self.edits.push(ConfigEdit::ClearPath { segments }), + } + self + } + pub fn clear_legacy_windows_sandbox_keys(mut self) -> Self { for key in [ "experimental_windows_sandbox", @@ -1804,6 +1828,50 @@ model_reasoning_effort = "high" assert_eq!(notice, Some(true)); } + #[test] + fn blocking_builder_set_realtime_audio_persists_and_clears() { + let tmp = tempdir().expect("tmpdir"); + let codex_home = tmp.path(); + + ConfigEditsBuilder::new(codex_home) + .set_realtime_microphone(Some("USB Mic")) + .set_realtime_speaker(Some("Desk Speakers")) + .apply_blocking() + .expect("persist realtime audio"); + + let raw = std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config"); + let config: TomlValue = toml::from_str(&raw).expect("parse config"); + let realtime_audio = config + .get("audio") + .and_then(TomlValue::as_table) + .expect("audio table should exist"); + assert_eq!( + realtime_audio.get("microphone").and_then(TomlValue::as_str), + Some("USB Mic") + ); + assert_eq!( + realtime_audio.get("speaker").and_then(TomlValue::as_str), + Some("Desk Speakers") + ); + + ConfigEditsBuilder::new(codex_home) + .set_realtime_microphone(None) + .apply_blocking() + .expect("clear realtime microphone"); + + let raw = std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config"); + let config: TomlValue = toml::from_str(&raw).expect("parse config"); + let realtime_audio = config + .get("audio") + .and_then(TomlValue::as_table) + .expect("audio table should exist"); + assert_eq!(realtime_audio.get("microphone"), None); + assert_eq!( + realtime_audio.get("speaker").and_then(TomlValue::as_str), + Some("Desk Speakers") + ); + } + #[test] fn replace_mcp_servers_blocking_clears_table_when_empty() { let tmp = tempdir().expect("tmpdir"); diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index c5b22c829..e9da18b5c 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -426,6 +426,9 @@ pub struct Config { /// Base URL for requests to ChatGPT (as opposed to the OpenAI API). pub chatgpt_base_url: String, + /// Machine-local realtime audio device preferences used by realtime voice. + pub realtime_audio: RealtimeAudioConfig, + /// Experimental / do not use. Overrides only the realtime conversation /// websocket transport base URL (the `Op::RealtimeConversation` `/ws` /// connection) without changing normal provider HTTP requests. @@ -1175,6 +1178,10 @@ pub struct ConfigToml { /// Base URL for requests to ChatGPT (as opposed to the OpenAI API). pub chatgpt_base_url: Option, + /// Machine-local realtime audio device preferences used by realtime voice. + #[serde(default)] + pub audio: Option, + /// Experimental / do not use. Overrides only the realtime conversation /// websocket transport base URL (the `Op::RealtimeConversation` `/ws` /// connection) without changing normal provider HTTP requests. @@ -1306,6 +1313,19 @@ impl ProjectConfig { } } +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct RealtimeAudioConfig { + pub microphone: Option, + pub speaker: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)] +#[schemars(deny_unknown_fields)] +pub struct RealtimeAudioToml { + pub microphone: Option, + pub speaker: Option, +} + #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, JsonSchema)] #[schemars(deny_unknown_fields)] pub struct ToolsToml { @@ -2146,6 +2166,12 @@ impl Config { .chatgpt_base_url .or(cfg.chatgpt_base_url) .unwrap_or("https://chatgpt.com/backend-api/".to_string()), + realtime_audio: cfg + .audio + .map_or_else(RealtimeAudioConfig::default, |audio| RealtimeAudioConfig { + microphone: audio.microphone, + speaker: audio.speaker, + }), experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url, experimental_realtime_ws_backend_prompt: cfg.experimental_realtime_ws_backend_prompt, forced_chatgpt_workspace_id, @@ -4766,6 +4792,7 @@ model_verbosity = "high" model_verbosity: None, personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), + realtime_audio: RealtimeAudioConfig::default(), experimental_realtime_ws_base_url: None, experimental_realtime_ws_backend_prompt: None, base_instructions: None, @@ -4892,6 +4919,7 @@ model_verbosity = "high" model_verbosity: None, personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), + realtime_audio: RealtimeAudioConfig::default(), experimental_realtime_ws_base_url: None, experimental_realtime_ws_backend_prompt: None, base_instructions: None, @@ -5016,6 +5044,7 @@ model_verbosity = "high" model_verbosity: None, personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), + realtime_audio: RealtimeAudioConfig::default(), experimental_realtime_ws_base_url: None, experimental_realtime_ws_backend_prompt: None, base_instructions: None, @@ -5126,6 +5155,7 @@ model_verbosity = "high" model_verbosity: Some(Verbosity::High), personality: Some(Personality::Pragmatic), chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(), + realtime_audio: RealtimeAudioConfig::default(), experimental_realtime_ws_base_url: None, experimental_realtime_ws_backend_prompt: None, base_instructions: None, @@ -5970,6 +6000,39 @@ experimental_realtime_ws_backend_prompt = "prompt from config" ); Ok(()) } + + #[test] + fn realtime_audio_loads_from_config_toml() -> std::io::Result<()> { + let cfg: ConfigToml = toml::from_str( + r#" +[audio] +microphone = "USB Mic" +speaker = "Desk Speakers" +"#, + ) + .expect("TOML deserialization should succeed"); + + let realtime_audio = cfg + .audio + .as_ref() + .expect("realtime audio config should be present"); + assert_eq!(realtime_audio.microphone.as_deref(), Some("USB Mic")); + assert_eq!(realtime_audio.speaker.as_deref(), Some("Desk Speakers")); + + let codex_home = TempDir::new()?; + let config = Config::load_from_base_config_with_overrides( + cfg, + ConfigOverrides::default(), + codex_home.path().to_path_buf(), + )?; + + assert_eq!(config.realtime_audio.microphone.as_deref(), Some("USB Mic")); + assert_eq!( + config.realtime_audio.speaker.as_deref(), + Some("Desk Speakers") + ); + Ok(()) + } } #[cfg(test)] diff --git a/codex-rs/tui/src/audio_device.rs b/codex-rs/tui/src/audio_device.rs new file mode 100644 index 000000000..ec8a86ff2 --- /dev/null +++ b/codex-rs/tui/src/audio_device.rs @@ -0,0 +1,122 @@ +use codex_core::config::Config; +use cpal::traits::DeviceTrait; +use cpal::traits::HostTrait; +use tracing::warn; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum AudioDeviceKind { + Input, + Output, +} + +impl AudioDeviceKind { + fn noun(self) -> &'static str { + match self { + Self::Input => "input", + Self::Output => "output", + } + } + + fn configured_name(self, config: &Config) -> Option<&str> { + match self { + Self::Input => config.realtime_audio.microphone.as_deref(), + Self::Output => config.realtime_audio.speaker.as_deref(), + } + } +} + +pub(crate) fn select_configured_input_device_and_config( + config: &Config, +) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { + select_device_and_config(AudioDeviceKind::Input, config) +} + +pub(crate) fn select_configured_output_device_and_config( + config: &Config, +) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { + select_device_and_config(AudioDeviceKind::Output, config) +} + +fn select_device_and_config( + kind: AudioDeviceKind, + config: &Config, +) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { + let host = cpal::default_host(); + let configured_name = kind.configured_name(config); + let selected = configured_name + .and_then(|name| find_device_by_name(&host, kind, name)) + .or_else(|| { + let default_device = default_device(&host, kind); + if let Some(name) = configured_name + && default_device.is_some() + { + warn!( + "configured {} audio device `{name}` was unavailable; falling back to system default", + kind.noun() + ); + } + default_device + }) + .ok_or_else(|| missing_device_error(kind, configured_name))?; + + let stream_config = default_config(&selected, kind)?; + Ok((selected, stream_config)) +} + +fn find_device_by_name( + host: &cpal::Host, + kind: AudioDeviceKind, + name: &str, +) -> Option { + let devices = devices(host, kind).ok()?; + devices + .into_iter() + .find(|device| device.name().ok().as_deref() == Some(name)) +} + +fn devices(host: &cpal::Host, kind: AudioDeviceKind) -> Result, String> { + match kind { + AudioDeviceKind::Input => host + .input_devices() + .map(|devices| devices.collect()) + .map_err(|err| format!("failed to enumerate input audio devices: {err}")), + AudioDeviceKind::Output => host + .output_devices() + .map(|devices| devices.collect()) + .map_err(|err| format!("failed to enumerate output audio devices: {err}")), + } +} + +fn default_device(host: &cpal::Host, kind: AudioDeviceKind) -> Option { + match kind { + AudioDeviceKind::Input => host.default_input_device(), + AudioDeviceKind::Output => host.default_output_device(), + } +} + +fn default_config( + device: &cpal::Device, + kind: AudioDeviceKind, +) -> Result { + match kind { + AudioDeviceKind::Input => device + .default_input_config() + .map_err(|err| format!("failed to get default input config: {err}")), + AudioDeviceKind::Output => device + .default_output_config() + .map_err(|err| format!("failed to get default output config: {err}")), + } +} + +fn missing_device_error(kind: AudioDeviceKind, configured_name: Option<&str>) -> String { + match (kind, configured_name) { + (AudioDeviceKind::Input, Some(name)) => format!( + "configured input audio device `{name}` was unavailable and no default input audio device was found" + ), + (AudioDeviceKind::Output, Some(name)) => format!( + "configured output audio device `{name}` was unavailable and no default output audio device was found" + ), + (AudioDeviceKind::Input, None) => "no input audio device available".to_string(), + (AudioDeviceKind::Output, None) => "no output audio device available".to_string(), + } +} diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index 2cf97188b..e4fd2b631 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -207,7 +207,7 @@ impl ChatWidget { { if self.realtime_conversation.audio_player.is_none() { self.realtime_conversation.audio_player = - crate::voice::RealtimeAudioPlayer::start().ok(); + crate::voice::RealtimeAudioPlayer::start(&self.config).ok(); } if let Some(player) = &self.realtime_conversation.audio_player && let Err(err) = player.enqueue_frame(frame) @@ -231,7 +231,10 @@ impl ChatWidget { self.realtime_conversation.meter_placeholder_id = Some(placeholder_id.clone()); self.request_redraw(); - let capture = match crate::voice::VoiceCapture::start_realtime(self.app_event_tx.clone()) { + let capture = match crate::voice::VoiceCapture::start_realtime( + &self.config, + self.app_event_tx.clone(), + ) { Ok(capture) => capture, Err(err) => { self.remove_transcription_placeholder(&placeholder_id); @@ -250,7 +253,7 @@ impl ChatWidget { self.realtime_conversation.capture = Some(capture); if self.realtime_conversation.audio_player.is_none() { self.realtime_conversation.audio_player = - crate::voice::RealtimeAudioPlayer::start().ok(); + crate::voice::RealtimeAudioPlayer::start(&self.config).ok(); } std::thread::spawn(move || { diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs index cf805d0b9..5a4c87aef 100644 --- a/codex-rs/tui/src/lib.rs +++ b/codex-rs/tui/src/lib.rs @@ -63,6 +63,8 @@ mod app_backtrack; mod app_event; mod app_event_sender; mod ascii_animation; +#[cfg(all(not(target_os = "linux"), feature = "voice-input"))] +mod audio_device; mod bottom_pane; mod chatwidget; mod cli; @@ -123,6 +125,7 @@ mod voice; mod voice { use crate::app_event::AppEvent; use crate::app_event_sender::AppEventSender; + use codex_core::config::Config; use codex_protocol::protocol::RealtimeAudioFrame; use std::sync::Arc; use std::sync::Mutex; @@ -146,7 +149,7 @@ mod voice { Err("voice input is unavailable in this build".to_string()) } - pub fn start_realtime(_tx: AppEventSender) -> Result { + pub fn start_realtime(_config: &Config, _tx: AppEventSender) -> Result { Err("voice input is unavailable in this build".to_string()) } @@ -186,7 +189,7 @@ mod voice { } impl RealtimeAudioPlayer { - pub(crate) fn start() -> Result { + pub(crate) fn start(_config: &Config) -> Result { Err("voice output is unavailable in this build".to_string()) } diff --git a/codex-rs/tui/src/voice.rs b/codex-rs/tui/src/voice.rs index 443ccc88a..6c4236ac8 100644 --- a/codex-rs/tui/src/voice.rs +++ b/codex-rs/tui/src/voice.rs @@ -51,7 +51,7 @@ pub struct VoiceCapture { impl VoiceCapture { pub fn start() -> Result { - let (device, config) = select_input_device_and_config()?; + let (device, config) = select_default_input_device_and_config()?; let sample_rate = config.sample_rate().0; let channels = config.channels(); @@ -74,8 +74,8 @@ impl VoiceCapture { }) } - pub fn start_realtime(tx: AppEventSender) -> Result { - let (device, config) = select_input_device_and_config()?; + pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result { + let (device, config) = select_realtime_input_device_and_config(config)?; let sample_rate = config.sample_rate().0; let channels = config.channels(); @@ -262,7 +262,8 @@ pub fn transcribe_async( // Voice input helpers // ------------------------- -fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { +fn select_default_input_device_and_config() +-> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { let host = cpal::default_host(); let device = host .default_input_device() @@ -273,6 +274,12 @@ fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStre Ok((device, config)) } +fn select_realtime_input_device_and_config( + config: &Config, +) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> { + crate::audio_device::select_configured_input_device_and_config(config) +} + fn build_input_stream( device: &cpal::Device, config: &cpal::SupportedStreamConfig, @@ -466,14 +473,9 @@ pub(crate) struct RealtimeAudioPlayer { } impl RealtimeAudioPlayer { - pub(crate) fn start() -> Result { - let host = cpal::default_host(); - let device = host - .default_output_device() - .ok_or_else(|| "no output audio device available".to_string())?; - let config = device - .default_output_config() - .map_err(|e| format!("failed to get default output config: {e}"))?; + pub(crate) fn start(config: &Config) -> Result { + let (device, config) = + crate::audio_device::select_configured_output_device_and_config(config)?; let output_sample_rate = config.sample_rate().0; let output_channels = config.channels(); let queue = Arc::new(Mutex::new(VecDeque::new()));