Add realtime audio device config (#12849)

## Summary
- add top-level realtime audio config for microphone and speaker
selection
- apply configured devices when starting realtime capture and playback
- fall back to the system default device when a configured device is unavailable

## Validation
- just write-config-schema
- cargo test -p codex-core realtime_audio
- cargo test -p codex-tui
- just fix -p codex-core
- just fix -p codex-tui
- just fmt

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim 2026-02-26 15:08:21 -08:00 committed by GitHub
parent fd719d3828
commit a0e86c69fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 299 additions and 17 deletions

View file

@ -1179,6 +1179,18 @@
},
"type": "object"
},
"RealtimeAudioToml": {
"additionalProperties": false,
"properties": {
"microphone": {
"type": "string"
},
"speaker": {
"type": "string"
}
},
"type": "object"
},
"ReasoningEffort": {
"description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning",
"enum": [
@ -1540,6 +1552,15 @@
"default": null,
"description": "Settings for app-specific controls."
},
"audio": {
"allOf": [
{
"$ref": "#/definitions/RealtimeAudioToml"
}
],
"default": null,
"description": "Machine-local realtime audio device preferences used by realtime voice."
},
"background_terminal_max_timeout": {
"description": "Maximum poll window for background terminal output (`write_stdin`), in milliseconds. Default: `300000` (5 minutes).",
"format": "uint64",

View file

@ -839,6 +839,30 @@ impl ConfigEditsBuilder {
self
}
/// Queues an edit that sets `audio.microphone` to the given device name,
/// or clears the key when `None` is passed.
pub fn set_realtime_microphone(mut self, microphone: Option<&str>) -> Self {
    self.push_audio_device_edit("microphone", microphone);
    self
}
/// Queues an edit that sets `audio.speaker` to the given device name,
/// or clears the key when `None` is passed.
pub fn set_realtime_speaker(mut self, speaker: Option<&str>) -> Self {
    self.push_audio_device_edit("speaker", speaker);
    self
}
/// Shared helper for the realtime-audio setters: queues a set edit for
/// `audio.<key>` when a device name is given, otherwise a clear edit.
fn push_audio_device_edit(&mut self, key: &str, device: Option<&str>) {
    let segments = vec!["audio".to_string(), key.to_string()];
    match device {
        Some(name) => self.edits.push(ConfigEdit::SetPath {
            segments,
            value: value(name),
        }),
        None => self.edits.push(ConfigEdit::ClearPath { segments }),
    }
}
pub fn clear_legacy_windows_sandbox_keys(mut self) -> Self {
for key in [
"experimental_windows_sandbox",
@ -1804,6 +1828,50 @@ model_reasoning_effort = "high"
assert_eq!(notice, Some(true));
}
#[test]
fn blocking_builder_set_realtime_audio_persists_and_clears() {
    let tmp = tempdir().expect("tmpdir");
    let codex_home = tmp.path();
    // Reads config.toml from `codex_home` and returns its `[audio]` table.
    let read_audio_table = || -> toml::value::Table {
        let raw =
            std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config");
        let config: TomlValue = toml::from_str(&raw).expect("parse config");
        config
            .get("audio")
            .and_then(TomlValue::as_table)
            .expect("audio table should exist")
            .clone()
    };
    // Persist both devices and confirm they round-trip through the file.
    ConfigEditsBuilder::new(codex_home)
        .set_realtime_microphone(Some("USB Mic"))
        .set_realtime_speaker(Some("Desk Speakers"))
        .apply_blocking()
        .expect("persist realtime audio");
    let audio = read_audio_table();
    assert_eq!(
        audio.get("microphone").and_then(TomlValue::as_str),
        Some("USB Mic")
    );
    assert_eq!(
        audio.get("speaker").and_then(TomlValue::as_str),
        Some("Desk Speakers")
    );
    // Clearing the microphone must leave the speaker entry untouched.
    ConfigEditsBuilder::new(codex_home)
        .set_realtime_microphone(None)
        .apply_blocking()
        .expect("clear realtime microphone");
    let audio = read_audio_table();
    assert_eq!(audio.get("microphone"), None);
    assert_eq!(
        audio.get("speaker").and_then(TomlValue::as_str),
        Some("Desk Speakers")
    );
}
#[test]
fn replace_mcp_servers_blocking_clears_table_when_empty() {
let tmp = tempdir().expect("tmpdir");

View file

@ -426,6 +426,9 @@ pub struct Config {
/// Base URL for requests to ChatGPT (as opposed to the OpenAI API).
pub chatgpt_base_url: String,
/// Machine-local realtime audio device preferences used by realtime voice.
pub realtime_audio: RealtimeAudioConfig,
/// Experimental / do not use. Overrides only the realtime conversation
/// websocket transport base URL (the `Op::RealtimeConversation` `/ws`
/// connection) without changing normal provider HTTP requests.
@ -1175,6 +1178,10 @@ pub struct ConfigToml {
/// Base URL for requests to ChatGPT (as opposed to the OpenAI API).
pub chatgpt_base_url: Option<String>,
/// Machine-local realtime audio device preferences used by realtime voice.
#[serde(default)]
pub audio: Option<RealtimeAudioToml>,
/// Experimental / do not use. Overrides only the realtime conversation
/// websocket transport base URL (the `Op::RealtimeConversation` `/ws`
/// connection) without changing normal provider HTTP requests.
@ -1306,6 +1313,19 @@ impl ProjectConfig {
}
}
/// Effective machine-local realtime audio device preferences, resolved from
/// the `[audio]` table in config.toml. A `None` field means "use the system
/// default device" (see the fallback path in device selection).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RealtimeAudioConfig {
    /// Preferred capture (input) device name, if configured.
    pub microphone: Option<String>,
    /// Preferred playback (output) device name, if configured.
    pub speaker: Option<String>,
}
// TOML-facing shape of the `[audio]` table in config.toml; copied field-for-field
// into `RealtimeAudioConfig` during config resolution.
// NOTE: intentionally using `//` comments here — `///` doc comments would be
// picked up by schemars and change the generated JSON schema output.
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)]
#[schemars(deny_unknown_fields)]
pub struct RealtimeAudioToml {
    // Preferred capture (input) device name.
    pub microphone: Option<String>,
    // Preferred playback (output) device name.
    pub speaker: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, JsonSchema)]
#[schemars(deny_unknown_fields)]
pub struct ToolsToml {
@ -2146,6 +2166,12 @@ impl Config {
.chatgpt_base_url
.or(cfg.chatgpt_base_url)
.unwrap_or("https://chatgpt.com/backend-api/".to_string()),
realtime_audio: cfg
.audio
.map_or_else(RealtimeAudioConfig::default, |audio| RealtimeAudioConfig {
microphone: audio.microphone,
speaker: audio.speaker,
}),
experimental_realtime_ws_base_url: cfg.experimental_realtime_ws_base_url,
experimental_realtime_ws_backend_prompt: cfg.experimental_realtime_ws_backend_prompt,
forced_chatgpt_workspace_id,
@ -4766,6 +4792,7 @@ model_verbosity = "high"
model_verbosity: None,
personality: Some(Personality::Pragmatic),
chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
realtime_audio: RealtimeAudioConfig::default(),
experimental_realtime_ws_base_url: None,
experimental_realtime_ws_backend_prompt: None,
base_instructions: None,
@ -4892,6 +4919,7 @@ model_verbosity = "high"
model_verbosity: None,
personality: Some(Personality::Pragmatic),
chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
realtime_audio: RealtimeAudioConfig::default(),
experimental_realtime_ws_base_url: None,
experimental_realtime_ws_backend_prompt: None,
base_instructions: None,
@ -5016,6 +5044,7 @@ model_verbosity = "high"
model_verbosity: None,
personality: Some(Personality::Pragmatic),
chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
realtime_audio: RealtimeAudioConfig::default(),
experimental_realtime_ws_base_url: None,
experimental_realtime_ws_backend_prompt: None,
base_instructions: None,
@ -5126,6 +5155,7 @@ model_verbosity = "high"
model_verbosity: Some(Verbosity::High),
personality: Some(Personality::Pragmatic),
chatgpt_base_url: "https://chatgpt.com/backend-api/".to_string(),
realtime_audio: RealtimeAudioConfig::default(),
experimental_realtime_ws_base_url: None,
experimental_realtime_ws_backend_prompt: None,
base_instructions: None,
@ -5970,6 +6000,39 @@ experimental_realtime_ws_backend_prompt = "prompt from config"
);
Ok(())
}
#[test]
fn realtime_audio_loads_from_config_toml() -> std::io::Result<()> {
    // Deserialize an `[audio]` table directly from TOML text.
    let cfg: ConfigToml = toml::from_str(
        r#"
[audio]
microphone = "USB Mic"
speaker = "Desk Speakers"
"#,
    )
    .expect("TOML deserialization should succeed");
    let audio = cfg
        .audio
        .as_ref()
        .expect("realtime audio config should be present");
    assert_eq!(audio.microphone.as_deref(), Some("USB Mic"));
    assert_eq!(audio.speaker.as_deref(), Some("Desk Speakers"));
    // The raw TOML values must survive resolution into the effective Config.
    let codex_home = TempDir::new()?;
    let config = Config::load_from_base_config_with_overrides(
        cfg,
        ConfigOverrides::default(),
        codex_home.path().to_path_buf(),
    )?;
    assert_eq!(config.realtime_audio.microphone.as_deref(), Some("USB Mic"));
    assert_eq!(
        config.realtime_audio.speaker.as_deref(),
        Some("Desk Speakers")
    );
    Ok(())
}
}
#[cfg(test)]

View file

@ -0,0 +1,122 @@
use codex_core::config::Config;
use cpal::traits::DeviceTrait;
use cpal::traits::HostTrait;
use tracing::warn;
/// Direction of an audio device: capture (input) or playback (output).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum AudioDeviceKind {
    Input,
    Output,
}
impl AudioDeviceKind {
    /// Human-readable noun used when building log and error messages.
    fn noun(self) -> &'static str {
        match self {
            AudioDeviceKind::Input => "input",
            AudioDeviceKind::Output => "output",
        }
    }
    /// The device name the user configured for this direction, if any.
    fn configured_name(self, config: &Config) -> Option<&str> {
        let audio = &config.realtime_audio;
        match self {
            AudioDeviceKind::Input => audio.microphone.as_deref(),
            AudioDeviceKind::Output => audio.speaker.as_deref(),
        }
    }
}
/// Picks the capture (input) device and stream config for realtime voice,
/// preferring the microphone named in `config` and falling back to the
/// system default when it is absent or unavailable.
pub(crate) fn select_configured_input_device_and_config(
    config: &Config,
) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
    select_device_and_config(AudioDeviceKind::Input, config)
}
/// Picks the playback (output) device and stream config for realtime voice,
/// preferring the speaker named in `config` and falling back to the
/// system default when it is absent or unavailable.
pub(crate) fn select_configured_output_device_and_config(
    config: &Config,
) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
    select_device_and_config(AudioDeviceKind::Output, config)
}
/// Resolves the device and default stream config for `kind`, honoring the
/// device name from the user's config and falling back to the system
/// default (with a warning) when the configured device is unavailable.
fn select_device_and_config(
    kind: AudioDeviceKind,
    config: &Config,
) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
    let host = cpal::default_host();
    let requested = kind.configured_name(config);
    // First preference: the device named in the user's config, if any.
    let mut device = requested.and_then(|name| find_device_by_name(&host, kind, name));
    // Second preference: the system default. Warn only when a configured
    // name had to be ignored AND a default actually exists to fall back to.
    if device.is_none() {
        device = default_device(&host, kind);
        if let Some(name) = requested
            && device.is_some()
        {
            warn!(
                "configured {} audio device `{name}` was unavailable; falling back to system default",
                kind.noun()
            );
        }
    }
    let device = device.ok_or_else(|| missing_device_error(kind, requested))?;
    let stream_config = default_config(&device, kind)?;
    Ok((device, stream_config))
}
fn find_device_by_name(
host: &cpal::Host,
kind: AudioDeviceKind,
name: &str,
) -> Option<cpal::Device> {
let devices = devices(host, kind).ok()?;
devices
.into_iter()
.find(|device| device.name().ok().as_deref() == Some(name))
}
/// Enumerates all devices of the given direction on `host`.
///
/// The two match arms differ only in which cpal enumerator they call; the
/// error message is built once from `kind.noun()` (byte-identical to the
/// previous per-arm strings).
fn devices(host: &cpal::Host, kind: AudioDeviceKind) -> Result<Vec<cpal::Device>, String> {
    let result = match kind {
        AudioDeviceKind::Input => host.input_devices().map(|devices| devices.collect()),
        AudioDeviceKind::Output => host.output_devices().map(|devices| devices.collect()),
    };
    result.map_err(|err| format!("failed to enumerate {} audio devices: {err}", kind.noun()))
}
/// Returns the host's system-default device for the given direction, if any.
fn default_device(host: &cpal::Host, kind: AudioDeviceKind) -> Option<cpal::Device> {
    match kind {
        AudioDeviceKind::Input => host.default_input_device(),
        AudioDeviceKind::Output => host.default_output_device(),
    }
}
/// Returns the device's default stream config for the given direction.
///
/// Deduplicated: the error message is built once from `kind.noun()` and is
/// byte-identical to the previous per-arm strings.
fn default_config(
    device: &cpal::Device,
    kind: AudioDeviceKind,
) -> Result<cpal::SupportedStreamConfig, String> {
    let config = match kind {
        AudioDeviceKind::Input => device.default_input_config(),
        AudioDeviceKind::Output => device.default_output_config(),
    };
    config.map_err(|err| format!("failed to get default {} config: {err}", kind.noun()))
}
fn missing_device_error(kind: AudioDeviceKind, configured_name: Option<&str>) -> String {
match (kind, configured_name) {
(AudioDeviceKind::Input, Some(name)) => format!(
"configured input audio device `{name}` was unavailable and no default input audio device was found"
),
(AudioDeviceKind::Output, Some(name)) => format!(
"configured output audio device `{name}` was unavailable and no default output audio device was found"
),
(AudioDeviceKind::Input, None) => "no input audio device available".to_string(),
(AudioDeviceKind::Output, None) => "no output audio device available".to_string(),
}
}

View file

@ -207,7 +207,7 @@ impl ChatWidget {
{
if self.realtime_conversation.audio_player.is_none() {
self.realtime_conversation.audio_player =
crate::voice::RealtimeAudioPlayer::start().ok();
crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
}
if let Some(player) = &self.realtime_conversation.audio_player
&& let Err(err) = player.enqueue_frame(frame)
@ -231,7 +231,10 @@ impl ChatWidget {
self.realtime_conversation.meter_placeholder_id = Some(placeholder_id.clone());
self.request_redraw();
let capture = match crate::voice::VoiceCapture::start_realtime(self.app_event_tx.clone()) {
let capture = match crate::voice::VoiceCapture::start_realtime(
&self.config,
self.app_event_tx.clone(),
) {
Ok(capture) => capture,
Err(err) => {
self.remove_transcription_placeholder(&placeholder_id);
@ -250,7 +253,7 @@ impl ChatWidget {
self.realtime_conversation.capture = Some(capture);
if self.realtime_conversation.audio_player.is_none() {
self.realtime_conversation.audio_player =
crate::voice::RealtimeAudioPlayer::start().ok();
crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
}
std::thread::spawn(move || {

View file

@ -63,6 +63,8 @@ mod app_backtrack;
mod app_event;
mod app_event_sender;
mod ascii_animation;
#[cfg(all(not(target_os = "linux"), feature = "voice-input"))]
mod audio_device;
mod bottom_pane;
mod chatwidget;
mod cli;
@ -123,6 +125,7 @@ mod voice;
mod voice {
use crate::app_event::AppEvent;
use crate::app_event_sender::AppEventSender;
use codex_core::config::Config;
use codex_protocol::protocol::RealtimeAudioFrame;
use std::sync::Arc;
use std::sync::Mutex;
@ -146,7 +149,7 @@ mod voice {
Err("voice input is unavailable in this build".to_string())
}
pub fn start_realtime(_tx: AppEventSender) -> Result<Self, String> {
/// Stub for builds without voice support: realtime capture cannot start,
/// so this always returns an error. The `config`/`tx` parameters exist only
/// to mirror the real implementation's signature.
pub fn start_realtime(_config: &Config, _tx: AppEventSender) -> Result<Self, String> {
    Err("voice input is unavailable in this build".to_string())
}
@ -186,7 +189,7 @@ mod voice {
}
impl RealtimeAudioPlayer {
pub(crate) fn start() -> Result<Self, String> {
/// Stub for builds without voice support: audio playback cannot start,
/// so this always returns an error.
pub(crate) fn start(_config: &Config) -> Result<Self, String> {
    Err("voice output is unavailable in this build".to_string())
}

View file

@ -51,7 +51,7 @@ pub struct VoiceCapture {
impl VoiceCapture {
pub fn start() -> Result<Self, String> {
let (device, config) = select_input_device_and_config()?;
let (device, config) = select_default_input_device_and_config()?;
let sample_rate = config.sample_rate().0;
let channels = config.channels();
@ -74,8 +74,8 @@ impl VoiceCapture {
})
}
pub fn start_realtime(tx: AppEventSender) -> Result<Self, String> {
let (device, config) = select_input_device_and_config()?;
pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result<Self, String> {
let (device, config) = select_realtime_input_device_and_config(config)?;
let sample_rate = config.sample_rate().0;
let channels = config.channels();
@ -262,7 +262,8 @@ pub fn transcribe_async(
// Voice input helpers
// -------------------------
fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
fn select_default_input_device_and_config()
-> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
let host = cpal::default_host();
let device = host
.default_input_device()
@ -273,6 +274,12 @@ fn select_input_device_and_config() -> Result<(cpal::Device, cpal::SupportedStre
Ok((device, config))
}
/// Resolves the input device for realtime capture, honoring the
/// `audio.microphone` preference in `config` (falling back to the system
/// default). Thin wrapper over the shared audio-device selection helper.
fn select_realtime_input_device_and_config(
    config: &Config,
) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
    crate::audio_device::select_configured_input_device_and_config(config)
}
fn build_input_stream(
device: &cpal::Device,
config: &cpal::SupportedStreamConfig,
@ -466,14 +473,9 @@ pub(crate) struct RealtimeAudioPlayer {
}
impl RealtimeAudioPlayer {
pub(crate) fn start() -> Result<Self, String> {
let host = cpal::default_host();
let device = host
.default_output_device()
.ok_or_else(|| "no output audio device available".to_string())?;
let config = device
.default_output_config()
.map_err(|e| format!("failed to get default output config: {e}"))?;
pub(crate) fn start(config: &Config) -> Result<Self, String> {
let (device, config) =
crate::audio_device::select_configured_output_device_and_config(config)?;
let output_sample_rate = config.sample_rate().0;
let output_channels = config.channels();
let queue = Arc::new(Mutex::new(VecDeque::new()));