Align TUI voice transcription audio with 4o ASR (#13030)

## Summary

- switch TUI push-to-talk transcription requests to `gpt-4o-mini-transcribe`
- prefer 24 kHz mono `i16` microphone configs and normalize voice input to 24 kHz mono before upload/send
- add unit coverage for the new downmix/resample path

## Testing

- `just fmt`
- `cargo test -p codex-tui`
2026-02-27 10:22:48 -08:00 · 2026-02-27 10:22:48 -08:00 · bc0a5843df
commit bc0a5843df
parent 3b5996f988
2 changed files with 132 additions and 13 deletions
--- a/codex-rs/tui/src/audio_device.rs
+++ b/codex-rs/tui/src/audio_device.rs
@@ -5,6 +5,9 @@ use tracing::warn;

 use crate::app_event::RealtimeAudioDeviceKind;

+const PREFERRED_INPUT_SAMPLE_RATE: u32 = 24_000;
+const PREFERRED_INPUT_CHANNELS: u16 = 1;
+
 pub(crate) fn list_realtime_audio_device_names(
    kind: RealtimeAudioDeviceKind,
 ) -> Result<Vec<String>, String> {
@@ -33,6 +36,35 @@ pub(crate) fn select_configured_output_device_and_config(
    select_device_and_config(RealtimeAudioDeviceKind::Speaker, config)
 }

+pub(crate) fn preferred_input_config(
+    device: &cpal::Device,
+) -> Result<cpal::SupportedStreamConfig, String> {
+    let supported_configs = device
+        .supported_input_configs()
+        .map_err(|err| format!("failed to enumerate input audio configs: {err}"))?;
+
+    supported_configs
+        .filter_map(|range| {
+            let sample_format_rank = match range.sample_format() {
+                cpal::SampleFormat::I16 => 0u8,
+                cpal::SampleFormat::U16 => 1u8,
+                cpal::SampleFormat::F32 => 2u8,
+                _ => return None,
+            };
+            let sample_rate = preferred_input_sample_rate(&range);
+            let sample_rate_penalty = sample_rate.0.abs_diff(PREFERRED_INPUT_SAMPLE_RATE);
+            let channel_penalty = range.channels().abs_diff(PREFERRED_INPUT_CHANNELS);
+            Some((
+                (sample_rate_penalty, channel_penalty, sample_format_rank),
+                range.with_sample_rate(sample_rate),
+            ))
+        })
+        .min_by_key(|(score, _)| *score)
+        .map(|(_, config)| config)
+        .or_else(|| device.default_input_config().ok())
+        .ok_or_else(|| "failed to get default input config".to_string())
+}
+
 fn select_device_and_config(
    kind: RealtimeAudioDeviceKind,
    config: &Config,
@@ -53,7 +85,10 @@ fn select_device_and_config(
        })
        .ok_or_else(|| missing_device_error(kind, configured_name))?;

-    let stream_config = default_config(&selected, kind)?;
+    let stream_config = match kind {
+        RealtimeAudioDeviceKind::Microphone => preferred_input_config(&selected)?,
+        RealtimeAudioDeviceKind::Speaker => default_config(&selected, kind)?,
+    };
    Ok((selected, stream_config))
 }

@@ -109,6 +144,18 @@ fn default_config(
    }
 }

+fn preferred_input_sample_rate(range: &cpal::SupportedStreamConfigRange) -> cpal::SampleRate {
+    let min = range.min_sample_rate().0;
+    let max = range.max_sample_rate().0;
+    if (min..=max).contains(&PREFERRED_INPUT_SAMPLE_RATE) {
+        cpal::SampleRate(PREFERRED_INPUT_SAMPLE_RATE)
+    } else if PREFERRED_INPUT_SAMPLE_RATE < min {
+        cpal::SampleRate(min)
+    } else {
+        cpal::SampleRate(max)
+    }
+}
+
 fn missing_device_error(kind: RealtimeAudioDeviceKind, configured_name: Option<&str>) -> String {
    match (kind, configured_name) {
        (RealtimeAudioDeviceKind::Microphone, Some(name)) => {
--- a/codex-rs/tui/src/voice.rs
+++ b/codex-rs/tui/src/voice.rs
@@ -27,6 +27,10 @@ use tracing::error;
 use tracing::info;
 use tracing::trace;

+const AUDIO_MODEL: &str = "gpt-4o-mini-transcribe";
+const MODEL_AUDIO_SAMPLE_RATE: u32 = 24_000;
+const MODEL_AUDIO_CHANNELS: u16 = 1;
+
 struct TranscriptionAuthContext {
    mode: AuthMode,
    bearer_token: String,
@@ -268,9 +272,7 @@ fn select_default_input_device_and_config()
    let device = host
        .default_input_device()
        .ok_or_else(|| "no input audio device available".to_string())?;
-    let config = device
-        .default_input_config()
-        .map_err(|e| format!("failed to get default input config: {e}"))?;
+    let config = crate::audio_device::preferred_input_config(&device)?;
    Ok((device, config))
 }

@@ -395,20 +397,35 @@ fn send_realtime_audio_chunk(
        return;
    }

+    let samples = if sample_rate == MODEL_AUDIO_SAMPLE_RATE && channels == MODEL_AUDIO_CHANNELS {
+        samples
+    } else {
+        convert_pcm16(
+            &samples,
+            sample_rate,
+            channels,
+            MODEL_AUDIO_SAMPLE_RATE,
+            MODEL_AUDIO_CHANNELS,
+        )
+    };
+    if samples.is_empty() {
+        return;
+    }
+
    let mut bytes = Vec::with_capacity(samples.len() * 2);
    for sample in &samples {
        bytes.extend_from_slice(&sample.to_le_bytes());
    }

    let encoded = base64::engine::general_purpose::STANDARD.encode(bytes);
-    let samples_per_channel = (samples.len() / usize::from(channels)) as u32;
+    let samples_per_channel = (samples.len() / usize::from(MODEL_AUDIO_CHANNELS)) as u32;

    tx.send(AppEvent::CodexOp(Op::RealtimeConversationAudio(
        ConversationAudioParams {
            frame: RealtimeAudioFrame {
                data: encoded,
-                sample_rate,
-                num_channels: channels,
+                sample_rate: MODEL_AUDIO_SAMPLE_RATE,
+                num_channels: MODEL_AUDIO_CHANNELS,
                samples_per_channel: Some(samples_per_channel),
            },
        },
@@ -505,7 +522,7 @@ impl RealtimeAudioPlayer {
        for pair in raw_bytes.chunks_exact(2) {
            pcm.push(i16::from_le_bytes([pair[0], pair[1]]));
        }
-        let converted = convert_pcm16_for_output(
+        let converted = convert_pcm16(
            &pcm,
            frame.sample_rate,
            frame.num_channels,
@@ -598,7 +615,7 @@ fn fill_output_u16(output: &mut [u16], queue: &Arc<Mutex<VecDeque<i16>>>) {
    output.fill(32768);
 }

-fn convert_pcm16_for_output(
+fn convert_pcm16(
    input: &[i16],
    input_sample_rate: u32,
    input_channels: u16,
@@ -672,10 +689,29 @@ fn clip_duration_seconds(audio: &RecordedAudio) -> f32 {
 }

 fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
+    let converted;
+    let (channels, sample_rate, segment) =
+        if audio.channels == MODEL_AUDIO_CHANNELS && audio.sample_rate == MODEL_AUDIO_SAMPLE_RATE {
+            (audio.channels, audio.sample_rate, audio.data.as_slice())
+        } else {
+            converted = convert_pcm16(
+                &audio.data,
+                audio.sample_rate,
+                audio.channels,
+                MODEL_AUDIO_SAMPLE_RATE,
+                MODEL_AUDIO_CHANNELS,
+            );
+            (
+                MODEL_AUDIO_CHANNELS,
+                MODEL_AUDIO_SAMPLE_RATE,
+                converted.as_slice(),
+            )
+        };
+
    let mut wav_bytes: Vec<u8> = Vec::new();
    let spec = WavSpec {
-        channels: audio.channels,
-        sample_rate: audio.sample_rate,
+        channels,
+        sample_rate,
        bits_per_sample: 16,
        sample_format: SampleFormat::Int,
    };
@@ -684,7 +720,6 @@ fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
        WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?;

    // Simple peak normalization with headroom to improve audibility on quiet inputs.
-    let segment = &audio.data[..];
    let mut peak: i16 = 0;
    for &s in segment {
        let a = s.unsigned_abs();
@@ -782,7 +817,7 @@ async fn transcribe_bytes(
                .mime_str("audio/wav")
                .map_err(|e| format!("failed to set mime: {e}"))?;
            let mut form = reqwest::multipart::Form::new()
-                .text("model", "gpt-4o-transcribe")
+                .text("model", AUDIO_MODEL)
                .part("file", part);
            if let Some(context) = context {
                form = form.text("prompt", context);
@@ -834,3 +869,40 @@ async fn transcribe_bytes(
        Ok(text)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::RecordedAudio;
+    use super::convert_pcm16;
+    use super::encode_wav_normalized;
+    use pretty_assertions::assert_eq;
+    use std::io::Cursor;
+
+    #[test]
+    fn convert_pcm16_downmixes_and_resamples_for_model_input() {
+        let input = vec![100, 300, 200, 400, 500, 700, 600, 800];
+        let converted = convert_pcm16(&input, 48_000, 2, 24_000, 1);
+        assert_eq!(converted, vec![200, 700]);
+    }
+
+    #[test]
+    fn encode_wav_normalized_outputs_24khz_mono_audio() {
+        let audio = RecordedAudio {
+            data: vec![100, 300, 200, 400, 500, 700, 600, 800],
+            sample_rate: 48_000,
+            channels: 2,
+        };
+
+        let wav = encode_wav_normalized(&audio).expect("wav should encode");
+        let reader = hound::WavReader::new(Cursor::new(wav)).expect("wav should parse");
+        let spec = reader.spec();
+        let samples = reader
+            .into_samples::<i16>()
+            .collect::<Result<Vec<_>, _>>()
+            .expect("samples should decode");
+
+        assert_eq!(spec.channels, 1);
+        assert_eq!(spec.sample_rate, 24_000);
+        assert_eq!(samples, vec![8_426, 29_490]);
+    }
+}