Align TUI voice transcription audio with 4o ASR (#13030)
## Summary - switch TUI push-to-talk transcription requests to `gpt-4o-mini-transcribe` - prefer 24 kHz mono `i16` microphone configs and normalize voice input to 24 kHz mono before upload/send - add unit coverage for the new downmix/resample path ## Testing - `just fmt` - `cargo test -p codex-tui`
This commit is contained in:
parent
3b5996f988
commit
bc0a5843df
2 changed files with 132 additions and 13 deletions
|
|
@ -5,6 +5,9 @@ use tracing::warn;
|
|||
|
||||
use crate::app_event::RealtimeAudioDeviceKind;
|
||||
|
||||
const PREFERRED_INPUT_SAMPLE_RATE: u32 = 24_000;
|
||||
const PREFERRED_INPUT_CHANNELS: u16 = 1;
|
||||
|
||||
pub(crate) fn list_realtime_audio_device_names(
|
||||
kind: RealtimeAudioDeviceKind,
|
||||
) -> Result<Vec<String>, String> {
|
||||
|
|
@ -33,6 +36,35 @@ pub(crate) fn select_configured_output_device_and_config(
|
|||
select_device_and_config(RealtimeAudioDeviceKind::Speaker, config)
|
||||
}
|
||||
|
||||
pub(crate) fn preferred_input_config(
|
||||
device: &cpal::Device,
|
||||
) -> Result<cpal::SupportedStreamConfig, String> {
|
||||
let supported_configs = device
|
||||
.supported_input_configs()
|
||||
.map_err(|err| format!("failed to enumerate input audio configs: {err}"))?;
|
||||
|
||||
supported_configs
|
||||
.filter_map(|range| {
|
||||
let sample_format_rank = match range.sample_format() {
|
||||
cpal::SampleFormat::I16 => 0u8,
|
||||
cpal::SampleFormat::U16 => 1u8,
|
||||
cpal::SampleFormat::F32 => 2u8,
|
||||
_ => return None,
|
||||
};
|
||||
let sample_rate = preferred_input_sample_rate(&range);
|
||||
let sample_rate_penalty = sample_rate.0.abs_diff(PREFERRED_INPUT_SAMPLE_RATE);
|
||||
let channel_penalty = range.channels().abs_diff(PREFERRED_INPUT_CHANNELS);
|
||||
Some((
|
||||
(sample_rate_penalty, channel_penalty, sample_format_rank),
|
||||
range.with_sample_rate(sample_rate),
|
||||
))
|
||||
})
|
||||
.min_by_key(|(score, _)| *score)
|
||||
.map(|(_, config)| config)
|
||||
.or_else(|| device.default_input_config().ok())
|
||||
.ok_or_else(|| "failed to get default input config".to_string())
|
||||
}
|
||||
|
||||
fn select_device_and_config(
|
||||
kind: RealtimeAudioDeviceKind,
|
||||
config: &Config,
|
||||
|
|
@ -53,7 +85,10 @@ fn select_device_and_config(
|
|||
})
|
||||
.ok_or_else(|| missing_device_error(kind, configured_name))?;
|
||||
|
||||
let stream_config = default_config(&selected, kind)?;
|
||||
let stream_config = match kind {
|
||||
RealtimeAudioDeviceKind::Microphone => preferred_input_config(&selected)?,
|
||||
RealtimeAudioDeviceKind::Speaker => default_config(&selected, kind)?,
|
||||
};
|
||||
Ok((selected, stream_config))
|
||||
}
|
||||
|
||||
|
|
@ -109,6 +144,18 @@ fn default_config(
|
|||
}
|
||||
}
|
||||
|
||||
fn preferred_input_sample_rate(range: &cpal::SupportedStreamConfigRange) -> cpal::SampleRate {
|
||||
let min = range.min_sample_rate().0;
|
||||
let max = range.max_sample_rate().0;
|
||||
if (min..=max).contains(&PREFERRED_INPUT_SAMPLE_RATE) {
|
||||
cpal::SampleRate(PREFERRED_INPUT_SAMPLE_RATE)
|
||||
} else if PREFERRED_INPUT_SAMPLE_RATE < min {
|
||||
cpal::SampleRate(min)
|
||||
} else {
|
||||
cpal::SampleRate(max)
|
||||
}
|
||||
}
|
||||
|
||||
fn missing_device_error(kind: RealtimeAudioDeviceKind, configured_name: Option<&str>) -> String {
|
||||
match (kind, configured_name) {
|
||||
(RealtimeAudioDeviceKind::Microphone, Some(name)) => {
|
||||
|
|
|
|||
|
|
@ -27,6 +27,10 @@ use tracing::error;
|
|||
use tracing::info;
|
||||
use tracing::trace;
|
||||
|
||||
const AUDIO_MODEL: &str = "gpt-4o-mini-transcribe";
|
||||
const MODEL_AUDIO_SAMPLE_RATE: u32 = 24_000;
|
||||
const MODEL_AUDIO_CHANNELS: u16 = 1;
|
||||
|
||||
struct TranscriptionAuthContext {
|
||||
mode: AuthMode,
|
||||
bearer_token: String,
|
||||
|
|
@ -268,9 +272,7 @@ fn select_default_input_device_and_config()
|
|||
let device = host
|
||||
.default_input_device()
|
||||
.ok_or_else(|| "no input audio device available".to_string())?;
|
||||
let config = device
|
||||
.default_input_config()
|
||||
.map_err(|e| format!("failed to get default input config: {e}"))?;
|
||||
let config = crate::audio_device::preferred_input_config(&device)?;
|
||||
Ok((device, config))
|
||||
}
|
||||
|
||||
|
|
@ -395,20 +397,35 @@ fn send_realtime_audio_chunk(
|
|||
return;
|
||||
}
|
||||
|
||||
let samples = if sample_rate == MODEL_AUDIO_SAMPLE_RATE && channels == MODEL_AUDIO_CHANNELS {
|
||||
samples
|
||||
} else {
|
||||
convert_pcm16(
|
||||
&samples,
|
||||
sample_rate,
|
||||
channels,
|
||||
MODEL_AUDIO_SAMPLE_RATE,
|
||||
MODEL_AUDIO_CHANNELS,
|
||||
)
|
||||
};
|
||||
if samples.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut bytes = Vec::with_capacity(samples.len() * 2);
|
||||
for sample in &samples {
|
||||
bytes.extend_from_slice(&sample.to_le_bytes());
|
||||
}
|
||||
|
||||
let encoded = base64::engine::general_purpose::STANDARD.encode(bytes);
|
||||
let samples_per_channel = (samples.len() / usize::from(channels)) as u32;
|
||||
let samples_per_channel = (samples.len() / usize::from(MODEL_AUDIO_CHANNELS)) as u32;
|
||||
|
||||
tx.send(AppEvent::CodexOp(Op::RealtimeConversationAudio(
|
||||
ConversationAudioParams {
|
||||
frame: RealtimeAudioFrame {
|
||||
data: encoded,
|
||||
sample_rate,
|
||||
num_channels: channels,
|
||||
sample_rate: MODEL_AUDIO_SAMPLE_RATE,
|
||||
num_channels: MODEL_AUDIO_CHANNELS,
|
||||
samples_per_channel: Some(samples_per_channel),
|
||||
},
|
||||
},
|
||||
|
|
@ -505,7 +522,7 @@ impl RealtimeAudioPlayer {
|
|||
for pair in raw_bytes.chunks_exact(2) {
|
||||
pcm.push(i16::from_le_bytes([pair[0], pair[1]]));
|
||||
}
|
||||
let converted = convert_pcm16_for_output(
|
||||
let converted = convert_pcm16(
|
||||
&pcm,
|
||||
frame.sample_rate,
|
||||
frame.num_channels,
|
||||
|
|
@ -598,7 +615,7 @@ fn fill_output_u16(output: &mut [u16], queue: &Arc<Mutex<VecDeque<i16>>>) {
|
|||
output.fill(32768);
|
||||
}
|
||||
|
||||
fn convert_pcm16_for_output(
|
||||
fn convert_pcm16(
|
||||
input: &[i16],
|
||||
input_sample_rate: u32,
|
||||
input_channels: u16,
|
||||
|
|
@ -672,10 +689,29 @@ fn clip_duration_seconds(audio: &RecordedAudio) -> f32 {
|
|||
}
|
||||
|
||||
fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
|
||||
let converted;
|
||||
let (channels, sample_rate, segment) =
|
||||
if audio.channels == MODEL_AUDIO_CHANNELS && audio.sample_rate == MODEL_AUDIO_SAMPLE_RATE {
|
||||
(audio.channels, audio.sample_rate, audio.data.as_slice())
|
||||
} else {
|
||||
converted = convert_pcm16(
|
||||
&audio.data,
|
||||
audio.sample_rate,
|
||||
audio.channels,
|
||||
MODEL_AUDIO_SAMPLE_RATE,
|
||||
MODEL_AUDIO_CHANNELS,
|
||||
);
|
||||
(
|
||||
MODEL_AUDIO_CHANNELS,
|
||||
MODEL_AUDIO_SAMPLE_RATE,
|
||||
converted.as_slice(),
|
||||
)
|
||||
};
|
||||
|
||||
let mut wav_bytes: Vec<u8> = Vec::new();
|
||||
let spec = WavSpec {
|
||||
channels: audio.channels,
|
||||
sample_rate: audio.sample_rate,
|
||||
channels,
|
||||
sample_rate,
|
||||
bits_per_sample: 16,
|
||||
sample_format: SampleFormat::Int,
|
||||
};
|
||||
|
|
@ -684,7 +720,6 @@ fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
|
|||
WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?;
|
||||
|
||||
// Simple peak normalization with headroom to improve audibility on quiet inputs.
|
||||
let segment = &audio.data[..];
|
||||
let mut peak: i16 = 0;
|
||||
for &s in segment {
|
||||
let a = s.unsigned_abs();
|
||||
|
|
@ -782,7 +817,7 @@ async fn transcribe_bytes(
|
|||
.mime_str("audio/wav")
|
||||
.map_err(|e| format!("failed to set mime: {e}"))?;
|
||||
let mut form = reqwest::multipart::Form::new()
|
||||
.text("model", "gpt-4o-transcribe")
|
||||
.text("model", AUDIO_MODEL)
|
||||
.part("file", part);
|
||||
if let Some(context) = context {
|
||||
form = form.text("prompt", context);
|
||||
|
|
@ -834,3 +869,40 @@ async fn transcribe_bytes(
|
|||
Ok(text)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RecordedAudio;
|
||||
use super::convert_pcm16;
|
||||
use super::encode_wav_normalized;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::io::Cursor;
|
||||
|
||||
#[test]
|
||||
fn convert_pcm16_downmixes_and_resamples_for_model_input() {
|
||||
let input = vec![100, 300, 200, 400, 500, 700, 600, 800];
|
||||
let converted = convert_pcm16(&input, 48_000, 2, 24_000, 1);
|
||||
assert_eq!(converted, vec![200, 700]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn encode_wav_normalized_outputs_24khz_mono_audio() {
|
||||
let audio = RecordedAudio {
|
||||
data: vec![100, 300, 200, 400, 500, 700, 600, 800],
|
||||
sample_rate: 48_000,
|
||||
channels: 2,
|
||||
};
|
||||
|
||||
let wav = encode_wav_normalized(&audio).expect("wav should encode");
|
||||
let reader = hound::WavReader::new(Cursor::new(wav)).expect("wav should parse");
|
||||
let spec = reader.spec();
|
||||
let samples = reader
|
||||
.into_samples::<i16>()
|
||||
.collect::<Result<Vec<_>, _>>()
|
||||
.expect("samples should decode");
|
||||
|
||||
assert_eq!(spec.channels, 1);
|
||||
assert_eq!(spec.sample_rate, 24_000);
|
||||
assert_eq!(samples, vec![8_426, 29_490]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue