core-agent-ide/codex-rs/tui/src/chatwidget/realtime.rs
Charley Cunningham 299b8ac445
tui: align pending steers with core acceptance (#12868)
## Summary
- submit `Enter` steers immediately while a turn is already running
instead of routing them through `queued_user_messages`
- keep those submitted steers visible in the footer as `pending_steers`
until core records them as a user message or aborts the turn
- reconcile pending steers on `ItemCompleted(UserMessage)`, not
`RawResponseItem`
- emit user-message item lifecycle for leftover pending input at task
finish, then remove the TUI `TurnComplete` fallback
- keep `queued_user_messages` for actual queued drafts, rendered below
pending steers

## Problem
While the assistant was generating, pressing `Enter` could send the
input into `queued_user_messages`. That queue only drains after the turn
ends, so ordinary steers behaved like queued drafts instead of landing
at the next core sampling boundary.

The first version of this fix also used `RawResponseItem` to decide when
a steer had landed. Review feedback was that this is the wrong
abstraction for client behavior.

There was also a late edge case in core: if pending steer input was
accepted after the final sampling decision but before `TurnComplete`,
core would record that user message into history at task finish without
emitting `ItemStarted(UserMessage)` / `ItemCompleted(UserMessage)`. TUI
had a fallback to paper over that gap locally.

## Approach
- `Enter` during an active turn now submits a normal `Op::UserTurn`
immediately
- TUI keeps a local pending-steer preview instead of rendering that user
message into history immediately
- when core records the steer as `ItemCompleted(UserMessage)`, TUI
matches and removes the corresponding pending preview, then renders the
committed user message
- core now emits the same user-message lifecycle when
`on_task_finished(...)` drains leftover pending user input, before
`TurnComplete`
- with that lifecycle gap closed in core, TUI no longer needs to flush
pending steers into history on `TurnComplete`
- if the turn is interrupted, pending steers and queued drafts are both
restored into the composer, with pending steers first

## Notes
- `Tab` still uses the real queued-message path
- `queued_user_messages` and `pending_steers` are separate state with
separate semantics
- the pending-steer matching key is built directly from `UserInput`
- this removes the new TUI dependency on `RawResponseItem`

## Validation
- `just fmt`
- `cargo test -p codex-core
task_finish_emits_turn_item_lifecycle_for_leftover_pending_user_input --
--nocapture`
- `cargo test -p codex-tui`
2026-03-03 15:31:52 -08:00

425 lines
15 KiB
Rust

use super::*;
use codex_protocol::protocol::ConversationStartParams;
use codex_protocol::protocol::RealtimeAudioFrame;
use codex_protocol::protocol::RealtimeConversationClosedEvent;
use codex_protocol::protocol::RealtimeConversationRealtimeEvent;
use codex_protocol::protocol::RealtimeConversationStartedEvent;
use codex_protocol::protocol::RealtimeEvent;
/// System prompt submitted to core when opening a realtime voice conversation.
const REALTIME_CONVERSATION_PROMPT: &str = "You are in a realtime voice conversation in the Codex TUI. Respond conversationally and concisely.";
/// Lifecycle of the realtime voice conversation as tracked by the TUI.
///
/// Observed transitions in this file: `Inactive` -> `Starting` (start
/// submitted) -> `Active` (core confirms) -> `Stopping` (close submitted)
/// -> `Inactive` (state reset on close/error).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub(super) enum RealtimeConversationPhase {
    /// No realtime conversation in flight; the default state.
    #[default]
    Inactive,
    /// `Op::RealtimeConversationStart` submitted; waiting for core to confirm.
    Starting,
    /// Core confirmed the session; local audio capture/playback may be running.
    Active,
    /// `Op::RealtimeConversationClose` submitted; waiting for the closed event.
    Stopping,
}
/// TUI-side state for a realtime voice conversation. Audio-related fields are
/// compiled out on Linux, where local capture/playback is not available.
#[derive(Default)]
pub(super) struct RealtimeConversationUiState {
    /// Current lifecycle phase; see [`RealtimeConversationPhase`].
    phase: RealtimeConversationPhase,
    /// True when the user explicitly requested the close; suppresses the
    /// "closed: <reason>" info message when the closed event arrives.
    requested_close: bool,
    /// Session id reported by core, if any has been received yet.
    session_id: Option<String>,
    /// Ensures the "audio-only" warning is shown at most once per session.
    warned_audio_only_submission: bool,
    /// Id of the transcription placeholder cell showing the mic level meter.
    meter_placeholder_id: Option<String>,
    /// Shared flag used to tell the capture meter thread to stop.
    #[cfg(not(target_os = "linux"))]
    capture_stop_flag: Option<Arc<AtomicBool>>,
    /// Active microphone capture handle, if running.
    #[cfg(not(target_os = "linux"))]
    capture: Option<crate::voice::VoiceCapture>,
    /// Speaker output for assistant audio frames, created lazily.
    #[cfg(not(target_os = "linux"))]
    audio_player: Option<crate::voice::RealtimeAudioPlayer>,
}
impl RealtimeConversationUiState {
    /// True while a realtime conversation is anywhere in flight — starting
    /// up, active, or shutting down. Equivalently: any phase other than
    /// `Inactive`.
    pub(super) fn is_live(&self) -> bool {
        !matches!(self.phase, RealtimeConversationPhase::Inactive)
    }

    /// True only once core has confirmed the session is active.
    pub(super) fn is_active(&self) -> bool {
        self.phase == RealtimeConversationPhase::Active
    }
}
/// Snapshot of a rendered user message, compared against
/// `last_rendered_user_message_event` to avoid rendering the same realtime
/// user message twice.
#[derive(Clone, Debug, PartialEq)]
pub(super) struct RenderedUserMessageEvent {
    /// Flattened message text.
    pub(super) message: String,
    /// URLs of remote images attached to the message.
    pub(super) remote_image_urls: Vec<String>,
    /// Paths of local images attached to the message.
    pub(super) local_images: Vec<PathBuf>,
    /// Text element ranges/placeholders within `message`.
    pub(super) text_elements: Vec<TextElement>,
}
/// Key used to match a locally pending steer against the committed
/// `ItemCompleted(UserMessage)` from core. Deliberately coarse: flattened
/// text plus a total image count, since core does not preserve UI-only text
/// ranges or local image paths.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(super) struct PendingSteerCompareKey {
    /// Concatenated text of all `UserInput::Text` items.
    pub(super) message: String,
    /// Total number of image items (remote and local combined).
    pub(super) image_count: usize,
}
impl ChatWidget {
    /// Assemble a [`RenderedUserMessageEvent`] from already-extracted parts.
    /// Note the parameter order differs from the struct's field order.
    pub(super) fn rendered_user_message_event_from_parts(
        message: String,
        text_elements: Vec<TextElement>,
        local_images: Vec<PathBuf>,
        remote_image_urls: Vec<String>,
    ) -> RenderedUserMessageEvent {
        RenderedUserMessageEvent {
            message,
            remote_image_urls,
            local_images,
            text_elements,
        }
    }

    /// Build the render-dedup key for a `UserMessageEvent`, treating a
    /// missing `images` list as empty.
    pub(super) fn rendered_user_message_event_from_event(
        event: &UserMessageEvent,
    ) -> RenderedUserMessageEvent {
        Self::rendered_user_message_event_from_parts(
            event.message.clone(),
            event.text_elements.clone(),
            event.local_images.clone(),
            event.images.clone().unwrap_or_default(),
        )
    }

    /// Build the compare key for a submitted pending steer without invoking the
    /// expensive request-serialization path. Pending steers only need to match the
    /// committed `ItemCompleted(UserMessage)` emitted after core drains input, which
    /// preserves flattened text and total image count but not UI-only text ranges or
    /// local image paths.
    pub(super) fn pending_steer_compare_key_from_items(
        items: &[UserInput],
    ) -> PendingSteerCompareKey {
        let mut message = String::new();
        let mut image_count = 0;
        for item in items {
            match item {
                // Text contributes only its flattened content; element ranges
                // are UI-only and intentionally excluded from the key.
                UserInput::Text { text, .. } => message.push_str(text),
                // Remote and local images count toward a single total.
                UserInput::Image { .. } | UserInput::LocalImage { .. } => image_count += 1,
                // Skills and mentions are deliberately excluded from the key.
                UserInput::Skill { .. } | UserInput::Mention { .. } => {}
                // Any other (e.g. future) variants are ignored as well.
                _ => {}
            }
        }
        PendingSteerCompareKey {
            message,
            image_count,
        }
    }

    /// Compare key for a committed `UserMessageItem`, built with the same
    /// rules as [`Self::pending_steer_compare_key_from_items`].
    pub(super) fn pending_steer_compare_key_from_item(
        item: &codex_protocol::items::UserMessageItem,
    ) -> PendingSteerCompareKey {
        Self::pending_steer_compare_key_from_items(&item.content)
    }

    /// Test helper: render a slice of `UserInput`s into a
    /// [`RenderedUserMessageEvent`], rebasing each text element's byte range
    /// onto the concatenated message.
    #[cfg(test)]
    pub(super) fn rendered_user_message_event_from_inputs(
        items: &[UserInput],
    ) -> RenderedUserMessageEvent {
        let mut message = String::new();
        let mut remote_image_urls = Vec::new();
        let mut local_images = Vec::new();
        let mut text_elements = Vec::new();
        for item in items {
            match item {
                UserInput::Text {
                    text,
                    text_elements: current_text_elements,
                } => append_text_with_rebased_elements(
                    &mut message,
                    &mut text_elements,
                    text,
                    current_text_elements.iter().map(|element| {
                        TextElement::new(
                            element.byte_range,
                            element.placeholder(text).map(str::to_string),
                        )
                    }),
                ),
                UserInput::Image { image_url } => remote_image_urls.push(image_url.clone()),
                UserInput::LocalImage { path } => local_images.push(path.clone()),
                // Skills and mentions are not rendered; other variants ignored.
                UserInput::Skill { .. } | UserInput::Mention { .. } => {}
                _ => {}
            }
        }
        Self::rendered_user_message_event_from_parts(
            message,
            text_elements,
            local_images,
            remote_image_urls,
        )
    }

    /// Whether a realtime `UserMessageEvent` should be rendered: only while
    /// the conversation is live, and only if it differs from the last one we
    /// rendered (dedup against `last_rendered_user_message_event`).
    pub(super) fn should_render_realtime_user_message_event(
        &self,
        event: &UserMessageEvent,
    ) -> bool {
        if !self.realtime_conversation.is_live() {
            return false;
        }
        let key = Self::rendered_user_message_event_from_event(event);
        self.last_rendered_user_message_event.as_ref() != Some(&key)
    }

    /// Intercept a typed user message while realtime voice mode is live.
    ///
    /// Returns `Some(user_message)` to continue normal submission when no
    /// realtime conversation is in flight. Otherwise restores the message to
    /// the composer (realtime mode is audio-only), warns once per session,
    /// and returns `None` to swallow the submission.
    pub(super) fn maybe_defer_user_message_for_realtime(
        &mut self,
        user_message: UserMessage,
    ) -> Option<UserMessage> {
        if !self.realtime_conversation.is_live() {
            return Some(user_message);
        }
        self.restore_user_message_to_composer(user_message);
        if !self.realtime_conversation.warned_audio_only_submission {
            self.realtime_conversation.warned_audio_only_submission = true;
            self.add_info_message(
                "Realtime voice mode is audio-only. Use /realtime to stop.".to_string(),
                None,
            );
        } else {
            // Warning already shown this session; just repaint the composer.
            self.request_redraw();
        }
        None
    }

    /// Footer hint shown while realtime voice mode is live.
    fn realtime_footer_hint_items() -> Vec<(String, String)> {
        vec![("/realtime".to_string(), "stop live voice".to_string())]
    }

    /// Begin a realtime conversation: reset per-session state, switch the
    /// footer hint, and submit the start op to core. The phase stays
    /// `Starting` until [`Self::on_realtime_conversation_started`] fires.
    pub(super) fn start_realtime_conversation(&mut self) {
        self.realtime_conversation.phase = RealtimeConversationPhase::Starting;
        self.realtime_conversation.requested_close = false;
        self.realtime_conversation.session_id = None;
        self.realtime_conversation.warned_audio_only_submission = false;
        self.set_footer_hint_override(Some(Self::realtime_footer_hint_items()));
        self.submit_op(Op::RealtimeConversationStart(ConversationStartParams {
            prompt: REALTIME_CONVERSATION_PROMPT.to_string(),
            session_id: None,
        }));
        self.request_redraw();
    }

    /// Request that the realtime conversation be closed, optionally showing
    /// `info_message`. If nothing is live, only the message (if any) is shown.
    /// Local audio is stopped immediately; the phase moves to `Stopping`
    /// until core confirms the close.
    pub(super) fn request_realtime_conversation_close(&mut self, info_message: Option<String>) {
        if !self.realtime_conversation.is_live() {
            if let Some(message) = info_message {
                self.add_info_message(message, None);
            }
            return;
        }
        // Mark the close as user-requested so the closed event does not show
        // a redundant "closed: <reason>" message.
        self.realtime_conversation.requested_close = true;
        self.realtime_conversation.phase = RealtimeConversationPhase::Stopping;
        self.submit_op(Op::RealtimeConversationClose);
        self.stop_realtime_local_audio();
        self.set_footer_hint_override(None);
        if let Some(message) = info_message {
            self.add_info_message(message, None);
        } else {
            self.request_redraw();
        }
    }

    /// Tear down all local realtime state: stop audio, clear the footer hint,
    /// and return every per-session field to its default.
    pub(super) fn reset_realtime_conversation_state(&mut self) {
        self.stop_realtime_local_audio();
        self.set_footer_hint_override(None);
        self.realtime_conversation.phase = RealtimeConversationPhase::Inactive;
        self.realtime_conversation.requested_close = false;
        self.realtime_conversation.session_id = None;
        self.realtime_conversation.warned_audio_only_submission = false;
    }

    /// Handle core confirming the realtime session. If the feature has been
    /// disabled in the meantime, immediately close and reset; otherwise move
    /// to `Active`, record the session id, and start local audio.
    pub(super) fn on_realtime_conversation_started(
        &mut self,
        ev: RealtimeConversationStartedEvent,
    ) {
        if !self.realtime_conversation_enabled() {
            self.submit_op(Op::RealtimeConversationClose);
            self.reset_realtime_conversation_state();
            return;
        }
        self.realtime_conversation.phase = RealtimeConversationPhase::Active;
        self.realtime_conversation.session_id = ev.session_id;
        self.realtime_conversation.warned_audio_only_submission = false;
        self.set_footer_hint_override(Some(Self::realtime_footer_hint_items()));
        self.start_realtime_local_audio();
        self.request_redraw();
    }

    /// Dispatch a realtime protocol event. Audio frames go to the speaker;
    /// errors surface to the user and reset the session; conversation-item
    /// and handoff events are currently ignored by the TUI.
    pub(super) fn on_realtime_conversation_realtime(
        &mut self,
        ev: RealtimeConversationRealtimeEvent,
    ) {
        match ev.payload {
            RealtimeEvent::SessionUpdated { session_id, .. } => {
                self.realtime_conversation.session_id = Some(session_id);
            }
            RealtimeEvent::AudioOut(frame) => self.enqueue_realtime_audio_out(&frame),
            RealtimeEvent::ConversationItemAdded(_item) => {}
            RealtimeEvent::ConversationItemDone { .. } => {}
            RealtimeEvent::HandoffRequested(_) => {}
            RealtimeEvent::Error(message) => {
                self.add_error_message(format!("Realtime voice error: {message}"));
                self.reset_realtime_conversation_state();
            }
        }
    }

    /// Handle core closing the session. Shows the close reason only when the
    /// close was not requested by the user.
    pub(super) fn on_realtime_conversation_closed(&mut self, ev: RealtimeConversationClosedEvent) {
        let requested = self.realtime_conversation.requested_close;
        let reason = ev.reason;
        self.reset_realtime_conversation_state();
        if !requested && let Some(reason) = reason {
            self.add_info_message(format!("Realtime voice mode closed: {reason}"), None);
        }
        self.request_redraw();
    }

    /// Queue an assistant audio frame for playback, lazily creating the
    /// speaker output on first use. No-op on Linux.
    fn enqueue_realtime_audio_out(&mut self, frame: &RealtimeAudioFrame) {
        #[cfg(not(target_os = "linux"))]
        {
            if self.realtime_conversation.audio_player.is_none() {
                // Best-effort: if the player fails to start, frames are dropped.
                self.realtime_conversation.audio_player =
                    crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
            }
            if let Some(player) = &self.realtime_conversation.audio_player
                && let Err(err) = player.enqueue_frame(frame)
            {
                warn!("failed to play realtime audio: {err}");
            }
        }
        #[cfg(target_os = "linux")]
        {
            let _ = frame;
        }
    }

    /// Start microphone capture plus a background thread that animates the
    /// input-level meter placeholder. Idempotent: returns early if capture is
    /// already running.
    #[cfg(not(target_os = "linux"))]
    fn start_realtime_local_audio(&mut self) {
        if self.realtime_conversation.capture_stop_flag.is_some() {
            return;
        }
        // Show the meter placeholder first so the user sees immediate feedback.
        let placeholder_id = self.bottom_pane.insert_transcription_placeholder("⠤⠤⠤⠤");
        self.realtime_conversation.meter_placeholder_id = Some(placeholder_id.clone());
        self.request_redraw();
        let capture = match crate::voice::VoiceCapture::start_realtime(
            &self.config,
            self.app_event_tx.clone(),
        ) {
            Ok(capture) => capture,
            Err(err) => {
                // Capture failed: roll back the placeholder and surface the error.
                self.remove_transcription_placeholder(&placeholder_id);
                self.realtime_conversation.meter_placeholder_id = None;
                self.add_error_message(format!("Failed to start microphone capture: {err}"));
                return;
            }
        };
        let stop_flag = capture.stopped_flag();
        let peak = capture.last_peak_arc();
        let meter_placeholder_id = placeholder_id;
        let app_event_tx = self.app_event_tx.clone();
        self.realtime_conversation.capture_stop_flag = Some(stop_flag.clone());
        self.realtime_conversation.capture = Some(capture);
        if self.realtime_conversation.audio_player.is_none() {
            self.realtime_conversation.audio_player =
                crate::voice::RealtimeAudioPlayer::start(&self.config).ok();
        }
        // Meter thread: polls the capture peak and pushes text updates until
        // the stop flag is set (by stop_realtime_microphone or the capture).
        std::thread::spawn(move || {
            let mut meter = crate::voice::RecordingMeterState::new();
            loop {
                if stop_flag.load(Ordering::Relaxed) {
                    break;
                }
                let meter_text = meter.next_text(peak.load(Ordering::Relaxed));
                app_event_tx.send(AppEvent::UpdateRecordingMeter {
                    id: meter_placeholder_id.clone(),
                    text: meter_text,
                });
                // ~16 updates/second keeps the meter smooth without busy-looping.
                std::thread::sleep(Duration::from_millis(60));
            }
        });
    }

    /// Linux has no local audio support; starting is a no-op.
    #[cfg(target_os = "linux")]
    fn start_realtime_local_audio(&mut self) {}

    /// Restart the selected audio device mid-session (e.g. after a device
    /// change). Ignored unless the conversation is `Active`.
    #[cfg(all(not(target_os = "linux"), feature = "voice-input"))]
    pub(crate) fn restart_realtime_audio_device(&mut self, kind: RealtimeAudioDeviceKind) {
        if !self.realtime_conversation.is_active() {
            return;
        }
        match kind {
            RealtimeAudioDeviceKind::Microphone => {
                self.stop_realtime_microphone();
                self.start_realtime_local_audio();
            }
            RealtimeAudioDeviceKind::Speaker => {
                self.stop_realtime_speaker();
                match crate::voice::RealtimeAudioPlayer::start(&self.config) {
                    Ok(player) => {
                        self.realtime_conversation.audio_player = Some(player);
                    }
                    Err(err) => {
                        self.add_error_message(format!("Failed to start speaker output: {err}"));
                    }
                }
            }
        }
        self.request_redraw();
    }

    /// Stub for platforms without local audio (Linux, or voice-input disabled).
    #[cfg(any(target_os = "linux", not(feature = "voice-input")))]
    pub(crate) fn restart_realtime_audio_device(&mut self, kind: RealtimeAudioDeviceKind) {
        let _ = kind;
    }

    /// Stop both microphone capture and speaker playback.
    #[cfg(not(target_os = "linux"))]
    fn stop_realtime_local_audio(&mut self) {
        self.stop_realtime_microphone();
        self.stop_realtime_speaker();
    }

    /// Linux: no audio to stop; just drop any stale meter placeholder id.
    #[cfg(target_os = "linux")]
    fn stop_realtime_local_audio(&mut self) {
        self.realtime_conversation.meter_placeholder_id = None;
    }

    /// Stop microphone capture: signal the meter thread to exit, stop the
    /// capture itself (best-effort), and remove the meter placeholder.
    #[cfg(not(target_os = "linux"))]
    fn stop_realtime_microphone(&mut self) {
        if let Some(flag) = self.realtime_conversation.capture_stop_flag.take() {
            flag.store(true, Ordering::Relaxed);
        }
        if let Some(capture) = self.realtime_conversation.capture.take() {
            let _ = capture.stop();
        }
        if let Some(id) = self.realtime_conversation.meter_placeholder_id.take() {
            self.remove_transcription_placeholder(&id);
        }
    }

    /// Stop speaker playback, discarding any queued audio.
    #[cfg(not(target_os = "linux"))]
    fn stop_realtime_speaker(&mut self) {
        if let Some(player) = self.realtime_conversation.audio_player.take() {
            player.clear();
        }
    }
}