treat SIGTERM like ctrl-c for graceful shutdown (#13594)

treat SIGTERM the same as SIGINT for graceful app-server websocket
shutdown
This commit is contained in:
Max Johnson 2026-03-05 10:16:58 -08:00 committed by GitHub
parent 926b2f19e8
commit 1980b6ce00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 92 additions and 15 deletions

View file

@ -124,6 +124,25 @@ enum ShutdownAction {
Finish,
}
async fn shutdown_signal() -> IoResult<()> {
#[cfg(unix)]
{
use tokio::signal::unix::SignalKind;
use tokio::signal::unix::signal;
let mut term = signal(SignalKind::terminate())?;
tokio::select! {
ctrl_c_result = tokio::signal::ctrl_c() => ctrl_c_result,
_ = term.recv() => Ok(()),
}
}
#[cfg(not(unix))]
{
tokio::signal::ctrl_c().await
}
}
impl ShutdownState {
fn requested(&self) -> bool {
self.requested
@ -133,7 +152,7 @@ impl ShutdownState {
self.forced
}
fn on_ctrl_c(&mut self, connection_count: usize, running_turn_count: usize) {
fn on_signal(&mut self, connection_count: usize, running_turn_count: usize) {
if self.requested {
self.forced = true;
return;
@ -142,7 +161,7 @@ impl ShutdownState {
self.requested = true;
self.last_logged_running_turn_count = None;
info!(
"received Ctrl-C; entering graceful restart drain (connections={}, runningAssistantTurns={}, requests still accepted until no assistant turns are running)",
"received shutdown signal; entering graceful restart drain (connections={}, runningAssistantTurns={}, requests still accepted until no assistant turns are running)",
connection_count, running_turn_count,
);
}
@ -155,11 +174,11 @@ impl ShutdownState {
if self.forced || running_turn_count == 0 {
if self.forced {
info!(
"received second Ctrl-C; forcing restart with {running_turn_count} running assistant turn(s) and {connection_count} connection(s)"
"received second shutdown signal; forcing restart with {running_turn_count} running assistant turn(s) and {connection_count} connection(s)"
);
} else {
info!(
"Ctrl-C restart: no assistant turns running; stopping acceptor and disconnecting {connection_count} connection(s)"
"shutdown signal restart: no assistant turns running; stopping acceptor and disconnecting {connection_count} connection(s)"
);
}
return ShutdownAction::Finish;
@ -167,7 +186,7 @@ impl ShutdownState {
if self.last_logged_running_turn_count != Some(running_turn_count) {
info!(
"Ctrl-C restart: waiting for {running_turn_count} running assistant turn(s) to finish"
"shutdown signal restart: waiting for {running_turn_count} running assistant turn(s) to finish"
);
self.last_logged_running_turn_count = Some(running_turn_count);
}
@ -359,8 +378,7 @@ pub async fn run_main_with_transport(
};
let single_client_mode = matches!(&transport_runtime, TransportRuntime::Stdio);
let shutdown_when_no_connections = single_client_mode;
let graceful_ctrl_c_restart_enabled = !single_client_mode;
let graceful_signal_restart_enabled = !single_client_mode;
// Parse CLI overrides once and derive the base Config eagerly so later
// components do not need to work with raw TOML values.
let cli_kv_overrides = cli_config_overrides.parse_overrides().map_err(|e| {
@ -614,14 +632,14 @@ pub async fn run_main_with_transport(
}
tokio::select! {
ctrl_c_result = tokio::signal::ctrl_c(), if graceful_ctrl_c_restart_enabled && !shutdown_state.forced() => {
if let Err(err) = ctrl_c_result {
warn!("failed to listen for Ctrl-C during graceful restart drain: {err}");
shutdown_signal_result = shutdown_signal(), if graceful_signal_restart_enabled && !shutdown_state.forced() => {
if let Err(err) = shutdown_signal_result {
warn!("failed to listen for shutdown signal during graceful restart drain: {err}");
}
let running_turn_count = *running_turn_count_rx.borrow();
shutdown_state.on_ctrl_c(connections.len(), running_turn_count);
shutdown_state.on_signal(connections.len(), running_turn_count);
}
changed = running_turn_count_rx.changed(), if graceful_ctrl_c_restart_enabled && shutdown_state.requested() => {
changed = running_turn_count_rx.changed(), if graceful_signal_restart_enabled && shutdown_state.requested() => {
if changed.is_err() {
warn!("running-turn watcher closed during graceful restart drain");
}

View file

@ -83,6 +83,57 @@ async fn websocket_transport_second_ctrl_c_forces_exit_while_turn_running() -> R
Ok(())
}
#[tokio::test]
async fn websocket_transport_sigterm_waits_for_running_turn_before_exit() -> Result<()> {
let GracefulCtrlCFixture {
_codex_home,
_server,
mut process,
mut ws,
} = start_ctrl_c_restart_fixture(Duration::from_secs(3)).await?;
send_sigterm(&process)?;
assert_process_does_not_exit_within(&mut process, Duration::from_millis(300)).await?;
let status = wait_for_process_exit_within(
&mut process,
Duration::from_secs(10),
"timed out waiting for graceful SIGTERM restart shutdown",
)
.await?;
assert!(status.success(), "expected graceful exit, got {status}");
expect_websocket_disconnect(&mut ws).await?;
Ok(())
}
#[tokio::test]
async fn websocket_transport_second_sigterm_forces_exit_while_turn_running() -> Result<()> {
let GracefulCtrlCFixture {
_codex_home,
_server,
mut process,
mut ws,
} = start_ctrl_c_restart_fixture(Duration::from_secs(3)).await?;
send_sigterm(&process)?;
assert_process_does_not_exit_within(&mut process, Duration::from_millis(300)).await?;
send_sigterm(&process)?;
let status = wait_for_process_exit_within(
&mut process,
Duration::from_secs(2),
"timed out waiting for forced SIGTERM restart shutdown",
)
.await?;
assert!(status.success(), "expected graceful exit, got {status}");
expect_websocket_disconnect(&mut ws).await?;
Ok(())
}
struct GracefulCtrlCFixture {
_codex_home: TempDir,
_server: wiremock::MockServer,
@ -180,16 +231,24 @@ async fn wait_for_responses_post(server: &wiremock::MockServer, wait_for: Durati
}
fn send_sigint(process: &Child) -> Result<()> {
send_signal(process, "-INT")
}
fn send_sigterm(process: &Child) -> Result<()> {
send_signal(process, "-TERM")
}
fn send_signal(process: &Child, signal: &str) -> Result<()> {
let pid = process
.id()
.context("websocket app-server process has no pid")?;
let status = StdCommand::new("kill")
.arg("-INT")
.arg(signal)
.arg(pid.to_string())
.status()
.context("failed to invoke kill -INT")?;
.with_context(|| format!("failed to invoke kill {signal}"))?;
if !status.success() {
bail!("kill -INT exited with {status}");
bail!("kill {signal} exited with {status}");
}
Ok(())
}