treat SIGTERM like ctrl-c for graceful shutdown (#13594)
treat SIGTERM the same as SIGINT for graceful app-server websocket shutdown
This commit is contained in:
parent
926b2f19e8
commit
1980b6ce00
2 changed files with 92 additions and 15 deletions
|
|
@ -124,6 +124,25 @@ enum ShutdownAction {
|
|||
Finish,
|
||||
}
|
||||
|
||||
async fn shutdown_signal() -> IoResult<()> {
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::signal::unix::signal;
|
||||
|
||||
let mut term = signal(SignalKind::terminate())?;
|
||||
tokio::select! {
|
||||
ctrl_c_result = tokio::signal::ctrl_c() => ctrl_c_result,
|
||||
_ = term.recv() => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(unix))]
|
||||
{
|
||||
tokio::signal::ctrl_c().await
|
||||
}
|
||||
}
|
||||
|
||||
impl ShutdownState {
|
||||
fn requested(&self) -> bool {
|
||||
self.requested
|
||||
|
|
@ -133,7 +152,7 @@ impl ShutdownState {
|
|||
self.forced
|
||||
}
|
||||
|
||||
fn on_ctrl_c(&mut self, connection_count: usize, running_turn_count: usize) {
|
||||
fn on_signal(&mut self, connection_count: usize, running_turn_count: usize) {
|
||||
if self.requested {
|
||||
self.forced = true;
|
||||
return;
|
||||
|
|
@ -142,7 +161,7 @@ impl ShutdownState {
|
|||
self.requested = true;
|
||||
self.last_logged_running_turn_count = None;
|
||||
info!(
|
||||
"received Ctrl-C; entering graceful restart drain (connections={}, runningAssistantTurns={}, requests still accepted until no assistant turns are running)",
|
||||
"received shutdown signal; entering graceful restart drain (connections={}, runningAssistantTurns={}, requests still accepted until no assistant turns are running)",
|
||||
connection_count, running_turn_count,
|
||||
);
|
||||
}
|
||||
|
|
@ -155,11 +174,11 @@ impl ShutdownState {
|
|||
if self.forced || running_turn_count == 0 {
|
||||
if self.forced {
|
||||
info!(
|
||||
"received second Ctrl-C; forcing restart with {running_turn_count} running assistant turn(s) and {connection_count} connection(s)"
|
||||
"received second shutdown signal; forcing restart with {running_turn_count} running assistant turn(s) and {connection_count} connection(s)"
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"Ctrl-C restart: no assistant turns running; stopping acceptor and disconnecting {connection_count} connection(s)"
|
||||
"shutdown signal restart: no assistant turns running; stopping acceptor and disconnecting {connection_count} connection(s)"
|
||||
);
|
||||
}
|
||||
return ShutdownAction::Finish;
|
||||
|
|
@ -167,7 +186,7 @@ impl ShutdownState {
|
|||
|
||||
if self.last_logged_running_turn_count != Some(running_turn_count) {
|
||||
info!(
|
||||
"Ctrl-C restart: waiting for {running_turn_count} running assistant turn(s) to finish"
|
||||
"shutdown signal restart: waiting for {running_turn_count} running assistant turn(s) to finish"
|
||||
);
|
||||
self.last_logged_running_turn_count = Some(running_turn_count);
|
||||
}
|
||||
|
|
@ -359,8 +378,7 @@ pub async fn run_main_with_transport(
|
|||
};
|
||||
let single_client_mode = matches!(&transport_runtime, TransportRuntime::Stdio);
|
||||
let shutdown_when_no_connections = single_client_mode;
|
||||
let graceful_ctrl_c_restart_enabled = !single_client_mode;
|
||||
|
||||
let graceful_signal_restart_enabled = !single_client_mode;
|
||||
// Parse CLI overrides once and derive the base Config eagerly so later
|
||||
// components do not need to work with raw TOML values.
|
||||
let cli_kv_overrides = cli_config_overrides.parse_overrides().map_err(|e| {
|
||||
|
|
@ -614,14 +632,14 @@ pub async fn run_main_with_transport(
|
|||
}
|
||||
|
||||
tokio::select! {
|
||||
ctrl_c_result = tokio::signal::ctrl_c(), if graceful_ctrl_c_restart_enabled && !shutdown_state.forced() => {
|
||||
if let Err(err) = ctrl_c_result {
|
||||
warn!("failed to listen for Ctrl-C during graceful restart drain: {err}");
|
||||
shutdown_signal_result = shutdown_signal(), if graceful_signal_restart_enabled && !shutdown_state.forced() => {
|
||||
if let Err(err) = shutdown_signal_result {
|
||||
warn!("failed to listen for shutdown signal during graceful restart drain: {err}");
|
||||
}
|
||||
let running_turn_count = *running_turn_count_rx.borrow();
|
||||
shutdown_state.on_ctrl_c(connections.len(), running_turn_count);
|
||||
shutdown_state.on_signal(connections.len(), running_turn_count);
|
||||
}
|
||||
changed = running_turn_count_rx.changed(), if graceful_ctrl_c_restart_enabled && shutdown_state.requested() => {
|
||||
changed = running_turn_count_rx.changed(), if graceful_signal_restart_enabled && shutdown_state.requested() => {
|
||||
if changed.is_err() {
|
||||
warn!("running-turn watcher closed during graceful restart drain");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,6 +83,57 @@ async fn websocket_transport_second_ctrl_c_forces_exit_while_turn_running() -> R
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn websocket_transport_sigterm_waits_for_running_turn_before_exit() -> Result<()> {
|
||||
let GracefulCtrlCFixture {
|
||||
_codex_home,
|
||||
_server,
|
||||
mut process,
|
||||
mut ws,
|
||||
} = start_ctrl_c_restart_fixture(Duration::from_secs(3)).await?;
|
||||
|
||||
send_sigterm(&process)?;
|
||||
assert_process_does_not_exit_within(&mut process, Duration::from_millis(300)).await?;
|
||||
|
||||
let status = wait_for_process_exit_within(
|
||||
&mut process,
|
||||
Duration::from_secs(10),
|
||||
"timed out waiting for graceful SIGTERM restart shutdown",
|
||||
)
|
||||
.await?;
|
||||
assert!(status.success(), "expected graceful exit, got {status}");
|
||||
|
||||
expect_websocket_disconnect(&mut ws).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn websocket_transport_second_sigterm_forces_exit_while_turn_running() -> Result<()> {
|
||||
let GracefulCtrlCFixture {
|
||||
_codex_home,
|
||||
_server,
|
||||
mut process,
|
||||
mut ws,
|
||||
} = start_ctrl_c_restart_fixture(Duration::from_secs(3)).await?;
|
||||
|
||||
send_sigterm(&process)?;
|
||||
assert_process_does_not_exit_within(&mut process, Duration::from_millis(300)).await?;
|
||||
|
||||
send_sigterm(&process)?;
|
||||
let status = wait_for_process_exit_within(
|
||||
&mut process,
|
||||
Duration::from_secs(2),
|
||||
"timed out waiting for forced SIGTERM restart shutdown",
|
||||
)
|
||||
.await?;
|
||||
assert!(status.success(), "expected graceful exit, got {status}");
|
||||
|
||||
expect_websocket_disconnect(&mut ws).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct GracefulCtrlCFixture {
|
||||
_codex_home: TempDir,
|
||||
_server: wiremock::MockServer,
|
||||
|
|
@ -180,16 +231,24 @@ async fn wait_for_responses_post(server: &wiremock::MockServer, wait_for: Durati
|
|||
}
|
||||
|
||||
fn send_sigint(process: &Child) -> Result<()> {
|
||||
send_signal(process, "-INT")
|
||||
}
|
||||
|
||||
fn send_sigterm(process: &Child) -> Result<()> {
|
||||
send_signal(process, "-TERM")
|
||||
}
|
||||
|
||||
fn send_signal(process: &Child, signal: &str) -> Result<()> {
|
||||
let pid = process
|
||||
.id()
|
||||
.context("websocket app-server process has no pid")?;
|
||||
let status = StdCommand::new("kill")
|
||||
.arg("-INT")
|
||||
.arg(signal)
|
||||
.arg(pid.to_string())
|
||||
.status()
|
||||
.context("failed to invoke kill -INT")?;
|
||||
.with_context(|| format!("failed to invoke kill {signal}"))?;
|
||||
if !status.success() {
|
||||
bail!("kill -INT exited with {status}");
|
||||
bail!("kill {signal} exited with {status}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue