From 79d6f80e41806d61b8a5ce7dbaff4bb1d6e38a91 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Thu, 26 Feb 2026 14:11:10 +0000 Subject: [PATCH] chore: clean DB runtime (#12905) --- codex-rs/state/src/runtime.rs | 4363 +------------------- codex-rs/state/src/runtime/agent_jobs.rs | 562 +++ codex-rs/state/src/runtime/backfill.rs | 311 ++ codex-rs/state/src/runtime/logs.rs | 715 ++++ codex-rs/state/src/runtime/memories.rs | 2252 ++++++++++ codex-rs/state/src/runtime/test_support.rs | 64 + codex-rs/state/src/runtime/threads.rs | 496 +++ 7 files changed, 4406 insertions(+), 4357 deletions(-) create mode 100644 codex-rs/state/src/runtime/agent_jobs.rs create mode 100644 codex-rs/state/src/runtime/backfill.rs create mode 100644 codex-rs/state/src/runtime/logs.rs create mode 100644 codex-rs/state/src/runtime/test_support.rs create mode 100644 codex-rs/state/src/runtime/threads.rs diff --git a/codex-rs/state/src/runtime.rs b/codex-rs/state/src/runtime.rs index b7ee2ac88..42d235fc7 100644 --- a/codex-rs/state/src/runtime.rs +++ b/codex-rs/state/src/runtime.rs @@ -18,7 +18,6 @@ use crate::ThreadMetadataBuilder; use crate::ThreadsPage; use crate::apply_rollout_item; use crate::migrations::MIGRATOR; -use crate::model::AgentJobItemRow; use crate::model::AgentJobRow; use crate::model::ThreadRow; use crate::model::anchor_from_item; @@ -48,10 +47,14 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; use tracing::warn; -use uuid::Uuid; +mod agent_jobs; +mod backfill; +mod logs; mod memories; -// Memory-specific CRUD and phase job lifecycle methods live in `runtime/memories.rs`. +#[cfg(test)] +mod test_support; +mod threads; // "Partition" is the retention bucket we cap at 10 MiB: // - one bucket per non-null thread_id @@ -107,1461 +110,6 @@ impl StateRuntime { pub fn codex_home(&self) -> &Path { self.codex_home.as_path() } - - /// Get persisted rollout metadata backfill state. - pub async fn get_backfill_state(&self) -> anyhow::Result { - self.ensure_backfill_state_row().await?; - let row = sqlx::query( - r#" -SELECT status, last_watermark, last_success_at -FROM backfill_state -WHERE id = 1 - "#, - ) - .fetch_one(self.pool.as_ref()) - .await?; - crate::BackfillState::try_from_row(&row) - } - - /// Attempt to claim ownership of rollout metadata backfill. - /// - /// Returns `true` when this runtime claimed the backfill worker slot. - /// Returns `false` if backfill is already complete or currently owned by a - /// non-expired worker. - pub async fn try_claim_backfill(&self, lease_seconds: i64) -> anyhow::Result { - self.ensure_backfill_state_row().await?; - let now = Utc::now().timestamp(); - let lease_cutoff = now.saturating_sub(lease_seconds.max(0)); - let result = sqlx::query( - r#" -UPDATE backfill_state -SET status = ?, updated_at = ? -WHERE id = 1 - AND status != ? - AND (status != ? OR updated_at <= ?) - "#, - ) - .bind(crate::BackfillStatus::Running.as_str()) - .bind(now) - .bind(crate::BackfillStatus::Complete.as_str()) - .bind(crate::BackfillStatus::Running.as_str()) - .bind(lease_cutoff) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() == 1) - } - - /// Mark rollout metadata backfill as running. - pub async fn mark_backfill_running(&self) -> anyhow::Result<()> { - self.ensure_backfill_state_row().await?; - sqlx::query( - r#" -UPDATE backfill_state -SET status = ?, updated_at = ? -WHERE id = 1 - "#, - ) - .bind(crate::BackfillStatus::Running.as_str()) - .bind(Utc::now().timestamp()) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - /// Persist rollout metadata backfill progress. - pub async fn checkpoint_backfill(&self, watermark: &str) -> anyhow::Result<()> { - self.ensure_backfill_state_row().await?; - sqlx::query( - r#" -UPDATE backfill_state -SET status = ?, last_watermark = ?, updated_at = ? -WHERE id = 1 - "#, - ) - .bind(crate::BackfillStatus::Running.as_str()) - .bind(watermark) - .bind(Utc::now().timestamp()) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - /// Mark rollout metadata backfill as complete. - pub async fn mark_backfill_complete(&self, last_watermark: Option<&str>) -> anyhow::Result<()> { - self.ensure_backfill_state_row().await?; - let now = Utc::now().timestamp(); - sqlx::query( - r#" -UPDATE backfill_state -SET - status = ?, - last_watermark = COALESCE(?, last_watermark), - last_success_at = ?, - updated_at = ? -WHERE id = 1 - "#, - ) - .bind(crate::BackfillStatus::Complete.as_str()) - .bind(last_watermark) - .bind(now) - .bind(now) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - /// Load thread metadata by id using the underlying database. - pub async fn get_thread(&self, id: ThreadId) -> anyhow::Result> { - let row = sqlx::query( - r#" -SELECT - id, - rollout_path, - created_at, - updated_at, - source, - agent_nickname, - agent_role, - model_provider, - cwd, - cli_version, - title, - sandbox_policy, - approval_mode, - tokens_used, - first_user_message, - archived_at, - git_sha, - git_branch, - git_origin_url -FROM threads -WHERE id = ? - "#, - ) - .bind(id.to_string()) - .fetch_optional(self.pool.as_ref()) - .await?; - row.map(|row| ThreadRow::try_from_row(&row).and_then(ThreadMetadata::try_from)) - .transpose() - } - - /// Get dynamic tools for a thread, if present. - pub async fn get_dynamic_tools( - &self, - thread_id: ThreadId, - ) -> anyhow::Result>> { - let rows = sqlx::query( - r#" -SELECT name, description, input_schema -FROM thread_dynamic_tools -WHERE thread_id = ? -ORDER BY position ASC - "#, - ) - .bind(thread_id.to_string()) - .fetch_all(self.pool.as_ref()) - .await?; - if rows.is_empty() { - return Ok(None); - } - let mut tools = Vec::with_capacity(rows.len()); - for row in rows { - let input_schema: String = row.try_get("input_schema")?; - let input_schema = serde_json::from_str::(input_schema.as_str())?; - tools.push(DynamicToolSpec { - name: row.try_get("name")?, - description: row.try_get("description")?, - input_schema, - }); - } - Ok(Some(tools)) - } - - /// Find a rollout path by thread id using the underlying database. - pub async fn find_rollout_path_by_id( - &self, - id: ThreadId, - archived_only: Option, - ) -> anyhow::Result> { - let mut builder = - QueryBuilder::::new("SELECT rollout_path FROM threads WHERE id = "); - builder.push_bind(id.to_string()); - match archived_only { - Some(true) => { - builder.push(" AND archived = 1"); - } - Some(false) => { - builder.push(" AND archived = 0"); - } - None => {} - } - let row = builder.build().fetch_optional(self.pool.as_ref()).await?; - Ok(row - .and_then(|r| r.try_get::("rollout_path").ok()) - .map(PathBuf::from)) - } - - /// List threads using the underlying database. - #[allow(clippy::too_many_arguments)] - pub async fn list_threads( - &self, - page_size: usize, - anchor: Option<&crate::Anchor>, - sort_key: crate::SortKey, - allowed_sources: &[String], - model_providers: Option<&[String]>, - archived_only: bool, - search_term: Option<&str>, - ) -> anyhow::Result { - let limit = page_size.saturating_add(1); - - let mut builder = QueryBuilder::::new( - r#" -SELECT - id, - rollout_path, - created_at, - updated_at, - source, - agent_nickname, - agent_role, - model_provider, - cwd, - cli_version, - title, - sandbox_policy, - approval_mode, - tokens_used, - first_user_message, - archived_at, - git_sha, - git_branch, - git_origin_url -FROM threads - "#, - ); - push_thread_filters( - &mut builder, - archived_only, - allowed_sources, - model_providers, - anchor, - sort_key, - search_term, - ); - push_thread_order_and_limit(&mut builder, sort_key, limit); - - let rows = builder.build().fetch_all(self.pool.as_ref()).await?; - let mut items = rows - .into_iter() - .map(|row| ThreadRow::try_from_row(&row).and_then(ThreadMetadata::try_from)) - .collect::, _>>()?; - let num_scanned_rows = items.len(); - let next_anchor = if items.len() > page_size { - items.pop(); - items - .last() - .and_then(|item| anchor_from_item(item, sort_key)) - } else { - None - }; - Ok(ThreadsPage { - items, - next_anchor, - num_scanned_rows, - }) - } - - /// Insert one log entry into the logs table. - pub async fn insert_log(&self, entry: &LogEntry) -> anyhow::Result<()> { - self.insert_logs(std::slice::from_ref(entry)).await - } - - /// Insert a batch of log entries into the logs table. - pub async fn insert_logs(&self, entries: &[LogEntry]) -> anyhow::Result<()> { - if entries.is_empty() { - return Ok(()); - } - - let mut tx = self.pool.begin().await?; - let mut builder = QueryBuilder::::new( - "INSERT INTO logs (ts, ts_nanos, level, target, message, thread_id, process_uuid, module_path, file, line, estimated_bytes) ", - ); - builder.push_values(entries, |mut row, entry| { - let estimated_bytes = entry.message.as_ref().map_or(0, String::len) as i64 - + entry.level.len() as i64 - + entry.target.len() as i64 - + entry.module_path.as_ref().map_or(0, String::len) as i64 - + entry.file.as_ref().map_or(0, String::len) as i64; - row.push_bind(entry.ts) - .push_bind(entry.ts_nanos) - .push_bind(&entry.level) - .push_bind(&entry.target) - .push_bind(&entry.message) - .push_bind(&entry.thread_id) - .push_bind(&entry.process_uuid) - .push_bind(&entry.module_path) - .push_bind(&entry.file) - .push_bind(entry.line) - .push_bind(estimated_bytes); - }); - builder.build().execute(&mut *tx).await?; - self.prune_logs_after_insert(entries, &mut tx).await?; - tx.commit().await?; - Ok(()) - } - - /// Enforce per-partition log size caps after a successful batch insert. - /// - /// We maintain two independent budgets: - /// - Thread logs: rows with `thread_id IS NOT NULL`, capped per `thread_id`. - /// - Threadless process logs: rows with `thread_id IS NULL` ("threadless"), - /// capped per `process_uuid` (including `process_uuid IS NULL` as its own - /// threadless partition). - /// - /// "Threadless" means the log row is not associated with any conversation - /// thread, so retention is keyed by process identity instead. - /// - /// This runs inside the same transaction as the insert so callers never - /// observe "inserted but not yet pruned" rows. - async fn prune_logs_after_insert( - &self, - entries: &[LogEntry], - tx: &mut SqliteConnection, - ) -> anyhow::Result<()> { - let thread_ids: BTreeSet<&str> = entries - .iter() - .filter_map(|entry| entry.thread_id.as_deref()) - .collect(); - if !thread_ids.is_empty() { - // Cheap precheck: only run the heavier window-function prune for - // threads that are currently above the cap. - let mut over_limit_threads_query = - QueryBuilder::::new("SELECT thread_id FROM logs WHERE thread_id IN ("); - { - let mut separated = over_limit_threads_query.separated(", "); - for thread_id in &thread_ids { - separated.push_bind(*thread_id); - } - } - over_limit_threads_query.push(") GROUP BY thread_id HAVING SUM("); - over_limit_threads_query.push("estimated_bytes"); - over_limit_threads_query.push(") > "); - over_limit_threads_query.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); - let over_limit_thread_ids: Vec = over_limit_threads_query - .build() - .fetch_all(&mut *tx) - .await? - .into_iter() - .map(|row| row.try_get("thread_id")) - .collect::>()?; - if !over_limit_thread_ids.is_empty() { - // Enforce a strict per-thread cap by deleting every row whose - // newest-first cumulative bytes exceed the partition budget. - let mut prune_threads = QueryBuilder::::new( - r#" -DELETE FROM logs -WHERE id IN ( - SELECT id - FROM ( - SELECT - id, - SUM( -"#, - ); - prune_threads.push("estimated_bytes"); - prune_threads.push( - r#" - ) OVER ( - PARTITION BY thread_id - ORDER BY ts DESC, ts_nanos DESC, id DESC - ) AS cumulative_bytes - FROM logs - WHERE thread_id IN ( -"#, - ); - { - let mut separated = prune_threads.separated(", "); - for thread_id in &over_limit_thread_ids { - separated.push_bind(thread_id); - } - } - prune_threads.push( - r#" - ) - ) - WHERE cumulative_bytes > -"#, - ); - prune_threads.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); - prune_threads.push("\n)"); - prune_threads.build().execute(&mut *tx).await?; - } - } - - let threadless_process_uuids: BTreeSet<&str> = entries - .iter() - .filter(|entry| entry.thread_id.is_none()) - .filter_map(|entry| entry.process_uuid.as_deref()) - .collect(); - let has_threadless_null_process_uuid = entries - .iter() - .any(|entry| entry.thread_id.is_none() && entry.process_uuid.is_none()); - if !threadless_process_uuids.is_empty() { - // Threadless logs are budgeted separately per process UUID. - let mut over_limit_processes_query = QueryBuilder::::new( - "SELECT process_uuid FROM logs WHERE thread_id IS NULL AND process_uuid IN (", - ); - { - let mut separated = over_limit_processes_query.separated(", "); - for process_uuid in &threadless_process_uuids { - separated.push_bind(*process_uuid); - } - } - over_limit_processes_query.push(") GROUP BY process_uuid HAVING SUM("); - over_limit_processes_query.push("estimated_bytes"); - over_limit_processes_query.push(") > "); - over_limit_processes_query.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); - let over_limit_process_uuids: Vec = over_limit_processes_query - .build() - .fetch_all(&mut *tx) - .await? - .into_iter() - .map(|row| row.try_get("process_uuid")) - .collect::>()?; - if !over_limit_process_uuids.is_empty() { - // Same strict cap policy as thread pruning, but only for - // threadless rows in the affected process UUIDs. - let mut prune_threadless_process_logs = QueryBuilder::::new( - r#" -DELETE FROM logs -WHERE id IN ( - SELECT id - FROM ( - SELECT - id, - SUM( -"#, - ); - prune_threadless_process_logs.push("estimated_bytes"); - prune_threadless_process_logs.push( - r#" - ) OVER ( - PARTITION BY process_uuid - ORDER BY ts DESC, ts_nanos DESC, id DESC - ) AS cumulative_bytes - FROM logs - WHERE thread_id IS NULL - AND process_uuid IN ( -"#, - ); - { - let mut separated = prune_threadless_process_logs.separated(", "); - for process_uuid in &over_limit_process_uuids { - separated.push_bind(process_uuid); - } - } - prune_threadless_process_logs.push( - r#" - ) - ) - WHERE cumulative_bytes > -"#, - ); - prune_threadless_process_logs.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); - prune_threadless_process_logs.push("\n)"); - prune_threadless_process_logs - .build() - .execute(&mut *tx) - .await?; - } - } - if has_threadless_null_process_uuid { - // Rows without a process UUID still need a cap; treat NULL as its - // own threadless partition. - let mut null_process_usage_query = QueryBuilder::::new("SELECT SUM("); - null_process_usage_query.push("estimated_bytes"); - null_process_usage_query.push( - ") AS total_bytes FROM logs WHERE thread_id IS NULL AND process_uuid IS NULL", - ); - let total_null_process_bytes: Option = null_process_usage_query - .build() - .fetch_one(&mut *tx) - .await? - .try_get("total_bytes")?; - - if total_null_process_bytes.unwrap_or(0) > LOG_PARTITION_SIZE_LIMIT_BYTES { - let mut prune_threadless_null_process_logs = QueryBuilder::::new( - r#" -DELETE FROM logs -WHERE id IN ( - SELECT id - FROM ( - SELECT - id, - SUM( -"#, - ); - prune_threadless_null_process_logs.push("estimated_bytes"); - prune_threadless_null_process_logs.push( - r#" - ) OVER ( - PARTITION BY process_uuid - ORDER BY ts DESC, ts_nanos DESC, id DESC - ) AS cumulative_bytes - FROM logs - WHERE thread_id IS NULL - AND process_uuid IS NULL - ) - WHERE cumulative_bytes > -"#, - ); - prune_threadless_null_process_logs.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); - prune_threadless_null_process_logs.push("\n)"); - prune_threadless_null_process_logs - .build() - .execute(&mut *tx) - .await?; - } - } - Ok(()) - } - - pub(crate) async fn delete_logs_before(&self, cutoff_ts: i64) -> anyhow::Result { - let result = sqlx::query("DELETE FROM logs WHERE ts < ?") - .bind(cutoff_ts) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected()) - } - - /// Query logs with optional filters. - pub async fn query_logs(&self, query: &LogQuery) -> anyhow::Result> { - let mut builder = QueryBuilder::::new( - "SELECT id, ts, ts_nanos, level, target, message, thread_id, process_uuid, file, line FROM logs WHERE 1 = 1", - ); - push_log_filters(&mut builder, query); - if query.descending { - builder.push(" ORDER BY id DESC"); - } else { - builder.push(" ORDER BY id ASC"); - } - if let Some(limit) = query.limit { - builder.push(" LIMIT ").push_bind(limit as i64); - } - - let rows = builder - .build_query_as::() - .fetch_all(self.pool.as_ref()) - .await?; - Ok(rows) - } - - /// Return the max log id matching optional filters. - pub async fn max_log_id(&self, query: &LogQuery) -> anyhow::Result { - let mut builder = - QueryBuilder::::new("SELECT MAX(id) AS max_id FROM logs WHERE 1 = 1"); - push_log_filters(&mut builder, query); - let row = builder.build().fetch_one(self.pool.as_ref()).await?; - let max_id: Option = row.try_get("max_id")?; - Ok(max_id.unwrap_or(0)) - } - - /// List thread ids using the underlying database (no rollout scanning). - pub async fn list_thread_ids( - &self, - limit: usize, - anchor: Option<&crate::Anchor>, - sort_key: crate::SortKey, - allowed_sources: &[String], - model_providers: Option<&[String]>, - archived_only: bool, - ) -> anyhow::Result> { - let mut builder = QueryBuilder::::new("SELECT id FROM threads"); - push_thread_filters( - &mut builder, - archived_only, - allowed_sources, - model_providers, - anchor, - sort_key, - None, - ); - push_thread_order_and_limit(&mut builder, sort_key, limit); - - let rows = builder.build().fetch_all(self.pool.as_ref()).await?; - rows.into_iter() - .map(|row| { - let id: String = row.try_get("id")?; - Ok(ThreadId::try_from(id)?) - }) - .collect() - } - - /// Insert or replace thread metadata directly. - pub async fn upsert_thread(&self, metadata: &crate::ThreadMetadata) -> anyhow::Result<()> { - sqlx::query( - r#" -INSERT INTO threads ( - id, - rollout_path, - created_at, - updated_at, - source, - agent_nickname, - agent_role, - model_provider, - cwd, - cli_version, - title, - sandbox_policy, - approval_mode, - tokens_used, - first_user_message, - archived, - archived_at, - git_sha, - git_branch, - git_origin_url -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) -ON CONFLICT(id) DO UPDATE SET - rollout_path = excluded.rollout_path, - created_at = excluded.created_at, - updated_at = excluded.updated_at, - source = excluded.source, - agent_nickname = excluded.agent_nickname, - agent_role = excluded.agent_role, - model_provider = excluded.model_provider, - cwd = excluded.cwd, - cli_version = excluded.cli_version, - title = excluded.title, - sandbox_policy = excluded.sandbox_policy, - approval_mode = excluded.approval_mode, - tokens_used = excluded.tokens_used, - first_user_message = excluded.first_user_message, - archived = excluded.archived, - archived_at = excluded.archived_at, - git_sha = excluded.git_sha, - git_branch = excluded.git_branch, - git_origin_url = excluded.git_origin_url - "#, - ) - .bind(metadata.id.to_string()) - .bind(metadata.rollout_path.display().to_string()) - .bind(datetime_to_epoch_seconds(metadata.created_at)) - .bind(datetime_to_epoch_seconds(metadata.updated_at)) - .bind(metadata.source.as_str()) - .bind(metadata.agent_nickname.as_deref()) - .bind(metadata.agent_role.as_deref()) - .bind(metadata.model_provider.as_str()) - .bind(metadata.cwd.display().to_string()) - .bind(metadata.cli_version.as_str()) - .bind(metadata.title.as_str()) - .bind(metadata.sandbox_policy.as_str()) - .bind(metadata.approval_mode.as_str()) - .bind(metadata.tokens_used) - .bind(metadata.first_user_message.as_deref().unwrap_or_default()) - .bind(metadata.archived_at.is_some()) - .bind(metadata.archived_at.map(datetime_to_epoch_seconds)) - .bind(metadata.git_sha.as_deref()) - .bind(metadata.git_branch.as_deref()) - .bind(metadata.git_origin_url.as_deref()) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - /// Persist dynamic tools for a thread if none have been stored yet. - /// - /// Dynamic tools are defined at thread start and should not change afterward. - /// This only writes the first time we see tools for a given thread. - pub async fn persist_dynamic_tools( - &self, - thread_id: ThreadId, - tools: Option<&[DynamicToolSpec]>, - ) -> anyhow::Result<()> { - let Some(tools) = tools else { - return Ok(()); - }; - if tools.is_empty() { - return Ok(()); - } - let thread_id = thread_id.to_string(); - let mut tx = self.pool.begin().await?; - for (idx, tool) in tools.iter().enumerate() { - let position = i64::try_from(idx).unwrap_or(i64::MAX); - let input_schema = serde_json::to_string(&tool.input_schema)?; - sqlx::query( - r#" -INSERT INTO thread_dynamic_tools ( - thread_id, - position, - name, - description, - input_schema -) VALUES (?, ?, ?, ?, ?) -ON CONFLICT(thread_id, position) DO NOTHING - "#, - ) - .bind(thread_id.as_str()) - .bind(position) - .bind(tool.name.as_str()) - .bind(tool.description.as_str()) - .bind(input_schema) - .execute(&mut *tx) - .await?; - } - tx.commit().await?; - Ok(()) - } - - /// Apply rollout items incrementally using the underlying database. - pub async fn apply_rollout_items( - &self, - builder: &ThreadMetadataBuilder, - items: &[RolloutItem], - otel: Option<&OtelManager>, - ) -> anyhow::Result<()> { - if items.is_empty() { - return Ok(()); - } - let mut metadata = self - .get_thread(builder.id) - .await? - .unwrap_or_else(|| builder.build(&self.default_provider)); - metadata.rollout_path = builder.rollout_path.clone(); - for item in items { - apply_rollout_item(&mut metadata, item, &self.default_provider); - } - if let Some(updated_at) = file_modified_time_utc(builder.rollout_path.as_path()).await { - metadata.updated_at = updated_at; - } - // Keep the thread upsert before dynamic tools to satisfy the foreign key constraint: - // thread_dynamic_tools.thread_id -> threads.id. - if let Err(err) = self.upsert_thread(&metadata).await { - if let Some(otel) = otel { - otel.counter(DB_ERROR_METRIC, 1, &[("stage", "apply_rollout_items")]); - } - return Err(err); - } - let dynamic_tools = extract_dynamic_tools(items); - if let Some(dynamic_tools) = dynamic_tools - && let Err(err) = self - .persist_dynamic_tools(builder.id, dynamic_tools.as_deref()) - .await - { - if let Some(otel) = otel { - otel.counter(DB_ERROR_METRIC, 1, &[("stage", "persist_dynamic_tools")]); - } - return Err(err); - } - Ok(()) - } - - /// Mark a thread as archived using the underlying database. - pub async fn mark_archived( - &self, - thread_id: ThreadId, - rollout_path: &Path, - archived_at: DateTime, - ) -> anyhow::Result<()> { - let Some(mut metadata) = self.get_thread(thread_id).await? else { - return Ok(()); - }; - metadata.archived_at = Some(archived_at); - metadata.rollout_path = rollout_path.to_path_buf(); - if let Some(updated_at) = file_modified_time_utc(rollout_path).await { - metadata.updated_at = updated_at; - } - if metadata.id != thread_id { - warn!( - "thread id mismatch during archive: expected {thread_id}, got {}", - metadata.id - ); - } - self.upsert_thread(&metadata).await - } - - /// Mark a thread as unarchived using the underlying database. - pub async fn mark_unarchived( - &self, - thread_id: ThreadId, - rollout_path: &Path, - ) -> anyhow::Result<()> { - let Some(mut metadata) = self.get_thread(thread_id).await? else { - return Ok(()); - }; - metadata.archived_at = None; - metadata.rollout_path = rollout_path.to_path_buf(); - if let Some(updated_at) = file_modified_time_utc(rollout_path).await { - metadata.updated_at = updated_at; - } - if metadata.id != thread_id { - warn!( - "thread id mismatch during unarchive: expected {thread_id}, got {}", - metadata.id - ); - } - self.upsert_thread(&metadata).await - } - - /// Delete a thread metadata row by id. - pub async fn delete_thread(&self, thread_id: ThreadId) -> anyhow::Result { - let result = sqlx::query("DELETE FROM threads WHERE id = ?") - .bind(thread_id.to_string()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected()) - } - - pub async fn create_agent_job( - &self, - params: &AgentJobCreateParams, - items: &[AgentJobItemCreateParams], - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let input_headers_json = serde_json::to_string(¶ms.input_headers)?; - let output_schema_json = params - .output_schema_json - .as_ref() - .map(serde_json::to_string) - .transpose()?; - let max_runtime_seconds = params - .max_runtime_seconds - .map(i64::try_from) - .transpose() - .map_err(|_| anyhow::anyhow!("invalid max_runtime_seconds value"))?; - let mut tx = self.pool.begin().await?; - sqlx::query( - r#" -INSERT INTO agent_jobs ( - id, - name, - status, - instruction, - auto_export, - max_runtime_seconds, - output_schema_json, - input_headers_json, - input_csv_path, - output_csv_path, - created_at, - updated_at, - started_at, - completed_at, - last_error -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL) - "#, - ) - .bind(params.id.as_str()) - .bind(params.name.as_str()) - .bind(AgentJobStatus::Pending.as_str()) - .bind(params.instruction.as_str()) - .bind(i64::from(params.auto_export)) - .bind(max_runtime_seconds) - .bind(output_schema_json) - .bind(input_headers_json) - .bind(params.input_csv_path.as_str()) - .bind(params.output_csv_path.as_str()) - .bind(now) - .bind(now) - .execute(&mut *tx) - .await?; - - for item in items { - let row_json = serde_json::to_string(&item.row_json)?; - sqlx::query( - r#" -INSERT INTO agent_job_items ( - job_id, - item_id, - row_index, - source_id, - row_json, - status, - assigned_thread_id, - attempt_count, - result_json, - last_error, - created_at, - updated_at, - completed_at, - reported_at -) VALUES (?, ?, ?, ?, ?, ?, NULL, 0, NULL, NULL, ?, ?, NULL, NULL) - "#, - ) - .bind(params.id.as_str()) - .bind(item.item_id.as_str()) - .bind(item.row_index) - .bind(item.source_id.as_deref()) - .bind(row_json) - .bind(AgentJobItemStatus::Pending.as_str()) - .bind(now) - .bind(now) - .execute(&mut *tx) - .await?; - } - - tx.commit().await?; - - let job_id = params.id.as_str(); - self.get_agent_job(job_id) - .await? - .ok_or_else(|| anyhow::anyhow!("failed to load created agent job {job_id}")) - } - - pub async fn get_agent_job(&self, job_id: &str) -> anyhow::Result> { - let row = sqlx::query_as::<_, AgentJobRow>( - r#" -SELECT - id, - name, - status, - instruction, - auto_export, - max_runtime_seconds, - output_schema_json, - input_headers_json, - input_csv_path, - output_csv_path, - created_at, - updated_at, - started_at, - completed_at, - last_error -FROM agent_jobs -WHERE id = ? - "#, - ) - .bind(job_id) - .fetch_optional(self.pool.as_ref()) - .await?; - row.map(AgentJob::try_from).transpose() - } - - pub async fn list_agent_job_items( - &self, - job_id: &str, - status: Option, - limit: Option, - ) -> anyhow::Result> { - let mut builder = QueryBuilder::::new( - r#" -SELECT - job_id, - item_id, - row_index, - source_id, - row_json, - status, - assigned_thread_id, - attempt_count, - result_json, - last_error, - created_at, - updated_at, - completed_at, - reported_at -FROM agent_job_items -WHERE job_id = - "#, - ); - builder.push_bind(job_id); - if let Some(status) = status { - builder.push(" AND status = "); - builder.push_bind(status.as_str()); - } - builder.push(" ORDER BY row_index ASC"); - if let Some(limit) = limit { - builder.push(" LIMIT "); - builder.push_bind(limit as i64); - } - let rows = builder - .build_query_as::() - .fetch_all(self.pool.as_ref()) - .await?; - rows.into_iter().map(AgentJobItem::try_from).collect() - } - - pub async fn get_agent_job_item( - &self, - job_id: &str, - item_id: &str, - ) -> anyhow::Result> { - let row = sqlx::query_as::<_, AgentJobItemRow>( - r#" -SELECT - job_id, - item_id, - row_index, - source_id, - row_json, - status, - assigned_thread_id, - attempt_count, - result_json, - last_error, - created_at, - updated_at, - completed_at, - reported_at -FROM agent_job_items -WHERE job_id = ? AND item_id = ? - "#, - ) - .bind(job_id) - .bind(item_id) - .fetch_optional(self.pool.as_ref()) - .await?; - row.map(AgentJobItem::try_from).transpose() - } - - pub async fn mark_agent_job_running(&self, job_id: &str) -> anyhow::Result<()> { - let now = Utc::now().timestamp(); - sqlx::query( - r#" -UPDATE agent_jobs -SET - status = ?, - updated_at = ?, - started_at = COALESCE(started_at, ?), - completed_at = NULL, - last_error = NULL -WHERE id = ? - "#, - ) - .bind(AgentJobStatus::Running.as_str()) - .bind(now) - .bind(now) - .bind(job_id) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - pub async fn mark_agent_job_completed(&self, job_id: &str) -> anyhow::Result<()> { - let now = Utc::now().timestamp(); - sqlx::query( - r#" -UPDATE agent_jobs -SET status = ?, updated_at = ?, completed_at = ?, last_error = NULL -WHERE id = ? - "#, - ) - .bind(AgentJobStatus::Completed.as_str()) - .bind(now) - .bind(now) - .bind(job_id) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - pub async fn mark_agent_job_failed( - &self, - job_id: &str, - error_message: &str, - ) -> anyhow::Result<()> { - let now = Utc::now().timestamp(); - sqlx::query( - r#" -UPDATE agent_jobs -SET status = ?, updated_at = ?, completed_at = ?, last_error = ? -WHERE id = ? - "#, - ) - .bind(AgentJobStatus::Failed.as_str()) - .bind(now) - .bind(now) - .bind(error_message) - .bind(job_id) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } - - pub async fn mark_agent_job_cancelled( - &self, - job_id: &str, - reason: &str, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_jobs -SET status = ?, updated_at = ?, completed_at = ?, last_error = ? -WHERE id = ? AND status IN (?, ?) - "#, - ) - .bind(AgentJobStatus::Cancelled.as_str()) - .bind(now) - .bind(now) - .bind(reason) - .bind(job_id) - .bind(AgentJobStatus::Pending.as_str()) - .bind(AgentJobStatus::Running.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn is_agent_job_cancelled(&self, job_id: &str) -> anyhow::Result { - let row = sqlx::query( - r#" -SELECT status -FROM agent_jobs -WHERE id = ? - "#, - ) - .bind(job_id) - .fetch_optional(self.pool.as_ref()) - .await?; - let Some(row) = row else { - return Ok(false); - }; - let status: String = row.try_get("status")?; - Ok(AgentJobStatus::parse(status.as_str())? == AgentJobStatus::Cancelled) - } - - pub async fn mark_agent_job_item_running( - &self, - job_id: &str, - item_id: &str, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET - status = ?, - assigned_thread_id = NULL, - attempt_count = attempt_count + 1, - updated_at = ?, - last_error = NULL -WHERE job_id = ? AND item_id = ? AND status = ? - "#, - ) - .bind(AgentJobItemStatus::Running.as_str()) - .bind(now) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Pending.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn mark_agent_job_item_running_with_thread( - &self, - job_id: &str, - item_id: &str, - thread_id: &str, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET - status = ?, - assigned_thread_id = ?, - attempt_count = attempt_count + 1, - updated_at = ?, - last_error = NULL -WHERE job_id = ? AND item_id = ? AND status = ? - "#, - ) - .bind(AgentJobItemStatus::Running.as_str()) - .bind(thread_id) - .bind(now) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Pending.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn mark_agent_job_item_pending( - &self, - job_id: &str, - item_id: &str, - error_message: Option<&str>, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET - status = ?, - assigned_thread_id = NULL, - updated_at = ?, - last_error = ? -WHERE job_id = ? AND item_id = ? AND status = ? - "#, - ) - .bind(AgentJobItemStatus::Pending.as_str()) - .bind(now) - .bind(error_message) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Running.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn set_agent_job_item_thread( - &self, - job_id: &str, - item_id: &str, - thread_id: &str, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET assigned_thread_id = ?, updated_at = ? -WHERE job_id = ? AND item_id = ? AND status = ? - "#, - ) - .bind(thread_id) - .bind(now) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Running.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn report_agent_job_item_result( - &self, - job_id: &str, - item_id: &str, - reporting_thread_id: &str, - result_json: &Value, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let serialized = serde_json::to_string(result_json)?; - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET - result_json = ?, - reported_at = ?, - updated_at = ?, - last_error = NULL -WHERE - job_id = ? - AND item_id = ? - AND status = ? - AND assigned_thread_id = ? - "#, - ) - .bind(serialized) - .bind(now) - .bind(now) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Running.as_str()) - .bind(reporting_thread_id) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn mark_agent_job_item_completed( - &self, - job_id: &str, - item_id: &str, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET - status = ?, - completed_at = ?, - updated_at = ?, - assigned_thread_id = NULL -WHERE - job_id = ? - AND item_id = ? - AND status = ? - AND result_json IS NOT NULL - "#, - ) - .bind(AgentJobItemStatus::Completed.as_str()) - .bind(now) - .bind(now) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Running.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn mark_agent_job_item_failed( - &self, - job_id: &str, - item_id: &str, - error_message: &str, - ) -> anyhow::Result { - let now = Utc::now().timestamp(); - let result = sqlx::query( - r#" -UPDATE agent_job_items -SET - status = ?, - completed_at = ?, - updated_at = ?, - last_error = ?, - assigned_thread_id = NULL -WHERE - job_id = ? - AND item_id = ? - AND status = ? - "#, - ) - .bind(AgentJobItemStatus::Failed.as_str()) - .bind(now) - .bind(now) - .bind(error_message) - .bind(job_id) - .bind(item_id) - .bind(AgentJobItemStatus::Running.as_str()) - .execute(self.pool.as_ref()) - .await?; - Ok(result.rows_affected() > 0) - } - - pub async fn get_agent_job_progress(&self, job_id: &str) -> anyhow::Result { - let row = sqlx::query( - r#" -SELECT - COUNT(*) AS total_items, - SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS pending_items, - SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS running_items, - SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS completed_items, - SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS failed_items -FROM agent_job_items -WHERE job_id = ? - "#, - ) - .bind(AgentJobItemStatus::Pending.as_str()) - .bind(AgentJobItemStatus::Running.as_str()) - .bind(AgentJobItemStatus::Completed.as_str()) - .bind(AgentJobItemStatus::Failed.as_str()) - .bind(job_id) - .fetch_one(self.pool.as_ref()) - .await?; - - let total_items: i64 = row.try_get("total_items")?; - let pending_items: Option = row.try_get("pending_items")?; - let running_items: Option = row.try_get("running_items")?; - let completed_items: Option = row.try_get("completed_items")?; - let failed_items: Option = row.try_get("failed_items")?; - Ok(AgentJobProgress { - total_items: usize::try_from(total_items).unwrap_or_default(), - pending_items: usize::try_from(pending_items.unwrap_or_default()).unwrap_or_default(), - running_items: usize::try_from(running_items.unwrap_or_default()).unwrap_or_default(), - completed_items: usize::try_from(completed_items.unwrap_or_default()) - .unwrap_or_default(), - failed_items: usize::try_from(failed_items.unwrap_or_default()).unwrap_or_default(), - }) - } - - async fn ensure_backfill_state_row(&self) -> anyhow::Result<()> { - sqlx::query( - r#" -INSERT INTO backfill_state (id, status, last_watermark, last_success_at, updated_at) -VALUES (?, ?, NULL, NULL, ?) -ON CONFLICT(id) DO NOTHING - "#, - ) - .bind(1_i64) - .bind(crate::BackfillStatus::Pending.as_str()) - .bind(Utc::now().timestamp()) - .execute(self.pool.as_ref()) - .await?; - Ok(()) - } -} - -fn push_log_filters<'a>(builder: &mut QueryBuilder<'a, Sqlite>, query: &'a LogQuery) { - if let Some(level_upper) = query.level_upper.as_ref() { - builder - .push(" AND UPPER(level) = ") - .push_bind(level_upper.as_str()); - } - if let Some(from_ts) = query.from_ts { - builder.push(" AND ts >= ").push_bind(from_ts); - } - if let Some(to_ts) = query.to_ts { - builder.push(" AND ts <= ").push_bind(to_ts); - } - push_like_filters(builder, "module_path", &query.module_like); - push_like_filters(builder, "file", &query.file_like); - let has_thread_filter = !query.thread_ids.is_empty() || query.include_threadless; - if has_thread_filter { - builder.push(" AND ("); - let mut needs_or = false; - for thread_id in &query.thread_ids { - if needs_or { - builder.push(" OR "); - } - builder.push("thread_id = ").push_bind(thread_id.as_str()); - needs_or = true; - } - if query.include_threadless { - if needs_or { - builder.push(" OR "); - } - builder.push("thread_id IS NULL"); - } - builder.push(")"); - } - if let Some(after_id) = query.after_id { - builder.push(" AND id > ").push_bind(after_id); - } - if let Some(search) = query.search.as_ref() { - builder.push(" AND INSTR(message, "); - builder.push_bind(search.as_str()); - builder.push(") > 0"); - } -} - -fn push_like_filters<'a>( - builder: &mut QueryBuilder<'a, Sqlite>, - column: &str, - filters: &'a [String], -) { - if filters.is_empty() { - return; - } - builder.push(" AND ("); - for (idx, filter) in filters.iter().enumerate() { - if idx > 0 { - builder.push(" OR "); - } - builder - .push(column) - .push(" LIKE '%' || ") - .push_bind(filter.as_str()) - .push(" || '%'"); - } - builder.push(")"); -} - -fn extract_dynamic_tools(items: &[RolloutItem]) -> Option>> { - items.iter().find_map(|item| match item { - RolloutItem::SessionMeta(meta_line) => Some(meta_line.meta.dynamic_tools.clone()), - RolloutItem::ResponseItem(_) - | RolloutItem::Compacted(_) - | RolloutItem::TurnContext(_) - | RolloutItem::EventMsg(_) => None, - }) } async fn open_sqlite(path: &Path) -> anyhow::Result { @@ -1650,2902 +198,3 @@ fn should_remove_state_file(file_name: &str, current_name: &str) -> bool { }; !version_suffix.is_empty() && version_suffix.chars().all(|ch| ch.is_ascii_digit()) } - -fn push_thread_filters<'a>( - builder: &mut QueryBuilder<'a, Sqlite>, - archived_only: bool, - allowed_sources: &'a [String], - model_providers: Option<&'a [String]>, - anchor: Option<&crate::Anchor>, - sort_key: SortKey, - search_term: Option<&'a str>, -) { - builder.push(" WHERE 1 = 1"); - if archived_only { - builder.push(" AND archived = 1"); - } else { - builder.push(" AND archived = 0"); - } - builder.push(" AND first_user_message <> ''"); - if !allowed_sources.is_empty() { - builder.push(" AND source IN ("); - let mut separated = builder.separated(", "); - for source in allowed_sources { - separated.push_bind(source); - } - separated.push_unseparated(")"); - } - if let Some(model_providers) = model_providers - && !model_providers.is_empty() - { - builder.push(" AND model_provider IN ("); - let mut separated = builder.separated(", "); - for provider in model_providers { - separated.push_bind(provider); - } - separated.push_unseparated(")"); - } - if let Some(search_term) = search_term { - builder.push(" AND instr(title, "); - builder.push_bind(search_term); - builder.push(") > 0"); - } - if let Some(anchor) = anchor { - let anchor_ts = datetime_to_epoch_seconds(anchor.ts); - let column = match sort_key { - SortKey::CreatedAt => "created_at", - SortKey::UpdatedAt => "updated_at", - }; - builder.push(" AND ("); - builder.push(column); - builder.push(" < "); - builder.push_bind(anchor_ts); - builder.push(" OR ("); - builder.push(column); - builder.push(" = "); - builder.push_bind(anchor_ts); - builder.push(" AND id < "); - builder.push_bind(anchor.id.to_string()); - builder.push("))"); - } -} - -fn push_thread_order_and_limit( - builder: &mut QueryBuilder<'_, Sqlite>, - sort_key: SortKey, - limit: usize, -) { - let order_column = match sort_key { - SortKey::CreatedAt => "created_at", - SortKey::UpdatedAt => "updated_at", - }; - builder.push(" ORDER BY "); - builder.push(order_column); - builder.push(" DESC, id DESC"); - builder.push(" LIMIT "); - builder.push_bind(limit as i64); -} - -#[cfg(test)] -mod tests { - use super::StateRuntime; - use super::ThreadMetadata; - use super::state_db_filename; - use crate::LogEntry; - use crate::LogQuery; - use crate::STATE_DB_FILENAME; - use crate::STATE_DB_VERSION; - use crate::model::Phase2JobClaimOutcome; - use crate::model::Stage1JobClaimOutcome; - use crate::model::Stage1StartupClaimParams; - use chrono::DateTime; - use chrono::Duration; - use chrono::Utc; - use codex_protocol::ThreadId; - use codex_protocol::protocol::AskForApproval; - use codex_protocol::protocol::SandboxPolicy; - use pretty_assertions::assert_eq; - use sqlx::Row; - use std::path::Path; - use std::path::PathBuf; - use std::sync::Arc; - use std::time::SystemTime; - use std::time::UNIX_EPOCH; - use uuid::Uuid; - - fn unique_temp_dir() -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_or(0, |duration| duration.as_nanos()); - std::env::temp_dir().join(format!( - "codex-state-runtime-test-{nanos}-{}", - Uuid::new_v4() - )) - } - - #[tokio::test] - async fn init_removes_legacy_state_db_files() { - let codex_home = unique_temp_dir(); - tokio::fs::create_dir_all(&codex_home) - .await - .expect("create codex_home"); - - let current_name = state_db_filename(); - let previous_version = STATE_DB_VERSION.saturating_sub(1); - let unversioned_name = format!("{STATE_DB_FILENAME}.sqlite"); - for suffix in ["", "-wal", "-shm", "-journal"] { - let path = codex_home.join(format!("{unversioned_name}{suffix}")); - tokio::fs::write(path, b"legacy") - .await - .expect("write legacy"); - let old_version_path = codex_home.join(format!( - "{STATE_DB_FILENAME}_{previous_version}.sqlite{suffix}" - )); - tokio::fs::write(old_version_path, b"old_version") - .await - .expect("write old version"); - } - let unrelated_path = codex_home.join("state.sqlite_backup"); - tokio::fs::write(&unrelated_path, b"keep") - .await - .expect("write unrelated"); - let numeric_path = codex_home.join("123"); - tokio::fs::write(&numeric_path, b"keep") - .await - .expect("write numeric"); - - let _runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - for suffix in ["", "-wal", "-shm", "-journal"] { - let legacy_path = codex_home.join(format!("{unversioned_name}{suffix}")); - assert_eq!( - tokio::fs::try_exists(&legacy_path) - .await - .expect("check legacy path"), - false - ); - let old_version_path = codex_home.join(format!( - "{STATE_DB_FILENAME}_{previous_version}.sqlite{suffix}" - )); - assert_eq!( - tokio::fs::try_exists(&old_version_path) - .await - .expect("check old version path"), - false - ); - } - assert_eq!( - tokio::fs::try_exists(codex_home.join(current_name)) - .await - .expect("check new db path"), - true - ); - assert_eq!( - tokio::fs::try_exists(&unrelated_path) - .await - .expect("check unrelated path"), - true - ); - assert_eq!( - tokio::fs::try_exists(&numeric_path) - .await - .expect("check numeric path"), - true - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn backfill_state_persists_progress_and_completion() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let initial = runtime - .get_backfill_state() - .await - .expect("get initial backfill state"); - assert_eq!(initial.status, crate::BackfillStatus::Pending); - assert_eq!(initial.last_watermark, None); - assert_eq!(initial.last_success_at, None); - - runtime - .mark_backfill_running() - .await - .expect("mark backfill running"); - runtime - .checkpoint_backfill("sessions/2026/01/27/rollout-a.jsonl") - .await - .expect("checkpoint backfill"); - - let running = runtime - .get_backfill_state() - .await - .expect("get running backfill state"); - assert_eq!(running.status, crate::BackfillStatus::Running); - assert_eq!( - running.last_watermark, - Some("sessions/2026/01/27/rollout-a.jsonl".to_string()) - ); - assert_eq!(running.last_success_at, None); - - runtime - .mark_backfill_complete(Some("sessions/2026/01/28/rollout-b.jsonl")) - .await - .expect("mark backfill complete"); - let completed = runtime - .get_backfill_state() - .await - .expect("get completed backfill state"); - assert_eq!(completed.status, crate::BackfillStatus::Complete); - assert_eq!( - completed.last_watermark, - Some("sessions/2026/01/28/rollout-b.jsonl".to_string()) - ); - assert!(completed.last_success_at.is_some()); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn backfill_claim_is_singleton_until_stale_and_blocked_when_complete() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let claimed = runtime - .try_claim_backfill(3600) - .await - .expect("initial backfill claim"); - assert_eq!(claimed, true); - - let duplicate_claim = runtime - .try_claim_backfill(3600) - .await - .expect("duplicate backfill claim"); - assert_eq!(duplicate_claim, false); - - let stale_updated_at = Utc::now().timestamp().saturating_sub(10_000); - sqlx::query( - r#" -UPDATE backfill_state -SET status = ?, updated_at = ? -WHERE id = 1 - "#, - ) - .bind(crate::BackfillStatus::Running.as_str()) - .bind(stale_updated_at) - .execute(runtime.pool.as_ref()) - .await - .expect("force stale backfill lease"); - - let stale_claim = runtime - .try_claim_backfill(10) - .await - .expect("stale backfill claim"); - assert_eq!(stale_claim, true); - - runtime - .mark_backfill_complete(None) - .await - .expect("mark complete"); - let claim_after_complete = runtime - .try_claim_backfill(3600) - .await - .expect("claim after complete"); - assert_eq!(claim_after_complete, false); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn stage1_claim_skips_when_up_to_date() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let metadata = test_thread_metadata(&codex_home, thread_id, codex_home.join("a")); - runtime - .upsert_thread(&metadata) - .await - .expect("upsert thread"); - - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - - let claim = runtime - .try_claim_stage1_job(thread_id, owner_a, 100, 3600, 64) - .await - .expect("claim stage1 job"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected claim outcome: {other:?}"), - }; - - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - ownership_token.as_str(), - 100, - "raw", - "sum", - None, - ) - .await - .expect("mark stage1 succeeded"), - "stage1 success should finalize for current token" - ); - - let up_to_date = runtime - .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) - .await - .expect("claim stage1 up-to-date"); - assert_eq!(up_to_date, Stage1JobClaimOutcome::SkippedUpToDate); - - let needs_rerun = runtime - .try_claim_stage1_job(thread_id, owner_b, 101, 3600, 64) - .await - .expect("claim stage1 newer source"); - assert!( - matches!(needs_rerun, Stage1JobClaimOutcome::Claimed { .. }), - "newer source_updated_at should be claimable" - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn stage1_running_stale_can_be_stolen_but_fresh_running_is_skipped() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let cwd = codex_home.join("workspace"); - runtime - .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd)) - .await - .expect("upsert thread"); - - let claim_a = runtime - .try_claim_stage1_job(thread_id, owner_a, 100, 3600, 64) - .await - .expect("claim a"); - assert!(matches!(claim_a, Stage1JobClaimOutcome::Claimed { .. })); - - let claim_b_fresh = runtime - .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) - .await - .expect("claim b fresh"); - assert_eq!(claim_b_fresh, Stage1JobClaimOutcome::SkippedRunning); - - sqlx::query("UPDATE jobs SET lease_until = 0 WHERE kind = 'memory_stage1' AND job_key = ?") - .bind(thread_id.to_string()) - .execute(runtime.pool.as_ref()) - .await - .expect("force stale lease"); - - let claim_b_stale = runtime - .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) - .await - .expect("claim b stale"); - assert!(matches!( - claim_b_stale, - Stage1JobClaimOutcome::Claimed { .. } - )); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn stage1_concurrent_claim_for_same_thread_is_conflict_safe() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let thread_id_a = thread_id; - let thread_id_b = thread_id; - let runtime_a = Arc::clone(&runtime); - let runtime_b = Arc::clone(&runtime); - let claim_with_retry = |runtime: Arc, - thread_id: ThreadId, - owner: ThreadId| async move { - for attempt in 0..5 { - match runtime - .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64) - .await - { - Ok(outcome) => return outcome, - Err(err) if err.to_string().contains("database is locked") && attempt < 4 => { - tokio::time::sleep(std::time::Duration::from_millis(10)).await; - } - Err(err) => panic!("claim stage1 should not fail: {err}"), - } - } - panic!("claim stage1 should have returned within retry budget") - }; - - let (claim_a, claim_b) = tokio::join!( - claim_with_retry(runtime_a, thread_id_a, owner_a), - claim_with_retry(runtime_b, thread_id_b, owner_b), - ); - - let claim_outcomes = vec![claim_a, claim_b]; - let claimed_count = claim_outcomes - .iter() - .filter(|outcome| matches!(outcome, Stage1JobClaimOutcome::Claimed { .. })) - .count(); - assert_eq!(claimed_count, 1); - assert!( - claim_outcomes.iter().all(|outcome| { - matches!( - outcome, - Stage1JobClaimOutcome::Claimed { .. } | Stage1JobClaimOutcome::SkippedRunning - ) - }), - "unexpected claim outcomes: {claim_outcomes:?}" - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn stage1_concurrent_claims_respect_running_cap() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let thread_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_a, - codex_home.join("workspace-a"), - )) - .await - .expect("upsert thread a"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_b, - codex_home.join("workspace-b"), - )) - .await - .expect("upsert thread b"); - - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let runtime_a = Arc::clone(&runtime); - let runtime_b = Arc::clone(&runtime); - - let (claim_a, claim_b) = tokio::join!( - async move { - runtime_a - .try_claim_stage1_job(thread_a, owner_a, 100, 3_600, 1) - .await - .expect("claim stage1 thread a") - }, - async move { - runtime_b - .try_claim_stage1_job(thread_b, owner_b, 101, 3_600, 1) - .await - .expect("claim stage1 thread b") - }, - ); - - let claim_outcomes = vec![claim_a, claim_b]; - let claimed_count = claim_outcomes - .iter() - .filter(|outcome| matches!(outcome, Stage1JobClaimOutcome::Claimed { .. })) - .count(); - assert_eq!(claimed_count, 1); - assert!( - claim_outcomes - .iter() - .any(|outcome| { matches!(outcome, Stage1JobClaimOutcome::SkippedRunning) }), - "one concurrent claim should be throttled by running cap: {claim_outcomes:?}" - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn claim_stage1_jobs_filters_by_age_idle_and_current_thread() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let now = Utc::now(); - let fresh_at = now - Duration::hours(1); - let just_under_idle_at = now - Duration::hours(12) + Duration::minutes(1); - let eligible_idle_at = now - Duration::hours(12) - Duration::minutes(1); - let old_at = now - Duration::days(31); - - let current_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); - let fresh_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("fresh thread id"); - let just_under_idle_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("just under idle thread id"); - let eligible_idle_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("eligible idle thread id"); - let old_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("old thread id"); - - let mut current = - test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current")); - current.created_at = now; - current.updated_at = now; - runtime - .upsert_thread(¤t) - .await - .expect("upsert current"); - - let mut fresh = - test_thread_metadata(&codex_home, fresh_thread_id, codex_home.join("fresh")); - fresh.created_at = fresh_at; - fresh.updated_at = fresh_at; - runtime.upsert_thread(&fresh).await.expect("upsert fresh"); - - let mut just_under_idle = test_thread_metadata( - &codex_home, - just_under_idle_thread_id, - codex_home.join("just-under-idle"), - ); - just_under_idle.created_at = just_under_idle_at; - just_under_idle.updated_at = just_under_idle_at; - runtime - .upsert_thread(&just_under_idle) - .await - .expect("upsert just-under-idle"); - - let mut eligible_idle = test_thread_metadata( - &codex_home, - eligible_idle_thread_id, - codex_home.join("eligible-idle"), - ); - eligible_idle.created_at = eligible_idle_at; - eligible_idle.updated_at = eligible_idle_at; - runtime - .upsert_thread(&eligible_idle) - .await - .expect("upsert eligible-idle"); - - let mut old = test_thread_metadata(&codex_home, old_thread_id, codex_home.join("old")); - old.created_at = old_at; - old.updated_at = old_at; - runtime.upsert_thread(&old).await.expect("upsert old"); - - let allowed_sources = vec!["cli".to_string()]; - let claims = runtime - .claim_stage1_jobs_for_startup( - current_thread_id, - Stage1StartupClaimParams { - scan_limit: 1, - max_claimed: 5, - max_age_days: 30, - min_rollout_idle_hours: 12, - allowed_sources: allowed_sources.as_slice(), - lease_seconds: 3600, - }, - ) - .await - .expect("claim stage1 jobs"); - - assert_eq!(claims.len(), 1); - assert_eq!(claims[0].thread.id, eligible_idle_thread_id); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn claim_stage1_jobs_prefilters_threads_with_up_to_date_memory() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let now = Utc::now(); - let eligible_newer_at = now - Duration::hours(13); - let eligible_older_at = now - Duration::hours(14); - - let current_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); - let up_to_date_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("up-to-date thread id"); - let stale_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("stale thread id"); - let worker_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("worker id"); - - let mut current = - test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current")); - current.created_at = now; - current.updated_at = now; - runtime - .upsert_thread(¤t) - .await - .expect("upsert current thread"); - - let mut up_to_date = test_thread_metadata( - &codex_home, - up_to_date_thread_id, - codex_home.join("up-to-date"), - ); - up_to_date.created_at = eligible_newer_at; - up_to_date.updated_at = eligible_newer_at; - runtime - .upsert_thread(&up_to_date) - .await - .expect("upsert up-to-date thread"); - - let up_to_date_claim = runtime - .try_claim_stage1_job( - up_to_date_thread_id, - worker_id, - up_to_date.updated_at.timestamp(), - 3600, - 64, - ) - .await - .expect("claim up-to-date thread for seed"); - let up_to_date_token = match up_to_date_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected seed claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - up_to_date_thread_id, - up_to_date_token.as_str(), - up_to_date.updated_at.timestamp(), - "raw", - "summary", - None, - ) - .await - .expect("mark up-to-date thread succeeded"), - "seed stage1 success should complete for up-to-date thread" - ); - - let mut stale = - test_thread_metadata(&codex_home, stale_thread_id, codex_home.join("stale")); - stale.created_at = eligible_older_at; - stale.updated_at = eligible_older_at; - runtime - .upsert_thread(&stale) - .await - .expect("upsert stale thread"); - - let allowed_sources = vec!["cli".to_string()]; - let claims = runtime - .claim_stage1_jobs_for_startup( - current_thread_id, - Stage1StartupClaimParams { - scan_limit: 1, - max_claimed: 1, - max_age_days: 30, - min_rollout_idle_hours: 12, - allowed_sources: allowed_sources.as_slice(), - lease_seconds: 3600, - }, - ) - .await - .expect("claim stage1 startup jobs"); - assert_eq!(claims.len(), 1); - assert_eq!(claims[0].thread.id, stale_thread_id); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn claim_stage1_jobs_enforces_global_running_cap() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let current_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - current_thread_id, - codex_home.join("current"), - )) - .await - .expect("upsert current"); - - let now = Utc::now(); - let started_at = now.timestamp(); - let lease_until = started_at + 3600; - let eligible_at = now - Duration::hours(13); - let existing_running = 10usize; - let total_candidates = 80usize; - - for idx in 0..total_candidates { - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let mut metadata = test_thread_metadata( - &codex_home, - thread_id, - codex_home.join(format!("thread-{idx}")), - ); - metadata.created_at = eligible_at - Duration::seconds(idx as i64); - metadata.updated_at = eligible_at - Duration::seconds(idx as i64); - runtime - .upsert_thread(&metadata) - .await - .expect("upsert thread"); - - if idx < existing_running { - sqlx::query( - r#" -INSERT INTO jobs ( - kind, - job_key, - status, - worker_id, - ownership_token, - started_at, - finished_at, - lease_until, - retry_at, - retry_remaining, - last_error, - input_watermark, - last_success_watermark -) VALUES (?, ?, 'running', ?, ?, ?, NULL, ?, NULL, ?, NULL, ?, NULL) - "#, - ) - .bind("memory_stage1") - .bind(thread_id.to_string()) - .bind(current_thread_id.to_string()) - .bind(Uuid::new_v4().to_string()) - .bind(started_at) - .bind(lease_until) - .bind(3) - .bind(metadata.updated_at.timestamp()) - .execute(runtime.pool.as_ref()) - .await - .expect("seed running stage1 job"); - } - } - - let allowed_sources = vec!["cli".to_string()]; - let claims = runtime - .claim_stage1_jobs_for_startup( - current_thread_id, - Stage1StartupClaimParams { - scan_limit: 200, - max_claimed: 64, - max_age_days: 30, - min_rollout_idle_hours: 12, - allowed_sources: allowed_sources.as_slice(), - lease_seconds: 3600, - }, - ) - .await - .expect("claim stage1 jobs"); - assert_eq!(claims.len(), 54); - - let running_count = sqlx::query( - r#" -SELECT COUNT(*) AS count -FROM jobs -WHERE kind = 'memory_stage1' - AND status = 'running' - AND lease_until IS NOT NULL - AND lease_until > ? - "#, - ) - .bind(Utc::now().timestamp()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("count running stage1 jobs") - .try_get::("count") - .expect("running count value"); - assert_eq!(running_count, 64); - - let more_claims = runtime - .claim_stage1_jobs_for_startup( - current_thread_id, - Stage1StartupClaimParams { - scan_limit: 200, - max_claimed: 64, - max_age_days: 30, - min_rollout_idle_hours: 12, - allowed_sources: allowed_sources.as_slice(), - lease_seconds: 3600, - }, - ) - .await - .expect("claim stage1 jobs with cap reached"); - assert_eq!(more_claims.len(), 0); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn claim_stage1_jobs_processes_two_full_batches_across_startup_passes() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let current_thread_id = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); - let mut current = - test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current")); - current.created_at = Utc::now(); - current.updated_at = Utc::now(); - runtime - .upsert_thread(¤t) - .await - .expect("upsert current"); - - let eligible_at = Utc::now() - Duration::hours(13); - for idx in 0..200 { - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let mut metadata = test_thread_metadata( - &codex_home, - thread_id, - codex_home.join(format!("thread-{idx}")), - ); - metadata.created_at = eligible_at - Duration::seconds(idx as i64); - metadata.updated_at = eligible_at - Duration::seconds(idx as i64); - runtime - .upsert_thread(&metadata) - .await - .expect("upsert eligible thread"); - } - - let allowed_sources = vec!["cli".to_string()]; - let first_claims = runtime - .claim_stage1_jobs_for_startup( - current_thread_id, - Stage1StartupClaimParams { - scan_limit: 5_000, - max_claimed: 64, - max_age_days: 30, - min_rollout_idle_hours: 12, - allowed_sources: allowed_sources.as_slice(), - lease_seconds: 3_600, - }, - ) - .await - .expect("first stage1 startup claim"); - assert_eq!(first_claims.len(), 64); - - for claim in first_claims { - assert!( - runtime - .mark_stage1_job_succeeded( - claim.thread.id, - claim.ownership_token.as_str(), - claim.thread.updated_at.timestamp(), - "raw", - "summary", - None, - ) - .await - .expect("mark first-batch stage1 success"), - "first batch stage1 completion should succeed" - ); - } - - let second_claims = runtime - .claim_stage1_jobs_for_startup( - current_thread_id, - Stage1StartupClaimParams { - scan_limit: 5_000, - max_claimed: 64, - max_age_days: 30, - min_rollout_idle_hours: 12, - allowed_sources: allowed_sources.as_slice(), - lease_seconds: 3_600, - }, - ) - .await - .expect("second stage1 startup claim"); - assert_eq!(second_claims.len(), 64); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn stage1_output_cascades_on_thread_delete() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let cwd = codex_home.join("workspace"); - runtime - .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd)) - .await - .expect("upsert thread"); - - let claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) - .await - .expect("claim stage1"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - ownership_token.as_str(), - 100, - "raw", - "sum", - None, - ) - .await - .expect("mark stage1 succeeded"), - "mark stage1 succeeded should write stage1_outputs" - ); - - let count_before = - sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("count before delete") - .try_get::("count") - .expect("count value"); - assert_eq!(count_before, 1); - - sqlx::query("DELETE FROM threads WHERE id = ?") - .bind(thread_id.to_string()) - .execute(runtime.pool.as_ref()) - .await - .expect("delete thread"); - - let count_after = - sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("count after delete") - .try_get::("count") - .expect("count value"); - assert_eq!(count_after, 0); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn mark_stage1_job_succeeded_no_output_skips_phase2_when_output_was_already_absent() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - let claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) - .await - .expect("claim stage1"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded_no_output(thread_id, ownership_token.as_str()) - .await - .expect("mark stage1 succeeded without output"), - "stage1 no-output success should complete the job" - ); - - let output_row_count = - sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load stage1 output count") - .try_get::("count") - .expect("stage1 output count"); - assert_eq!( - output_row_count, 0, - "stage1 no-output success should not persist empty stage1 outputs" - ); - - let up_to_date = runtime - .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) - .await - .expect("claim stage1 up-to-date"); - assert_eq!(up_to_date, Stage1JobClaimOutcome::SkippedUpToDate); - - let global_job_row_count = sqlx::query("SELECT COUNT(*) AS count FROM jobs WHERE kind = ?") - .bind("memory_consolidate_global") - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load phase2 job row count") - .try_get::("count") - .expect("phase2 job row count"); - assert_eq!( - global_job_row_count, 0, - "no-output without an existing stage1 output should not enqueue phase2" - ); - - let claim_phase2 = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2"); - assert_eq!( - claim_phase2, - Phase2JobClaimOutcome::SkippedNotDirty, - "phase2 should remain clean when no-output deleted nothing" - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn mark_stage1_job_succeeded_no_output_enqueues_phase2_when_deleting_output() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - let first_claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) - .await - .expect("claim initial stage1"); - let first_token = match first_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected initial stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded(thread_id, first_token.as_str(), 100, "raw", "sum", None) - .await - .expect("mark initial stage1 succeeded"), - "initial stage1 success should create stage1 output" - ); - - let phase2_claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2 after initial output"); - let (phase2_token, phase2_input_watermark) = match phase2_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim after initial output: {other:?}"), - }; - assert_eq!(phase2_input_watermark, 100); - assert!( - runtime - .mark_global_phase2_job_succeeded( - phase2_token.as_str(), - phase2_input_watermark, - &[], - ) - .await - .expect("mark initial phase2 succeeded"), - "initial phase2 success should clear global dirty state" - ); - - let no_output_claim = runtime - .try_claim_stage1_job(thread_id, owner_b, 101, 3600, 64) - .await - .expect("claim stage1 for no-output delete"); - let no_output_token = match no_output_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected no-output stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded_no_output(thread_id, no_output_token.as_str()) - .await - .expect("mark stage1 no-output after existing output"), - "no-output should succeed when deleting an existing stage1 output" - ); - - let output_row_count = - sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load stage1 output count after delete") - .try_get::("count") - .expect("stage1 output count"); - assert_eq!(output_row_count, 0); - - let claim_phase2 = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2 after no-output deletion"); - let (phase2_token, phase2_input_watermark) = match claim_phase2 { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim after no-output deletion: {other:?}"), - }; - assert_eq!(phase2_input_watermark, 101); - assert!( - runtime - .mark_global_phase2_job_succeeded( - phase2_token.as_str(), - phase2_input_watermark, - &[], - ) - .await - .expect("mark phase2 succeeded after no-output delete") - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn stage1_retry_exhaustion_does_not_block_newer_watermark() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - for attempt in 0..3 { - let claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64) - .await - .expect("claim stage1 for retry exhaustion"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!( - "attempt {} should claim stage1 before retries are exhausted: {other:?}", - attempt + 1 - ), - }; - assert!( - runtime - .mark_stage1_job_failed(thread_id, ownership_token.as_str(), "boom", 0) - .await - .expect("mark stage1 failed"), - "attempt {} should decrement retry budget", - attempt + 1 - ); - } - - let exhausted_claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64) - .await - .expect("claim stage1 after retry exhaustion"); - assert_eq!( - exhausted_claim, - Stage1JobClaimOutcome::SkippedRetryExhausted - ); - - let newer_source_claim = runtime - .try_claim_stage1_job(thread_id, owner, 101, 3_600, 64) - .await - .expect("claim stage1 with newer source watermark"); - assert!( - matches!(newer_source_claim, Stage1JobClaimOutcome::Claimed { .. }), - "newer source watermark should reset retry budget and be claimable" - ); - - let job_row = sqlx::query( - "SELECT retry_remaining, input_watermark FROM jobs WHERE kind = ? AND job_key = ?", - ) - .bind("memory_stage1") - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load stage1 job row after newer-source claim"); - assert_eq!( - job_row - .try_get::("retry_remaining") - .expect("retry_remaining"), - 3 - ); - assert_eq!( - job_row - .try_get::("input_watermark") - .expect("input_watermark"), - 101 - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn phase2_global_consolidation_reruns_when_watermark_advances() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - - runtime - .enqueue_global_consolidation(100) - .await - .expect("enqueue global consolidation"); - - let claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2"); - let (ownership_token, input_watermark) = match claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_global_phase2_job_succeeded(ownership_token.as_str(), input_watermark, &[],) - .await - .expect("mark phase2 succeeded"), - "phase2 success should finalize for current token" - ); - - let claim_up_to_date = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2 up-to-date"); - assert_eq!(claim_up_to_date, Phase2JobClaimOutcome::SkippedNotDirty); - - runtime - .enqueue_global_consolidation(101) - .await - .expect("enqueue global consolidation again"); - - let claim_rerun = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2 rerun"); - assert!( - matches!(claim_rerun, Phase2JobClaimOutcome::Claimed { .. }), - "advanced watermark should be claimable" - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn list_stage1_outputs_for_global_returns_latest_outputs() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let thread_id_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id_a, - codex_home.join("workspace-a"), - )) - .await - .expect("upsert thread a"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id_b, - codex_home.join("workspace-b"), - )) - .await - .expect("upsert thread b"); - - let claim = runtime - .try_claim_stage1_job(thread_id_a, owner, 100, 3600, 64) - .await - .expect("claim stage1 a"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id_a, - ownership_token.as_str(), - 100, - "raw memory a", - "summary a", - None, - ) - .await - .expect("mark stage1 succeeded a"), - "stage1 success should persist output a" - ); - - let claim = runtime - .try_claim_stage1_job(thread_id_b, owner, 101, 3600, 64) - .await - .expect("claim stage1 b"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id_b, - ownership_token.as_str(), - 101, - "raw memory b", - "summary b", - Some("rollout-b"), - ) - .await - .expect("mark stage1 succeeded b"), - "stage1 success should persist output b" - ); - - let outputs = runtime - .list_stage1_outputs_for_global(10) - .await - .expect("list stage1 outputs for global"); - assert_eq!(outputs.len(), 2); - assert_eq!(outputs[0].thread_id, thread_id_b); - assert_eq!(outputs[0].rollout_summary, "summary b"); - assert_eq!(outputs[0].rollout_slug.as_deref(), Some("rollout-b")); - assert_eq!(outputs[0].cwd, codex_home.join("workspace-b")); - assert_eq!(outputs[1].thread_id, thread_id_a); - assert_eq!(outputs[1].rollout_summary, "summary a"); - assert_eq!(outputs[1].rollout_slug, None); - assert_eq!(outputs[1].cwd, codex_home.join("workspace-a")); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn list_stage1_outputs_for_global_skips_empty_payloads() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id_non_empty = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let thread_id_empty = - ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id_non_empty, - codex_home.join("workspace-non-empty"), - )) - .await - .expect("upsert non-empty thread"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id_empty, - codex_home.join("workspace-empty"), - )) - .await - .expect("upsert empty thread"); - - sqlx::query( - r#" -INSERT INTO stage1_outputs (thread_id, source_updated_at, raw_memory, rollout_summary, generated_at) -VALUES (?, ?, ?, ?, ?) - "#, - ) - .bind(thread_id_non_empty.to_string()) - .bind(100_i64) - .bind("raw memory") - .bind("summary") - .bind(100_i64) - .execute(runtime.pool.as_ref()) - .await - .expect("insert non-empty stage1 output"); - sqlx::query( - r#" -INSERT INTO stage1_outputs (thread_id, source_updated_at, raw_memory, rollout_summary, generated_at) -VALUES (?, ?, ?, ?, ?) - "#, - ) - .bind(thread_id_empty.to_string()) - .bind(101_i64) - .bind("") - .bind("") - .bind(101_i64) - .execute(runtime.pool.as_ref()) - .await - .expect("insert empty stage1 output"); - - let outputs = runtime - .list_stage1_outputs_for_global(1) - .await - .expect("list stage1 outputs for global"); - assert_eq!(outputs.len(), 1); - assert_eq!(outputs[0].thread_id, thread_id_non_empty); - assert_eq!(outputs[0].rollout_summary, "summary"); - assert_eq!(outputs[0].cwd, codex_home.join("workspace-non-empty")); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn get_phase2_input_selection_reports_added_retained_and_removed_rows() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let thread_id_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let thread_id_c = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - - for (thread_id, workspace) in [ - (thread_id_a, "workspace-a"), - (thread_id_b, "workspace-b"), - (thread_id_c, "workspace-c"), - ] { - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join(workspace), - )) - .await - .expect("upsert thread"); - } - - for (thread_id, updated_at, slug) in [ - (thread_id_a, 100, Some("rollout-a")), - (thread_id_b, 101, Some("rollout-b")), - (thread_id_c, 102, Some("rollout-c")), - ] { - let claim = runtime - .try_claim_stage1_job(thread_id, owner, updated_at, 3600, 64) - .await - .expect("claim stage1"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - ownership_token.as_str(), - updated_at, - &format!("raw-{updated_at}"), - &format!("summary-{updated_at}"), - slug, - ) - .await - .expect("mark stage1 succeeded"), - "stage1 success should persist output" - ); - } - - let claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2"); - let (ownership_token, input_watermark) = match claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim outcome: {other:?}"), - }; - assert_eq!(input_watermark, 102); - let selected_outputs = runtime - .list_stage1_outputs_for_global(10) - .await - .expect("list stage1 outputs for global") - .into_iter() - .filter(|output| output.thread_id == thread_id_c || output.thread_id == thread_id_a) - .collect::>(); - assert!( - runtime - .mark_global_phase2_job_succeeded( - ownership_token.as_str(), - input_watermark, - &selected_outputs, - ) - .await - .expect("mark phase2 success with selection"), - "phase2 success should persist selected rows" - ); - - let selection = runtime - .get_phase2_input_selection(2) - .await - .expect("load phase2 input selection"); - - assert_eq!(selection.selected.len(), 2); - assert_eq!(selection.previous_selected.len(), 2); - assert_eq!(selection.selected[0].thread_id, thread_id_c); - assert_eq!( - selection.selected[0].rollout_path, - codex_home.join(format!("rollout-{thread_id_c}.jsonl")) - ); - assert_eq!(selection.selected[1].thread_id, thread_id_b); - assert_eq!(selection.retained_thread_ids, vec![thread_id_c]); - - assert_eq!(selection.removed.len(), 1); - assert_eq!(selection.removed[0].thread_id, thread_id_a); - assert_eq!( - selection.removed[0].rollout_slug.as_deref(), - Some("rollout-a") - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn get_phase2_input_selection_treats_regenerated_selected_rows_as_added() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - let first_claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) - .await - .expect("claim initial stage1"); - let first_token = match first_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - first_token.as_str(), - 100, - "raw-100", - "summary-100", - Some("rollout-100"), - ) - .await - .expect("mark initial stage1 success"), - "initial stage1 success should persist output" - ); - - let phase2_claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2"); - let (phase2_token, input_watermark) = match phase2_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim outcome: {other:?}"), - }; - let selected_outputs = runtime - .list_stage1_outputs_for_global(1) - .await - .expect("list selected outputs"); - assert!( - runtime - .mark_global_phase2_job_succeeded( - phase2_token.as_str(), - input_watermark, - &selected_outputs, - ) - .await - .expect("mark phase2 success"), - "phase2 success should persist selected rows" - ); - - let refreshed_claim = runtime - .try_claim_stage1_job(thread_id, owner, 101, 3600, 64) - .await - .expect("claim refreshed stage1"); - let refreshed_token = match refreshed_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - refreshed_token.as_str(), - 101, - "raw-101", - "summary-101", - Some("rollout-101"), - ) - .await - .expect("mark refreshed stage1 success"), - "refreshed stage1 success should persist output" - ); - - let selection = runtime - .get_phase2_input_selection(1) - .await - .expect("load phase2 input selection"); - assert_eq!(selection.selected.len(), 1); - assert_eq!(selection.previous_selected.len(), 1); - assert_eq!(selection.selected[0].thread_id, thread_id); - assert_eq!(selection.selected[0].source_updated_at.timestamp(), 101); - assert!(selection.retained_thread_ids.is_empty()); - assert!(selection.removed.is_empty()); - - let (selected_for_phase2, selected_for_phase2_source_updated_at) = - sqlx::query_as::<_, (i64, Option)>( - "SELECT selected_for_phase2, selected_for_phase2_source_updated_at FROM stage1_outputs WHERE thread_id = ?", - ) - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load selected_for_phase2"); - assert_eq!(selected_for_phase2, 1); - assert_eq!(selected_for_phase2_source_updated_at, Some(100)); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn get_phase2_input_selection_reports_regenerated_previous_selection_as_removed() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread a"); - let thread_id_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread b"); - let thread_id_c = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread c"); - let thread_id_d = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread d"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - - for (thread_id, workspace) in [ - (thread_id_a, "workspace-a"), - (thread_id_b, "workspace-b"), - (thread_id_c, "workspace-c"), - (thread_id_d, "workspace-d"), - ] { - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join(workspace), - )) - .await - .expect("upsert thread"); - } - - for (thread_id, updated_at, slug) in [ - (thread_id_a, 100, Some("rollout-a-100")), - (thread_id_b, 101, Some("rollout-b-101")), - (thread_id_c, 99, Some("rollout-c-99")), - (thread_id_d, 98, Some("rollout-d-98")), - ] { - let claim = runtime - .try_claim_stage1_job(thread_id, owner, updated_at, 3600, 64) - .await - .expect("claim initial stage1"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - ownership_token.as_str(), - updated_at, - &format!("raw-{updated_at}"), - &format!("summary-{updated_at}"), - slug, - ) - .await - .expect("mark stage1 succeeded"), - "stage1 success should persist output" - ); - } - - let phase2_claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2"); - let (phase2_token, input_watermark) = match phase2_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim outcome: {other:?}"), - }; - let selected_outputs = runtime - .list_stage1_outputs_for_global(2) - .await - .expect("list selected outputs"); - assert_eq!( - selected_outputs - .iter() - .map(|output| output.thread_id) - .collect::>(), - vec![thread_id_b, thread_id_a] - ); - assert!( - runtime - .mark_global_phase2_job_succeeded( - phase2_token.as_str(), - input_watermark, - &selected_outputs, - ) - .await - .expect("mark phase2 success"), - "phase2 success should persist selected rows" - ); - - for (thread_id, updated_at, slug) in [ - (thread_id_a, 102, Some("rollout-a-102")), - (thread_id_c, 103, Some("rollout-c-103")), - (thread_id_d, 104, Some("rollout-d-104")), - ] { - let claim = runtime - .try_claim_stage1_job(thread_id, owner, updated_at, 3600, 64) - .await - .expect("claim refreshed stage1"); - let ownership_token = match claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - ownership_token.as_str(), - updated_at, - &format!("raw-{updated_at}"), - &format!("summary-{updated_at}"), - slug, - ) - .await - .expect("mark refreshed stage1 success"), - "refreshed stage1 success should persist output" - ); - } - - let selection = runtime - .get_phase2_input_selection(2) - .await - .expect("load phase2 input selection"); - assert_eq!( - selection - .selected - .iter() - .map(|output| output.thread_id) - .collect::>(), - vec![thread_id_d, thread_id_c] - ); - assert_eq!( - selection - .previous_selected - .iter() - .map(|output| output.thread_id) - .collect::>(), - vec![thread_id_a, thread_id_b] - ); - assert!(selection.retained_thread_ids.is_empty()); - assert_eq!( - selection - .removed - .iter() - .map(|output| (output.thread_id, output.source_updated_at.timestamp())) - .collect::>(), - vec![(thread_id_a, 102), (thread_id_b, 101)] - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn mark_global_phase2_job_succeeded_updates_selected_snapshot_timestamp() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - let initial_claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) - .await - .expect("claim initial stage1"); - let initial_token = match initial_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - initial_token.as_str(), - 100, - "raw-100", - "summary-100", - Some("rollout-100"), - ) - .await - .expect("mark initial stage1 success"), - "initial stage1 success should persist output" - ); - - let first_phase2_claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim first phase2"); - let (first_phase2_token, first_input_watermark) = match first_phase2_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected first phase2 claim outcome: {other:?}"), - }; - let first_selected_outputs = runtime - .list_stage1_outputs_for_global(1) - .await - .expect("list first selected outputs"); - assert!( - runtime - .mark_global_phase2_job_succeeded( - first_phase2_token.as_str(), - first_input_watermark, - &first_selected_outputs, - ) - .await - .expect("mark first phase2 success"), - "first phase2 success should persist selected rows" - ); - - let refreshed_claim = runtime - .try_claim_stage1_job(thread_id, owner, 101, 3600, 64) - .await - .expect("claim refreshed stage1"); - let refreshed_token = match refreshed_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected refreshed stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - refreshed_token.as_str(), - 101, - "raw-101", - "summary-101", - Some("rollout-101"), - ) - .await - .expect("mark refreshed stage1 success"), - "refreshed stage1 success should persist output" - ); - - let second_phase2_claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim second phase2"); - let (second_phase2_token, second_input_watermark) = match second_phase2_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected second phase2 claim outcome: {other:?}"), - }; - let second_selected_outputs = runtime - .list_stage1_outputs_for_global(1) - .await - .expect("list second selected outputs"); - assert_eq!( - second_selected_outputs[0].source_updated_at.timestamp(), - 101 - ); - assert!( - runtime - .mark_global_phase2_job_succeeded( - second_phase2_token.as_str(), - second_input_watermark, - &second_selected_outputs, - ) - .await - .expect("mark second phase2 success"), - "second phase2 success should persist selected rows" - ); - - let selection = runtime - .get_phase2_input_selection(1) - .await - .expect("load phase2 input selection after refresh"); - assert_eq!(selection.retained_thread_ids, vec![thread_id]); - - let (selected_for_phase2, selected_for_phase2_source_updated_at) = - sqlx::query_as::<_, (i64, Option)>( - "SELECT selected_for_phase2, selected_for_phase2_source_updated_at FROM stage1_outputs WHERE thread_id = ?", - ) - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load selected snapshot after phase2"); - assert_eq!(selected_for_phase2, 1); - assert_eq!(selected_for_phase2_source_updated_at, Some(101)); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn mark_global_phase2_job_succeeded_only_marks_exact_selected_snapshots() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_id, - codex_home.join("workspace"), - )) - .await - .expect("upsert thread"); - - let initial_claim = runtime - .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) - .await - .expect("claim initial stage1"); - let initial_token = match initial_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - initial_token.as_str(), - 100, - "raw-100", - "summary-100", - Some("rollout-100"), - ) - .await - .expect("mark initial stage1 success"), - "initial stage1 success should persist output" - ); - - let phase2_claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim phase2"); - let (phase2_token, input_watermark) = match phase2_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected phase2 claim outcome: {other:?}"), - }; - let selected_outputs = runtime - .list_stage1_outputs_for_global(1) - .await - .expect("list selected outputs"); - assert_eq!(selected_outputs[0].source_updated_at.timestamp(), 100); - - let refreshed_claim = runtime - .try_claim_stage1_job(thread_id, owner, 101, 3600, 64) - .await - .expect("claim refreshed stage1"); - let refreshed_token = match refreshed_claim { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_id, - refreshed_token.as_str(), - 101, - "raw-101", - "summary-101", - Some("rollout-101"), - ) - .await - .expect("mark refreshed stage1 success"), - "refreshed stage1 success should persist output" - ); - - assert!( - runtime - .mark_global_phase2_job_succeeded( - phase2_token.as_str(), - input_watermark, - &selected_outputs, - ) - .await - .expect("mark phase2 success"), - "phase2 success should still complete" - ); - - let (selected_for_phase2, selected_for_phase2_source_updated_at) = - sqlx::query_as::<_, (i64, Option)>( - "SELECT selected_for_phase2, selected_for_phase2_source_updated_at FROM stage1_outputs WHERE thread_id = ?", - ) - .bind(thread_id.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load selected_for_phase2"); - assert_eq!(selected_for_phase2, 0); - assert_eq!(selected_for_phase2_source_updated_at, None); - - let selection = runtime - .get_phase2_input_selection(1) - .await - .expect("load phase2 input selection"); - assert_eq!(selection.selected.len(), 1); - assert_eq!(selection.selected[0].source_updated_at.timestamp(), 101); - assert!(selection.retained_thread_ids.is_empty()); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn record_stage1_output_usage_updates_usage_metadata() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id a"); - let thread_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id b"); - let missing = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("missing id"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_a, - codex_home.join("workspace-a"), - )) - .await - .expect("upsert thread a"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_b, - codex_home.join("workspace-b"), - )) - .await - .expect("upsert thread b"); - - let claim_a = runtime - .try_claim_stage1_job(thread_a, owner, 100, 3600, 64) - .await - .expect("claim stage1 a"); - let token_a = match claim_a { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome for a: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded(thread_a, token_a.as_str(), 100, "raw a", "sum a", None) - .await - .expect("mark stage1 succeeded a") - ); - - let claim_b = runtime - .try_claim_stage1_job(thread_b, owner, 101, 3600, 64) - .await - .expect("claim stage1 b"); - let token_b = match claim_b { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome for b: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded(thread_b, token_b.as_str(), 101, "raw b", "sum b", None) - .await - .expect("mark stage1 succeeded b") - ); - - let updated_rows = runtime - .record_stage1_output_usage(&[thread_a, thread_a, thread_b, missing]) - .await - .expect("record stage1 output usage"); - assert_eq!(updated_rows, 3); - - let row_a = - sqlx::query("SELECT usage_count, last_usage FROM stage1_outputs WHERE thread_id = ?") - .bind(thread_a.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load stage1 usage row a"); - let row_b = - sqlx::query("SELECT usage_count, last_usage FROM stage1_outputs WHERE thread_id = ?") - .bind(thread_b.to_string()) - .fetch_one(runtime.pool.as_ref()) - .await - .expect("load stage1 usage row b"); - - assert_eq!( - row_a - .try_get::("usage_count") - .expect("usage_count a"), - 2 - ); - assert_eq!( - row_b - .try_get::("usage_count") - .expect("usage_count b"), - 1 - ); - - let last_usage_a = row_a.try_get::("last_usage").expect("last_usage a"); - let last_usage_b = row_b.try_get::("last_usage").expect("last_usage b"); - assert_eq!(last_usage_a, last_usage_b); - assert!(last_usage_a > 0); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn mark_stage1_job_succeeded_enqueues_global_consolidation() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let thread_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id a"); - let thread_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id b"); - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); - - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_a, - codex_home.join("workspace-a"), - )) - .await - .expect("upsert thread a"); - runtime - .upsert_thread(&test_thread_metadata( - &codex_home, - thread_b, - codex_home.join("workspace-b"), - )) - .await - .expect("upsert thread b"); - - let claim_a = runtime - .try_claim_stage1_job(thread_a, owner, 100, 3600, 64) - .await - .expect("claim stage1 a"); - let token_a = match claim_a { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome for thread a: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_a, - token_a.as_str(), - 100, - "raw-a", - "summary-a", - None, - ) - .await - .expect("mark stage1 succeeded a"), - "stage1 success should persist output for thread a" - ); - - let claim_b = runtime - .try_claim_stage1_job(thread_b, owner, 101, 3600, 64) - .await - .expect("claim stage1 b"); - let token_b = match claim_b { - Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, - other => panic!("unexpected stage1 claim outcome for thread b: {other:?}"), - }; - assert!( - runtime - .mark_stage1_job_succeeded( - thread_b, - token_b.as_str(), - 101, - "raw-b", - "summary-b", - None, - ) - .await - .expect("mark stage1 succeeded b"), - "stage1 success should persist output for thread b" - ); - - let claim = runtime - .try_claim_global_phase2_job(owner, 3600) - .await - .expect("claim global consolidation"); - let input_watermark = match claim { - Phase2JobClaimOutcome::Claimed { - input_watermark, .. - } => input_watermark, - other => panic!("unexpected global consolidation claim outcome: {other:?}"), - }; - assert_eq!(input_watermark, 101); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn phase2_global_lock_allows_only_one_fresh_runner() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - runtime - .enqueue_global_consolidation(200) - .await - .expect("enqueue global consolidation"); - - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b"); - - let running_claim = runtime - .try_claim_global_phase2_job(owner_a, 3600) - .await - .expect("claim global lock"); - assert!( - matches!(running_claim, Phase2JobClaimOutcome::Claimed { .. }), - "first owner should claim global lock" - ); - - let second_claim = runtime - .try_claim_global_phase2_job(owner_b, 3600) - .await - .expect("claim global lock from second owner"); - assert_eq!(second_claim, Phase2JobClaimOutcome::SkippedRunning); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn phase2_global_lock_stale_lease_allows_takeover() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - runtime - .enqueue_global_consolidation(300) - .await - .expect("enqueue global consolidation"); - - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a"); - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b"); - - let initial_claim = runtime - .try_claim_global_phase2_job(owner_a, 3600) - .await - .expect("claim initial global lock"); - let token_a = match initial_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, .. - } => ownership_token, - other => panic!("unexpected initial claim outcome: {other:?}"), - }; - - sqlx::query("UPDATE jobs SET lease_until = ? WHERE kind = ? AND job_key = ?") - .bind(Utc::now().timestamp() - 1) - .bind("memory_consolidate_global") - .bind("global") - .execute(runtime.pool.as_ref()) - .await - .expect("expire global consolidation lease"); - - let takeover_claim = runtime - .try_claim_global_phase2_job(owner_b, 3600) - .await - .expect("claim stale global lock"); - let (token_b, input_watermark) = match takeover_claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => (ownership_token, input_watermark), - other => panic!("unexpected takeover claim outcome: {other:?}"), - }; - assert_ne!(token_a, token_b); - assert_eq!(input_watermark, 300); - - assert_eq!( - runtime - .mark_global_phase2_job_succeeded(token_a.as_str(), 300, &[]) - .await - .expect("mark stale owner success result"), - false, - "stale owner should lose finalization ownership after takeover" - ); - assert!( - runtime - .mark_global_phase2_job_succeeded(token_b.as_str(), 300, &[]) - .await - .expect("mark takeover owner success"), - "takeover owner should finalize consolidation" - ); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn phase2_backfilled_inputs_below_last_success_still_become_dirty() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - runtime - .enqueue_global_consolidation(500) - .await - .expect("enqueue initial consolidation"); - let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a"); - let claim_a = runtime - .try_claim_global_phase2_job(owner_a, 3_600) - .await - .expect("claim initial consolidation"); - let token_a = match claim_a { - Phase2JobClaimOutcome::Claimed { - ownership_token, - input_watermark, - } => { - assert_eq!(input_watermark, 500); - ownership_token - } - other => panic!("unexpected initial phase2 claim outcome: {other:?}"), - }; - assert!( - runtime - .mark_global_phase2_job_succeeded(token_a.as_str(), 500, &[]) - .await - .expect("mark initial phase2 success"), - "initial phase2 success should finalize" - ); - - runtime - .enqueue_global_consolidation(400) - .await - .expect("enqueue backfilled consolidation"); - - let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b"); - let claim_b = runtime - .try_claim_global_phase2_job(owner_b, 3_600) - .await - .expect("claim backfilled consolidation"); - match claim_b { - Phase2JobClaimOutcome::Claimed { - input_watermark, .. - } => { - assert!( - input_watermark > 500, - "backfilled enqueue should advance dirty watermark beyond last success" - ); - } - other => panic!("unexpected backfilled phase2 claim outcome: {other:?}"), - } - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn phase2_failure_fallback_updates_unowned_running_job() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - runtime - .enqueue_global_consolidation(400) - .await - .expect("enqueue global consolidation"); - - let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner"); - let claim = runtime - .try_claim_global_phase2_job(owner, 3_600) - .await - .expect("claim global consolidation"); - let ownership_token = match claim { - Phase2JobClaimOutcome::Claimed { - ownership_token, .. - } => ownership_token, - other => panic!("unexpected claim outcome: {other:?}"), - }; - - sqlx::query("UPDATE jobs SET ownership_token = NULL WHERE kind = ? AND job_key = ?") - .bind("memory_consolidate_global") - .bind("global") - .execute(runtime.pool.as_ref()) - .await - .expect("clear ownership token"); - - assert_eq!( - runtime - .mark_global_phase2_job_failed(ownership_token.as_str(), "lost", 3_600) - .await - .expect("mark phase2 failed with strict ownership"), - false, - "strict failure update should not match unowned running job" - ); - assert!( - runtime - .mark_global_phase2_job_failed_if_unowned(ownership_token.as_str(), "lost", 3_600) - .await - .expect("fallback failure update should match unowned running job"), - "fallback failure update should transition the unowned running job" - ); - - let claim = runtime - .try_claim_global_phase2_job(ThreadId::new(), 3_600) - .await - .expect("claim after fallback failure"); - assert_eq!(claim, Phase2JobClaimOutcome::SkippedNotDirty); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn query_logs_with_search_matches_substring() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - runtime - .insert_logs(&[ - LogEntry { - ts: 1_700_000_001, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some("alpha".to_string()), - thread_id: Some("thread-1".to_string()), - process_uuid: None, - file: Some("main.rs".to_string()), - line: Some(42), - module_path: None, - }, - LogEntry { - ts: 1_700_000_002, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some("alphabet".to_string()), - thread_id: Some("thread-1".to_string()), - process_uuid: None, - file: Some("main.rs".to_string()), - line: Some(43), - module_path: None, - }, - ]) - .await - .expect("insert test logs"); - - let rows = runtime - .query_logs(&LogQuery { - search: Some("alphab".to_string()), - ..Default::default() - }) - .await - .expect("query matching logs"); - - assert_eq!(rows.len(), 1); - assert_eq!(rows[0].message.as_deref(), Some("alphabet")); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn insert_logs_prunes_old_rows_when_thread_exceeds_size_limit() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let six_mebibytes = "a".repeat(6 * 1024 * 1024); - runtime - .insert_logs(&[ - LogEntry { - ts: 1, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes.clone()), - thread_id: Some("thread-1".to_string()), - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(1), - module_path: Some("mod".to_string()), - }, - LogEntry { - ts: 2, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes.clone()), - thread_id: Some("thread-1".to_string()), - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(2), - module_path: Some("mod".to_string()), - }, - ]) - .await - .expect("insert test logs"); - - let rows = runtime - .query_logs(&LogQuery { - thread_ids: vec!["thread-1".to_string()], - ..Default::default() - }) - .await - .expect("query thread logs"); - - assert_eq!(rows.len(), 1); - assert_eq!(rows[0].ts, 2); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn insert_logs_prunes_single_thread_row_when_it_exceeds_size_limit() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let eleven_mebibytes = "d".repeat(11 * 1024 * 1024); - runtime - .insert_logs(&[LogEntry { - ts: 1, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(eleven_mebibytes), - thread_id: Some("thread-oversized".to_string()), - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(1), - module_path: Some("mod".to_string()), - }]) - .await - .expect("insert test log"); - - let rows = runtime - .query_logs(&LogQuery { - thread_ids: vec!["thread-oversized".to_string()], - ..Default::default() - }) - .await - .expect("query thread logs"); - - assert!(rows.is_empty()); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn insert_logs_prunes_threadless_rows_per_process_uuid_only() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let six_mebibytes = "b".repeat(6 * 1024 * 1024); - runtime - .insert_logs(&[ - LogEntry { - ts: 1, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes.clone()), - thread_id: None, - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(1), - module_path: Some("mod".to_string()), - }, - LogEntry { - ts: 2, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes.clone()), - thread_id: None, - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(2), - module_path: Some("mod".to_string()), - }, - LogEntry { - ts: 3, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes), - thread_id: Some("thread-1".to_string()), - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(3), - module_path: Some("mod".to_string()), - }, - ]) - .await - .expect("insert test logs"); - - let rows = runtime - .query_logs(&LogQuery { - thread_ids: vec!["thread-1".to_string()], - include_threadless: true, - ..Default::default() - }) - .await - .expect("query thread and threadless logs"); - - let mut timestamps: Vec = rows.into_iter().map(|row| row.ts).collect(); - timestamps.sort_unstable(); - assert_eq!(timestamps, vec![2, 3]); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn insert_logs_prunes_single_threadless_process_row_when_it_exceeds_size_limit() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let eleven_mebibytes = "e".repeat(11 * 1024 * 1024); - runtime - .insert_logs(&[LogEntry { - ts: 1, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(eleven_mebibytes), - thread_id: None, - process_uuid: Some("proc-oversized".to_string()), - file: Some("main.rs".to_string()), - line: Some(1), - module_path: Some("mod".to_string()), - }]) - .await - .expect("insert test log"); - - let rows = runtime - .query_logs(&LogQuery { - include_threadless: true, - ..Default::default() - }) - .await - .expect("query threadless logs"); - - assert!(rows.is_empty()); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn insert_logs_prunes_threadless_rows_with_null_process_uuid() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let six_mebibytes = "c".repeat(6 * 1024 * 1024); - runtime - .insert_logs(&[ - LogEntry { - ts: 1, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes.clone()), - thread_id: None, - process_uuid: None, - file: Some("main.rs".to_string()), - line: Some(1), - module_path: Some("mod".to_string()), - }, - LogEntry { - ts: 2, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(six_mebibytes), - thread_id: None, - process_uuid: None, - file: Some("main.rs".to_string()), - line: Some(2), - module_path: Some("mod".to_string()), - }, - LogEntry { - ts: 3, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some("small".to_string()), - thread_id: None, - process_uuid: Some("proc-1".to_string()), - file: Some("main.rs".to_string()), - line: Some(3), - module_path: Some("mod".to_string()), - }, - ]) - .await - .expect("insert test logs"); - - let rows = runtime - .query_logs(&LogQuery { - include_threadless: true, - ..Default::default() - }) - .await - .expect("query threadless logs"); - - let mut timestamps: Vec = rows.into_iter().map(|row| row.ts).collect(); - timestamps.sort_unstable(); - assert_eq!(timestamps, vec![2, 3]); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - #[tokio::test] - async fn insert_logs_prunes_single_threadless_null_process_row_when_it_exceeds_limit() { - let codex_home = unique_temp_dir(); - let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) - .await - .expect("initialize runtime"); - - let eleven_mebibytes = "f".repeat(11 * 1024 * 1024); - runtime - .insert_logs(&[LogEntry { - ts: 1, - ts_nanos: 0, - level: "INFO".to_string(), - target: "cli".to_string(), - message: Some(eleven_mebibytes), - thread_id: None, - process_uuid: None, - file: Some("main.rs".to_string()), - line: Some(1), - module_path: Some("mod".to_string()), - }]) - .await - .expect("insert test log"); - - let rows = runtime - .query_logs(&LogQuery { - include_threadless: true, - ..Default::default() - }) - .await - .expect("query threadless logs"); - - assert!(rows.is_empty()); - - let _ = tokio::fs::remove_dir_all(codex_home).await; - } - - fn test_thread_metadata( - codex_home: &Path, - thread_id: ThreadId, - cwd: PathBuf, - ) -> ThreadMetadata { - let now = DateTime::::from_timestamp(1_700_000_000, 0).expect("timestamp"); - ThreadMetadata { - id: thread_id, - rollout_path: codex_home.join(format!("rollout-{thread_id}.jsonl")), - created_at: now, - updated_at: now, - source: "cli".to_string(), - agent_nickname: None, - agent_role: None, - model_provider: "test-provider".to_string(), - cwd, - cli_version: "0.0.0".to_string(), - title: String::new(), - sandbox_policy: crate::extract::enum_to_string(&SandboxPolicy::new_read_only_policy()), - approval_mode: crate::extract::enum_to_string(&AskForApproval::OnRequest), - tokens_used: 0, - first_user_message: Some("hello".to_string()), - archived_at: None, - git_sha: None, - git_branch: None, - git_origin_url: None, - } - } -} diff --git a/codex-rs/state/src/runtime/agent_jobs.rs b/codex-rs/state/src/runtime/agent_jobs.rs new file mode 100644 index 000000000..c68560594 --- /dev/null +++ b/codex-rs/state/src/runtime/agent_jobs.rs @@ -0,0 +1,562 @@ +use super::*; +use crate::model::AgentJobItemRow; + +impl StateRuntime { + pub async fn create_agent_job( + &self, + params: &AgentJobCreateParams, + items: &[AgentJobItemCreateParams], + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let input_headers_json = serde_json::to_string(¶ms.input_headers)?; + let output_schema_json = params + .output_schema_json + .as_ref() + .map(serde_json::to_string) + .transpose()?; + let max_runtime_seconds = params + .max_runtime_seconds + .map(i64::try_from) + .transpose() + .map_err(|_| anyhow::anyhow!("invalid max_runtime_seconds value"))?; + let mut tx = self.pool.begin().await?; + sqlx::query( + r#" +INSERT INTO agent_jobs ( + id, + name, + status, + instruction, + auto_export, + max_runtime_seconds, + output_schema_json, + input_headers_json, + input_csv_path, + output_csv_path, + created_at, + updated_at, + started_at, + completed_at, + last_error +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL) + "#, + ) + .bind(params.id.as_str()) + .bind(params.name.as_str()) + .bind(AgentJobStatus::Pending.as_str()) + .bind(params.instruction.as_str()) + .bind(i64::from(params.auto_export)) + .bind(max_runtime_seconds) + .bind(output_schema_json) + .bind(input_headers_json) + .bind(params.input_csv_path.as_str()) + .bind(params.output_csv_path.as_str()) + .bind(now) + .bind(now) + .execute(&mut *tx) + .await?; + + for item in items { + let row_json = serde_json::to_string(&item.row_json)?; + sqlx::query( + r#" +INSERT INTO agent_job_items ( + job_id, + item_id, + row_index, + source_id, + row_json, + status, + assigned_thread_id, + attempt_count, + result_json, + last_error, + created_at, + updated_at, + completed_at, + reported_at +) VALUES (?, ?, ?, ?, ?, ?, NULL, 0, NULL, NULL, ?, ?, NULL, NULL) + "#, + ) + .bind(params.id.as_str()) + .bind(item.item_id.as_str()) + .bind(item.row_index) + .bind(item.source_id.as_deref()) + .bind(row_json) + .bind(AgentJobItemStatus::Pending.as_str()) + .bind(now) + .bind(now) + .execute(&mut *tx) + .await?; + } + + tx.commit().await?; + + let job_id = params.id.as_str(); + self.get_agent_job(job_id) + .await? + .ok_or_else(|| anyhow::anyhow!("failed to load created agent job {job_id}")) + } + + pub async fn get_agent_job(&self, job_id: &str) -> anyhow::Result> { + let row = sqlx::query_as::<_, AgentJobRow>( + r#" +SELECT + id, + name, + status, + instruction, + auto_export, + max_runtime_seconds, + output_schema_json, + input_headers_json, + input_csv_path, + output_csv_path, + created_at, + updated_at, + started_at, + completed_at, + last_error +FROM agent_jobs +WHERE id = ? + "#, + ) + .bind(job_id) + .fetch_optional(self.pool.as_ref()) + .await?; + row.map(AgentJob::try_from).transpose() + } + + pub async fn list_agent_job_items( + &self, + job_id: &str, + status: Option, + limit: Option, + ) -> anyhow::Result> { + let mut builder = QueryBuilder::::new( + r#" +SELECT + job_id, + item_id, + row_index, + source_id, + row_json, + status, + assigned_thread_id, + attempt_count, + result_json, + last_error, + created_at, + updated_at, + completed_at, + reported_at +FROM agent_job_items +WHERE job_id = + "#, + ); + builder.push_bind(job_id); + if let Some(status) = status { + builder.push(" AND status = "); + builder.push_bind(status.as_str()); + } + builder.push(" ORDER BY row_index ASC"); + if let Some(limit) = limit { + builder.push(" LIMIT "); + builder.push_bind(limit as i64); + } + let rows: Vec = builder + .build_query_as::() + .fetch_all(self.pool.as_ref()) + .await?; + rows.into_iter().map(AgentJobItem::try_from).collect() + } + + pub async fn get_agent_job_item( + &self, + job_id: &str, + item_id: &str, + ) -> anyhow::Result> { + let row: Option = sqlx::query_as::<_, AgentJobItemRow>( + r#" +SELECT + job_id, + item_id, + row_index, + source_id, + row_json, + status, + assigned_thread_id, + attempt_count, + result_json, + last_error, + created_at, + updated_at, + completed_at, + reported_at +FROM agent_job_items +WHERE job_id = ? AND item_id = ? + "#, + ) + .bind(job_id) + .bind(item_id) + .fetch_optional(self.pool.as_ref()) + .await?; + row.map(AgentJobItem::try_from).transpose() + } + + pub async fn mark_agent_job_running(&self, job_id: &str) -> anyhow::Result<()> { + let now = Utc::now().timestamp(); + sqlx::query( + r#" +UPDATE agent_jobs +SET + status = ?, + updated_at = ?, + started_at = COALESCE(started_at, ?), + completed_at = NULL, + last_error = NULL +WHERE id = ? + "#, + ) + .bind(AgentJobStatus::Running.as_str()) + .bind(now) + .bind(now) + .bind(job_id) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + pub async fn mark_agent_job_completed(&self, job_id: &str) -> anyhow::Result<()> { + let now = Utc::now().timestamp(); + sqlx::query( + r#" +UPDATE agent_jobs +SET status = ?, updated_at = ?, completed_at = ?, last_error = NULL +WHERE id = ? + "#, + ) + .bind(AgentJobStatus::Completed.as_str()) + .bind(now) + .bind(now) + .bind(job_id) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + pub async fn mark_agent_job_failed( + &self, + job_id: &str, + error_message: &str, + ) -> anyhow::Result<()> { + let now = Utc::now().timestamp(); + sqlx::query( + r#" +UPDATE agent_jobs +SET status = ?, updated_at = ?, completed_at = ?, last_error = ? +WHERE id = ? + "#, + ) + .bind(AgentJobStatus::Failed.as_str()) + .bind(now) + .bind(now) + .bind(error_message) + .bind(job_id) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + pub async fn mark_agent_job_cancelled( + &self, + job_id: &str, + reason: &str, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_jobs +SET status = ?, updated_at = ?, completed_at = ?, last_error = ? +WHERE id = ? AND status IN (?, ?) + "#, + ) + .bind(AgentJobStatus::Cancelled.as_str()) + .bind(now) + .bind(now) + .bind(reason) + .bind(job_id) + .bind(AgentJobStatus::Pending.as_str()) + .bind(AgentJobStatus::Running.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn is_agent_job_cancelled(&self, job_id: &str) -> anyhow::Result { + let row = sqlx::query( + r#" +SELECT status +FROM agent_jobs +WHERE id = ? + "#, + ) + .bind(job_id) + .fetch_optional(self.pool.as_ref()) + .await?; + let Some(row) = row else { + return Ok(false); + }; + let status: String = row.try_get("status")?; + Ok(AgentJobStatus::parse(status.as_str())? == AgentJobStatus::Cancelled) + } + + pub async fn mark_agent_job_item_running( + &self, + job_id: &str, + item_id: &str, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET + status = ?, + assigned_thread_id = NULL, + attempt_count = attempt_count + 1, + updated_at = ?, + last_error = NULL +WHERE job_id = ? AND item_id = ? AND status = ? + "#, + ) + .bind(AgentJobItemStatus::Running.as_str()) + .bind(now) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Pending.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn mark_agent_job_item_running_with_thread( + &self, + job_id: &str, + item_id: &str, + thread_id: &str, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET + status = ?, + assigned_thread_id = ?, + attempt_count = attempt_count + 1, + updated_at = ?, + last_error = NULL +WHERE job_id = ? AND item_id = ? AND status = ? + "#, + ) + .bind(AgentJobItemStatus::Running.as_str()) + .bind(thread_id) + .bind(now) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Pending.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn mark_agent_job_item_pending( + &self, + job_id: &str, + item_id: &str, + error_message: Option<&str>, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET + status = ?, + assigned_thread_id = NULL, + updated_at = ?, + last_error = ? +WHERE job_id = ? AND item_id = ? AND status = ? + "#, + ) + .bind(AgentJobItemStatus::Pending.as_str()) + .bind(now) + .bind(error_message) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Running.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn set_agent_job_item_thread( + &self, + job_id: &str, + item_id: &str, + thread_id: &str, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET assigned_thread_id = ?, updated_at = ? +WHERE job_id = ? AND item_id = ? AND status = ? + "#, + ) + .bind(thread_id) + .bind(now) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Running.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn report_agent_job_item_result( + &self, + job_id: &str, + item_id: &str, + reporting_thread_id: &str, + result_json: &Value, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let serialized = serde_json::to_string(result_json)?; + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET + result_json = ?, + reported_at = ?, + updated_at = ?, + last_error = NULL +WHERE + job_id = ? + AND item_id = ? + AND status = ? + AND assigned_thread_id = ? + "#, + ) + .bind(serialized) + .bind(now) + .bind(now) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Running.as_str()) + .bind(reporting_thread_id) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn mark_agent_job_item_completed( + &self, + job_id: &str, + item_id: &str, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET + status = ?, + completed_at = ?, + updated_at = ?, + assigned_thread_id = NULL +WHERE + job_id = ? + AND item_id = ? + AND status = ? + AND result_json IS NOT NULL + "#, + ) + .bind(AgentJobItemStatus::Completed.as_str()) + .bind(now) + .bind(now) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Running.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn mark_agent_job_item_failed( + &self, + job_id: &str, + item_id: &str, + error_message: &str, + ) -> anyhow::Result { + let now = Utc::now().timestamp(); + let result = sqlx::query( + r#" +UPDATE agent_job_items +SET + status = ?, + completed_at = ?, + updated_at = ?, + last_error = ?, + assigned_thread_id = NULL +WHERE + job_id = ? + AND item_id = ? + AND status = ? + "#, + ) + .bind(AgentJobItemStatus::Failed.as_str()) + .bind(now) + .bind(now) + .bind(error_message) + .bind(job_id) + .bind(item_id) + .bind(AgentJobItemStatus::Running.as_str()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() > 0) + } + + pub async fn get_agent_job_progress(&self, job_id: &str) -> anyhow::Result { + let row = sqlx::query( + r#" +SELECT + COUNT(*) AS total_items, + SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS pending_items, + SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS running_items, + SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS completed_items, + SUM(CASE WHEN status = ? THEN 1 ELSE 0 END) AS failed_items +FROM agent_job_items +WHERE job_id = ? + "#, + ) + .bind(AgentJobItemStatus::Pending.as_str()) + .bind(AgentJobItemStatus::Running.as_str()) + .bind(AgentJobItemStatus::Completed.as_str()) + .bind(AgentJobItemStatus::Failed.as_str()) + .bind(job_id) + .fetch_one(self.pool.as_ref()) + .await?; + + let total_items: i64 = row.try_get("total_items")?; + let pending_items: Option = row.try_get("pending_items")?; + let running_items: Option = row.try_get("running_items")?; + let completed_items: Option = row.try_get("completed_items")?; + let failed_items: Option = row.try_get("failed_items")?; + Ok(AgentJobProgress { + total_items: usize::try_from(total_items).unwrap_or_default(), + pending_items: usize::try_from(pending_items.unwrap_or_default()).unwrap_or_default(), + running_items: usize::try_from(running_items.unwrap_or_default()).unwrap_or_default(), + completed_items: usize::try_from(completed_items.unwrap_or_default()) + .unwrap_or_default(), + failed_items: usize::try_from(failed_items.unwrap_or_default()).unwrap_or_default(), + }) + } +} diff --git a/codex-rs/state/src/runtime/backfill.rs b/codex-rs/state/src/runtime/backfill.rs new file mode 100644 index 000000000..93e156987 --- /dev/null +++ b/codex-rs/state/src/runtime/backfill.rs @@ -0,0 +1,311 @@ +use super::*; + +impl StateRuntime { + pub async fn get_backfill_state(&self) -> anyhow::Result { + self.ensure_backfill_state_row().await?; + let row = sqlx::query( + r#" +SELECT status, last_watermark, last_success_at +FROM backfill_state +WHERE id = 1 + "#, + ) + .fetch_one(self.pool.as_ref()) + .await?; + crate::BackfillState::try_from_row(&row) + } + + /// Attempt to claim ownership of rollout metadata backfill. + /// + /// Returns `true` when this runtime claimed the backfill worker slot. + /// Returns `false` if backfill is already complete or currently owned by a + /// non-expired worker. + pub async fn try_claim_backfill(&self, lease_seconds: i64) -> anyhow::Result { + self.ensure_backfill_state_row().await?; + let now = Utc::now().timestamp(); + let lease_cutoff = now.saturating_sub(lease_seconds.max(0)); + let result = sqlx::query( + r#" +UPDATE backfill_state +SET status = ?, updated_at = ? +WHERE id = 1 + AND status != ? + AND (status != ? OR updated_at <= ?) + "#, + ) + .bind(crate::BackfillStatus::Running.as_str()) + .bind(now) + .bind(crate::BackfillStatus::Complete.as_str()) + .bind(crate::BackfillStatus::Running.as_str()) + .bind(lease_cutoff) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected() == 1) + } + + /// Mark rollout metadata backfill as running. + pub async fn mark_backfill_running(&self) -> anyhow::Result<()> { + self.ensure_backfill_state_row().await?; + sqlx::query( + r#" +UPDATE backfill_state +SET status = ?, updated_at = ? +WHERE id = 1 + "#, + ) + .bind(crate::BackfillStatus::Running.as_str()) + .bind(Utc::now().timestamp()) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + /// Persist rollout metadata backfill progress. + pub async fn checkpoint_backfill(&self, watermark: &str) -> anyhow::Result<()> { + self.ensure_backfill_state_row().await?; + sqlx::query( + r#" +UPDATE backfill_state +SET status = ?, last_watermark = ?, updated_at = ? +WHERE id = 1 + "#, + ) + .bind(crate::BackfillStatus::Running.as_str()) + .bind(watermark) + .bind(Utc::now().timestamp()) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + /// Mark rollout metadata backfill as complete. + pub async fn mark_backfill_complete(&self, last_watermark: Option<&str>) -> anyhow::Result<()> { + self.ensure_backfill_state_row().await?; + let now = Utc::now().timestamp(); + sqlx::query( + r#" +UPDATE backfill_state +SET + status = ?, + last_watermark = COALESCE(?, last_watermark), + last_success_at = ?, + updated_at = ? +WHERE id = 1 + "#, + ) + .bind(crate::BackfillStatus::Complete.as_str()) + .bind(last_watermark) + .bind(now) + .bind(now) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + async fn ensure_backfill_state_row(&self) -> anyhow::Result<()> { + sqlx::query( + r#" +INSERT INTO backfill_state (id, status, last_watermark, last_success_at, updated_at) +VALUES (?, ?, NULL, NULL, ?) +ON CONFLICT(id) DO NOTHING + "#, + ) + .bind(1_i64) + .bind(crate::BackfillStatus::Pending.as_str()) + .bind(Utc::now().timestamp()) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::StateRuntime; + use super::state_db_filename; + use super::test_support::unique_temp_dir; + use crate::STATE_DB_FILENAME; + use crate::STATE_DB_VERSION; + use chrono::Utc; + use pretty_assertions::assert_eq; + + #[tokio::test] + async fn init_removes_legacy_state_db_files() { + let codex_home = unique_temp_dir(); + tokio::fs::create_dir_all(&codex_home) + .await + .expect("create codex_home"); + + let current_name = state_db_filename(); + let previous_version = STATE_DB_VERSION.saturating_sub(1); + let unversioned_name = format!("{STATE_DB_FILENAME}.sqlite"); + for suffix in ["", "-wal", "-shm", "-journal"] { + let path = codex_home.join(format!("{unversioned_name}{suffix}")); + tokio::fs::write(path, b"legacy") + .await + .expect("write legacy"); + let old_version_path = codex_home.join(format!( + "{STATE_DB_FILENAME}_{previous_version}.sqlite{suffix}" + )); + tokio::fs::write(old_version_path, b"old_version") + .await + .expect("write old version"); + } + let unrelated_path = codex_home.join("state.sqlite_backup"); + tokio::fs::write(&unrelated_path, b"keep") + .await + .expect("write unrelated"); + let numeric_path = codex_home.join("123"); + tokio::fs::write(&numeric_path, b"keep") + .await + .expect("write numeric"); + + let _runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + for suffix in ["", "-wal", "-shm", "-journal"] { + let legacy_path = codex_home.join(format!("{unversioned_name}{suffix}")); + assert_eq!( + tokio::fs::try_exists(&legacy_path) + .await + .expect("check legacy path"), + false + ); + let old_version_path = codex_home.join(format!( + "{STATE_DB_FILENAME}_{previous_version}.sqlite{suffix}" + )); + assert_eq!( + tokio::fs::try_exists(&old_version_path) + .await + .expect("check old version path"), + false + ); + } + assert_eq!( + tokio::fs::try_exists(codex_home.join(current_name)) + .await + .expect("check new db path"), + true + ); + assert_eq!( + tokio::fs::try_exists(&unrelated_path) + .await + .expect("check unrelated path"), + true + ); + assert_eq!( + tokio::fs::try_exists(&numeric_path) + .await + .expect("check numeric path"), + true + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn backfill_state_persists_progress_and_completion() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let initial = runtime + .get_backfill_state() + .await + .expect("get initial backfill state"); + assert_eq!(initial.status, crate::BackfillStatus::Pending); + assert_eq!(initial.last_watermark, None); + assert_eq!(initial.last_success_at, None); + + runtime + .mark_backfill_running() + .await + .expect("mark backfill running"); + runtime + .checkpoint_backfill("sessions/2026/01/27/rollout-a.jsonl") + .await + .expect("checkpoint backfill"); + + let running = runtime + .get_backfill_state() + .await + .expect("get running backfill state"); + assert_eq!(running.status, crate::BackfillStatus::Running); + assert_eq!( + running.last_watermark, + Some("sessions/2026/01/27/rollout-a.jsonl".to_string()) + ); + assert_eq!(running.last_success_at, None); + + runtime + .mark_backfill_complete(Some("sessions/2026/01/28/rollout-b.jsonl")) + .await + .expect("mark backfill complete"); + let completed = runtime + .get_backfill_state() + .await + .expect("get completed backfill state"); + assert_eq!(completed.status, crate::BackfillStatus::Complete); + assert_eq!( + completed.last_watermark, + Some("sessions/2026/01/28/rollout-b.jsonl".to_string()) + ); + assert!(completed.last_success_at.is_some()); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn backfill_claim_is_singleton_until_stale_and_blocked_when_complete() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let claimed = runtime + .try_claim_backfill(3600) + .await + .expect("initial backfill claim"); + assert_eq!(claimed, true); + + let duplicate_claim = runtime + .try_claim_backfill(3600) + .await + .expect("duplicate backfill claim"); + assert_eq!(duplicate_claim, false); + + let stale_updated_at = Utc::now().timestamp().saturating_sub(10_000); + sqlx::query( + r#" +UPDATE backfill_state +SET status = ?, updated_at = ? +WHERE id = 1 + "#, + ) + .bind(crate::BackfillStatus::Running.as_str()) + .bind(stale_updated_at) + .execute(runtime.pool.as_ref()) + .await + .expect("force stale backfill lease"); + + let stale_claim = runtime + .try_claim_backfill(10) + .await + .expect("stale backfill claim"); + assert_eq!(stale_claim, true); + + runtime + .mark_backfill_complete(None) + .await + .expect("mark complete"); + let claim_after_complete = runtime + .try_claim_backfill(3600) + .await + .expect("claim after complete"); + assert_eq!(claim_after_complete, false); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } +} diff --git a/codex-rs/state/src/runtime/logs.rs b/codex-rs/state/src/runtime/logs.rs new file mode 100644 index 000000000..762f68f1a --- /dev/null +++ b/codex-rs/state/src/runtime/logs.rs @@ -0,0 +1,715 @@ +use super::*; + +impl StateRuntime { + pub async fn insert_log(&self, entry: &LogEntry) -> anyhow::Result<()> { + self.insert_logs(std::slice::from_ref(entry)).await + } + + /// Insert a batch of log entries into the logs table. + pub async fn insert_logs(&self, entries: &[LogEntry]) -> anyhow::Result<()> { + if entries.is_empty() { + return Ok(()); + } + + let mut tx = self.pool.begin().await?; + let mut builder = QueryBuilder::::new( + "INSERT INTO logs (ts, ts_nanos, level, target, message, thread_id, process_uuid, module_path, file, line, estimated_bytes) ", + ); + builder.push_values(entries, |mut row, entry| { + let estimated_bytes = entry.message.as_ref().map_or(0, String::len) as i64 + + entry.level.len() as i64 + + entry.target.len() as i64 + + entry.module_path.as_ref().map_or(0, String::len) as i64 + + entry.file.as_ref().map_or(0, String::len) as i64; + row.push_bind(entry.ts) + .push_bind(entry.ts_nanos) + .push_bind(&entry.level) + .push_bind(&entry.target) + .push_bind(&entry.message) + .push_bind(&entry.thread_id) + .push_bind(&entry.process_uuid) + .push_bind(&entry.module_path) + .push_bind(&entry.file) + .push_bind(entry.line) + .push_bind(estimated_bytes); + }); + builder.build().execute(&mut *tx).await?; + self.prune_logs_after_insert(entries, &mut tx).await?; + tx.commit().await?; + Ok(()) + } + + /// Enforce per-partition log size caps after a successful batch insert. + /// + /// We maintain two independent budgets: + /// - Thread logs: rows with `thread_id IS NOT NULL`, capped per `thread_id`. + /// - Threadless process logs: rows with `thread_id IS NULL` ("threadless"), + /// capped per `process_uuid` (including `process_uuid IS NULL` as its own + /// threadless partition). + /// + /// "Threadless" means the log row is not associated with any conversation + /// thread, so retention is keyed by process identity instead. + /// + /// This runs inside the same transaction as the insert so callers never + /// observe "inserted but not yet pruned" rows. + async fn prune_logs_after_insert( + &self, + entries: &[LogEntry], + tx: &mut SqliteConnection, + ) -> anyhow::Result<()> { + let thread_ids: BTreeSet<&str> = entries + .iter() + .filter_map(|entry| entry.thread_id.as_deref()) + .collect(); + if !thread_ids.is_empty() { + // Cheap precheck: only run the heavier window-function prune for + // threads that are currently above the cap. + let mut over_limit_threads_query = + QueryBuilder::::new("SELECT thread_id FROM logs WHERE thread_id IN ("); + { + let mut separated = over_limit_threads_query.separated(", "); + for thread_id in &thread_ids { + separated.push_bind(*thread_id); + } + } + over_limit_threads_query.push(") GROUP BY thread_id HAVING SUM("); + over_limit_threads_query.push("estimated_bytes"); + over_limit_threads_query.push(") > "); + over_limit_threads_query.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); + let over_limit_thread_ids: Vec = over_limit_threads_query + .build() + .fetch_all(&mut *tx) + .await? + .into_iter() + .map(|row| row.try_get("thread_id")) + .collect::>()?; + if !over_limit_thread_ids.is_empty() { + // Enforce a strict per-thread cap by deleting every row whose + // newest-first cumulative bytes exceed the partition budget. + let mut prune_threads = QueryBuilder::::new( + r#" +DELETE FROM logs +WHERE id IN ( + SELECT id + FROM ( + SELECT + id, + SUM( +"#, + ); + prune_threads.push("estimated_bytes"); + prune_threads.push( + r#" + ) OVER ( + PARTITION BY thread_id + ORDER BY ts DESC, ts_nanos DESC, id DESC + ) AS cumulative_bytes + FROM logs + WHERE thread_id IN ( +"#, + ); + { + let mut separated = prune_threads.separated(", "); + for thread_id in &over_limit_thread_ids { + separated.push_bind(thread_id); + } + } + prune_threads.push( + r#" + ) + ) + WHERE cumulative_bytes > +"#, + ); + prune_threads.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); + prune_threads.push("\n)"); + prune_threads.build().execute(&mut *tx).await?; + } + } + + let threadless_process_uuids: BTreeSet<&str> = entries + .iter() + .filter(|entry| entry.thread_id.is_none()) + .filter_map(|entry| entry.process_uuid.as_deref()) + .collect(); + let has_threadless_null_process_uuid = entries + .iter() + .any(|entry| entry.thread_id.is_none() && entry.process_uuid.is_none()); + if !threadless_process_uuids.is_empty() { + // Threadless logs are budgeted separately per process UUID. + let mut over_limit_processes_query = QueryBuilder::::new( + "SELECT process_uuid FROM logs WHERE thread_id IS NULL AND process_uuid IN (", + ); + { + let mut separated = over_limit_processes_query.separated(", "); + for process_uuid in &threadless_process_uuids { + separated.push_bind(*process_uuid); + } + } + over_limit_processes_query.push(") GROUP BY process_uuid HAVING SUM("); + over_limit_processes_query.push("estimated_bytes"); + over_limit_processes_query.push(") > "); + over_limit_processes_query.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); + let over_limit_process_uuids: Vec = over_limit_processes_query + .build() + .fetch_all(&mut *tx) + .await? + .into_iter() + .map(|row| row.try_get("process_uuid")) + .collect::>()?; + if !over_limit_process_uuids.is_empty() { + // Same strict cap policy as thread pruning, but only for + // threadless rows in the affected process UUIDs. + let mut prune_threadless_process_logs = QueryBuilder::::new( + r#" +DELETE FROM logs +WHERE id IN ( + SELECT id + FROM ( + SELECT + id, + SUM( +"#, + ); + prune_threadless_process_logs.push("estimated_bytes"); + prune_threadless_process_logs.push( + r#" + ) OVER ( + PARTITION BY process_uuid + ORDER BY ts DESC, ts_nanos DESC, id DESC + ) AS cumulative_bytes + FROM logs + WHERE thread_id IS NULL + AND process_uuid IN ( +"#, + ); + { + let mut separated = prune_threadless_process_logs.separated(", "); + for process_uuid in &over_limit_process_uuids { + separated.push_bind(process_uuid); + } + } + prune_threadless_process_logs.push( + r#" + ) + ) + WHERE cumulative_bytes > +"#, + ); + prune_threadless_process_logs.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); + prune_threadless_process_logs.push("\n)"); + prune_threadless_process_logs + .build() + .execute(&mut *tx) + .await?; + } + } + if has_threadless_null_process_uuid { + // Rows without a process UUID still need a cap; treat NULL as its + // own threadless partition. + let mut null_process_usage_query = QueryBuilder::::new("SELECT SUM("); + null_process_usage_query.push("estimated_bytes"); + null_process_usage_query.push( + ") AS total_bytes FROM logs WHERE thread_id IS NULL AND process_uuid IS NULL", + ); + let total_null_process_bytes: Option = null_process_usage_query + .build() + .fetch_one(&mut *tx) + .await? + .try_get("total_bytes")?; + + if total_null_process_bytes.unwrap_or(0) > LOG_PARTITION_SIZE_LIMIT_BYTES { + let mut prune_threadless_null_process_logs = QueryBuilder::::new( + r#" +DELETE FROM logs +WHERE id IN ( + SELECT id + FROM ( + SELECT + id, + SUM( +"#, + ); + prune_threadless_null_process_logs.push("estimated_bytes"); + prune_threadless_null_process_logs.push( + r#" + ) OVER ( + PARTITION BY process_uuid + ORDER BY ts DESC, ts_nanos DESC, id DESC + ) AS cumulative_bytes + FROM logs + WHERE thread_id IS NULL + AND process_uuid IS NULL + ) + WHERE cumulative_bytes > +"#, + ); + prune_threadless_null_process_logs.push_bind(LOG_PARTITION_SIZE_LIMIT_BYTES); + prune_threadless_null_process_logs.push("\n)"); + prune_threadless_null_process_logs + .build() + .execute(&mut *tx) + .await?; + } + } + Ok(()) + } + + pub(crate) async fn delete_logs_before(&self, cutoff_ts: i64) -> anyhow::Result { + let result = sqlx::query("DELETE FROM logs WHERE ts < ?") + .bind(cutoff_ts) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected()) + } + + /// Query logs with optional filters. + pub async fn query_logs(&self, query: &LogQuery) -> anyhow::Result> { + let mut builder = QueryBuilder::::new( + "SELECT id, ts, ts_nanos, level, target, message, thread_id, process_uuid, file, line FROM logs WHERE 1 = 1", + ); + push_log_filters(&mut builder, query); + if query.descending { + builder.push(" ORDER BY id DESC"); + } else { + builder.push(" ORDER BY id ASC"); + } + if let Some(limit) = query.limit { + builder.push(" LIMIT ").push_bind(limit as i64); + } + + let rows = builder + .build_query_as::() + .fetch_all(self.pool.as_ref()) + .await?; + Ok(rows) + } + + /// Return the max log id matching optional filters. + pub async fn max_log_id(&self, query: &LogQuery) -> anyhow::Result { + let mut builder = + QueryBuilder::::new("SELECT MAX(id) AS max_id FROM logs WHERE 1 = 1"); + push_log_filters(&mut builder, query); + let row = builder.build().fetch_one(self.pool.as_ref()).await?; + let max_id: Option = row.try_get("max_id")?; + Ok(max_id.unwrap_or(0)) + } +} + +fn push_log_filters<'a>(builder: &mut QueryBuilder<'a, Sqlite>, query: &'a LogQuery) { + if let Some(level_upper) = query.level_upper.as_ref() { + builder + .push(" AND UPPER(level) = ") + .push_bind(level_upper.as_str()); + } + if let Some(from_ts) = query.from_ts { + builder.push(" AND ts >= ").push_bind(from_ts); + } + if let Some(to_ts) = query.to_ts { + builder.push(" AND ts <= ").push_bind(to_ts); + } + push_like_filters(builder, "module_path", &query.module_like); + push_like_filters(builder, "file", &query.file_like); + let has_thread_filter = !query.thread_ids.is_empty() || query.include_threadless; + if has_thread_filter { + builder.push(" AND ("); + let mut needs_or = false; + for thread_id in &query.thread_ids { + if needs_or { + builder.push(" OR "); + } + builder.push("thread_id = ").push_bind(thread_id.as_str()); + needs_or = true; + } + if query.include_threadless { + if needs_or { + builder.push(" OR "); + } + builder.push("thread_id IS NULL"); + } + builder.push(")"); + } + if let Some(after_id) = query.after_id { + builder.push(" AND id > ").push_bind(after_id); + } + if let Some(search) = query.search.as_ref() { + builder.push(" AND INSTR(message, "); + builder.push_bind(search.as_str()); + builder.push(") > 0"); + } +} + +fn push_like_filters<'a>( + builder: &mut QueryBuilder<'a, Sqlite>, + column: &str, + filters: &'a [String], +) { + if filters.is_empty() { + return; + } + builder.push(" AND ("); + for (idx, filter) in filters.iter().enumerate() { + if idx > 0 { + builder.push(" OR "); + } + builder + .push(column) + .push(" LIKE '%' || ") + .push_bind(filter.as_str()) + .push(" || '%'"); + } + builder.push(")"); +} + +#[cfg(test)] +mod tests { + use super::StateRuntime; + use super::test_support::unique_temp_dir; + use crate::LogEntry; + use crate::LogQuery; + use pretty_assertions::assert_eq; + #[tokio::test] + async fn query_logs_with_search_matches_substring() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + runtime + .insert_logs(&[ + LogEntry { + ts: 1_700_000_001, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some("alpha".to_string()), + thread_id: Some("thread-1".to_string()), + process_uuid: None, + file: Some("main.rs".to_string()), + line: Some(42), + module_path: None, + }, + LogEntry { + ts: 1_700_000_002, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some("alphabet".to_string()), + thread_id: Some("thread-1".to_string()), + process_uuid: None, + file: Some("main.rs".to_string()), + line: Some(43), + module_path: None, + }, + ]) + .await + .expect("insert test logs"); + + let rows = runtime + .query_logs(&LogQuery { + search: Some("alphab".to_string()), + ..Default::default() + }) + .await + .expect("query matching logs"); + + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].message.as_deref(), Some("alphabet")); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn insert_logs_prunes_old_rows_when_thread_exceeds_size_limit() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let six_mebibytes = "a".repeat(6 * 1024 * 1024); + runtime + .insert_logs(&[ + LogEntry { + ts: 1, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes.clone()), + thread_id: Some("thread-1".to_string()), + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(1), + module_path: Some("mod".to_string()), + }, + LogEntry { + ts: 2, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes.clone()), + thread_id: Some("thread-1".to_string()), + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(2), + module_path: Some("mod".to_string()), + }, + ]) + .await + .expect("insert test logs"); + + let rows = runtime + .query_logs(&LogQuery { + thread_ids: vec!["thread-1".to_string()], + ..Default::default() + }) + .await + .expect("query thread logs"); + + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].ts, 2); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn insert_logs_prunes_single_thread_row_when_it_exceeds_size_limit() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let eleven_mebibytes = "d".repeat(11 * 1024 * 1024); + runtime + .insert_logs(&[LogEntry { + ts: 1, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(eleven_mebibytes), + thread_id: Some("thread-oversized".to_string()), + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(1), + module_path: Some("mod".to_string()), + }]) + .await + .expect("insert test log"); + + let rows = runtime + .query_logs(&LogQuery { + thread_ids: vec!["thread-oversized".to_string()], + ..Default::default() + }) + .await + .expect("query thread logs"); + + assert!(rows.is_empty()); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn insert_logs_prunes_threadless_rows_per_process_uuid_only() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let six_mebibytes = "b".repeat(6 * 1024 * 1024); + runtime + .insert_logs(&[ + LogEntry { + ts: 1, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes.clone()), + thread_id: None, + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(1), + module_path: Some("mod".to_string()), + }, + LogEntry { + ts: 2, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes.clone()), + thread_id: None, + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(2), + module_path: Some("mod".to_string()), + }, + LogEntry { + ts: 3, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes), + thread_id: Some("thread-1".to_string()), + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(3), + module_path: Some("mod".to_string()), + }, + ]) + .await + .expect("insert test logs"); + + let rows = runtime + .query_logs(&LogQuery { + thread_ids: vec!["thread-1".to_string()], + include_threadless: true, + ..Default::default() + }) + .await + .expect("query thread and threadless logs"); + + let mut timestamps: Vec = rows.into_iter().map(|row| row.ts).collect(); + timestamps.sort_unstable(); + assert_eq!(timestamps, vec![2, 3]); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn insert_logs_prunes_single_threadless_process_row_when_it_exceeds_size_limit() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let eleven_mebibytes = "e".repeat(11 * 1024 * 1024); + runtime + .insert_logs(&[LogEntry { + ts: 1, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(eleven_mebibytes), + thread_id: None, + process_uuid: Some("proc-oversized".to_string()), + file: Some("main.rs".to_string()), + line: Some(1), + module_path: Some("mod".to_string()), + }]) + .await + .expect("insert test log"); + + let rows = runtime + .query_logs(&LogQuery { + include_threadless: true, + ..Default::default() + }) + .await + .expect("query threadless logs"); + + assert!(rows.is_empty()); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn insert_logs_prunes_threadless_rows_with_null_process_uuid() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let six_mebibytes = "c".repeat(6 * 1024 * 1024); + runtime + .insert_logs(&[ + LogEntry { + ts: 1, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes.clone()), + thread_id: None, + process_uuid: None, + file: Some("main.rs".to_string()), + line: Some(1), + module_path: Some("mod".to_string()), + }, + LogEntry { + ts: 2, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(six_mebibytes), + thread_id: None, + process_uuid: None, + file: Some("main.rs".to_string()), + line: Some(2), + module_path: Some("mod".to_string()), + }, + LogEntry { + ts: 3, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some("small".to_string()), + thread_id: None, + process_uuid: Some("proc-1".to_string()), + file: Some("main.rs".to_string()), + line: Some(3), + module_path: Some("mod".to_string()), + }, + ]) + .await + .expect("insert test logs"); + + let rows = runtime + .query_logs(&LogQuery { + include_threadless: true, + ..Default::default() + }) + .await + .expect("query threadless logs"); + + let mut timestamps: Vec = rows.into_iter().map(|row| row.ts).collect(); + timestamps.sort_unstable(); + assert_eq!(timestamps, vec![2, 3]); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn insert_logs_prunes_single_threadless_null_process_row_when_it_exceeds_limit() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let eleven_mebibytes = "f".repeat(11 * 1024 * 1024); + runtime + .insert_logs(&[LogEntry { + ts: 1, + ts_nanos: 0, + level: "INFO".to_string(), + target: "cli".to_string(), + message: Some(eleven_mebibytes), + thread_id: None, + process_uuid: None, + file: Some("main.rs".to_string()), + line: Some(1), + module_path: Some("mod".to_string()), + }]) + .await + .expect("insert test log"); + + let rows = runtime + .query_logs(&LogQuery { + include_threadless: true, + ..Default::default() + }) + .await + .expect("query threadless logs"); + + assert!(rows.is_empty()); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } +} diff --git a/codex-rs/state/src/runtime/memories.rs b/codex-rs/state/src/runtime/memories.rs index 859ad0d14..e9b1e714f 100644 --- a/codex-rs/state/src/runtime/memories.rs +++ b/codex-rs/state/src/runtime/memories.rs @@ -1,3 +1,5 @@ +use super::threads::push_thread_filters; +use super::threads::push_thread_order_and_limit; use super::*; use crate::model::Phase2InputSelection; use crate::model::Phase2JobClaimOutcome; @@ -13,6 +15,7 @@ use sqlx::Executor; use sqlx::QueryBuilder; use sqlx::Sqlite; use std::collections::HashSet; +use uuid::Uuid; const JOB_KIND_MEMORY_STAGE1: &str = "memory_stage1"; const JOB_KIND_MEMORY_CONSOLIDATE_GLOBAL: &str = "memory_consolidate_global"; @@ -1130,3 +1133,2252 @@ ON CONFLICT(kind, job_key) DO UPDATE SET Ok(()) } + +#[cfg(test)] +mod tests { + use super::StateRuntime; + use super::test_support::test_thread_metadata; + use super::test_support::unique_temp_dir; + use crate::model::Phase2JobClaimOutcome; + use crate::model::Stage1JobClaimOutcome; + use crate::model::Stage1StartupClaimParams; + use chrono::Duration; + use chrono::Utc; + use codex_protocol::ThreadId; + use pretty_assertions::assert_eq; + use sqlx::Row; + use std::sync::Arc; + use uuid::Uuid; + + #[tokio::test] + async fn stage1_claim_skips_when_up_to_date() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let metadata = test_thread_metadata(&codex_home, thread_id, codex_home.join("a")); + runtime + .upsert_thread(&metadata) + .await + .expect("upsert thread"); + + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + + let claim = runtime + .try_claim_stage1_job(thread_id, owner_a, 100, 3600, 64) + .await + .expect("claim stage1 job"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected claim outcome: {other:?}"), + }; + + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + ownership_token.as_str(), + 100, + "raw", + "sum", + None, + ) + .await + .expect("mark stage1 succeeded"), + "stage1 success should finalize for current token" + ); + + let up_to_date = runtime + .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) + .await + .expect("claim stage1 up-to-date"); + assert_eq!(up_to_date, Stage1JobClaimOutcome::SkippedUpToDate); + + let needs_rerun = runtime + .try_claim_stage1_job(thread_id, owner_b, 101, 3600, 64) + .await + .expect("claim stage1 newer source"); + assert!( + matches!(needs_rerun, Stage1JobClaimOutcome::Claimed { .. }), + "newer source_updated_at should be claimable" + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn stage1_running_stale_can_be_stolen_but_fresh_running_is_skipped() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let cwd = codex_home.join("workspace"); + runtime + .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd)) + .await + .expect("upsert thread"); + + let claim_a = runtime + .try_claim_stage1_job(thread_id, owner_a, 100, 3600, 64) + .await + .expect("claim a"); + assert!(matches!(claim_a, Stage1JobClaimOutcome::Claimed { .. })); + + let claim_b_fresh = runtime + .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) + .await + .expect("claim b fresh"); + assert_eq!(claim_b_fresh, Stage1JobClaimOutcome::SkippedRunning); + + sqlx::query("UPDATE jobs SET lease_until = 0 WHERE kind = 'memory_stage1' AND job_key = ?") + .bind(thread_id.to_string()) + .execute(runtime.pool.as_ref()) + .await + .expect("force stale lease"); + + let claim_b_stale = runtime + .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) + .await + .expect("claim b stale"); + assert!(matches!( + claim_b_stale, + Stage1JobClaimOutcome::Claimed { .. } + )); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn stage1_concurrent_claim_for_same_thread_is_conflict_safe() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let thread_id_a = thread_id; + let thread_id_b = thread_id; + let runtime_a = Arc::clone(&runtime); + let runtime_b = Arc::clone(&runtime); + let claim_with_retry = |runtime: Arc, + thread_id: ThreadId, + owner: ThreadId| async move { + for attempt in 0..5 { + match runtime + .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64) + .await + { + Ok(outcome) => return outcome, + Err(err) if err.to_string().contains("database is locked") && attempt < 4 => { + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + Err(err) => panic!("claim stage1 should not fail: {err}"), + } + } + panic!("claim stage1 should have returned within retry budget") + }; + + let (claim_a, claim_b) = tokio::join!( + claim_with_retry(runtime_a, thread_id_a, owner_a), + claim_with_retry(runtime_b, thread_id_b, owner_b), + ); + + let claim_outcomes = vec![claim_a, claim_b]; + let claimed_count = claim_outcomes + .iter() + .filter(|outcome| matches!(outcome, Stage1JobClaimOutcome::Claimed { .. })) + .count(); + assert_eq!(claimed_count, 1); + assert!( + claim_outcomes.iter().all(|outcome| { + matches!( + outcome, + Stage1JobClaimOutcome::Claimed { .. } | Stage1JobClaimOutcome::SkippedRunning + ) + }), + "unexpected claim outcomes: {claim_outcomes:?}" + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn stage1_concurrent_claims_respect_running_cap() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let thread_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_a, + codex_home.join("workspace-a"), + )) + .await + .expect("upsert thread a"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_b, + codex_home.join("workspace-b"), + )) + .await + .expect("upsert thread b"); + + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let runtime_a = Arc::clone(&runtime); + let runtime_b = Arc::clone(&runtime); + + let (claim_a, claim_b) = tokio::join!( + async move { + runtime_a + .try_claim_stage1_job(thread_a, owner_a, 100, 3_600, 1) + .await + .expect("claim stage1 thread a") + }, + async move { + runtime_b + .try_claim_stage1_job(thread_b, owner_b, 101, 3_600, 1) + .await + .expect("claim stage1 thread b") + }, + ); + + let claim_outcomes = vec![claim_a, claim_b]; + let claimed_count = claim_outcomes + .iter() + .filter(|outcome| matches!(outcome, Stage1JobClaimOutcome::Claimed { .. })) + .count(); + assert_eq!(claimed_count, 1); + assert!( + claim_outcomes + .iter() + .any(|outcome| { matches!(outcome, Stage1JobClaimOutcome::SkippedRunning) }), + "one concurrent claim should be throttled by running cap: {claim_outcomes:?}" + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn claim_stage1_jobs_filters_by_age_idle_and_current_thread() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let now = Utc::now(); + let fresh_at = now - Duration::hours(1); + let just_under_idle_at = now - Duration::hours(12) + Duration::minutes(1); + let eligible_idle_at = now - Duration::hours(12) - Duration::minutes(1); + let old_at = now - Duration::days(31); + + let current_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); + let fresh_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("fresh thread id"); + let just_under_idle_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("just under idle thread id"); + let eligible_idle_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("eligible idle thread id"); + let old_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("old thread id"); + + let mut current = + test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current")); + current.created_at = now; + current.updated_at = now; + runtime + .upsert_thread(¤t) + .await + .expect("upsert current"); + + let mut fresh = + test_thread_metadata(&codex_home, fresh_thread_id, codex_home.join("fresh")); + fresh.created_at = fresh_at; + fresh.updated_at = fresh_at; + runtime.upsert_thread(&fresh).await.expect("upsert fresh"); + + let mut just_under_idle = test_thread_metadata( + &codex_home, + just_under_idle_thread_id, + codex_home.join("just-under-idle"), + ); + just_under_idle.created_at = just_under_idle_at; + just_under_idle.updated_at = just_under_idle_at; + runtime + .upsert_thread(&just_under_idle) + .await + .expect("upsert just-under-idle"); + + let mut eligible_idle = test_thread_metadata( + &codex_home, + eligible_idle_thread_id, + codex_home.join("eligible-idle"), + ); + eligible_idle.created_at = eligible_idle_at; + eligible_idle.updated_at = eligible_idle_at; + runtime + .upsert_thread(&eligible_idle) + .await + .expect("upsert eligible-idle"); + + let mut old = test_thread_metadata(&codex_home, old_thread_id, codex_home.join("old")); + old.created_at = old_at; + old.updated_at = old_at; + runtime.upsert_thread(&old).await.expect("upsert old"); + + let allowed_sources = vec!["cli".to_string()]; + let claims = runtime + .claim_stage1_jobs_for_startup( + current_thread_id, + Stage1StartupClaimParams { + scan_limit: 1, + max_claimed: 5, + max_age_days: 30, + min_rollout_idle_hours: 12, + allowed_sources: allowed_sources.as_slice(), + lease_seconds: 3600, + }, + ) + .await + .expect("claim stage1 jobs"); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].thread.id, eligible_idle_thread_id); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn claim_stage1_jobs_prefilters_threads_with_up_to_date_memory() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let now = Utc::now(); + let eligible_newer_at = now - Duration::hours(13); + let eligible_older_at = now - Duration::hours(14); + + let current_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); + let up_to_date_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("up-to-date thread id"); + let stale_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("stale thread id"); + let worker_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("worker id"); + + let mut current = + test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current")); + current.created_at = now; + current.updated_at = now; + runtime + .upsert_thread(¤t) + .await + .expect("upsert current thread"); + + let mut up_to_date = test_thread_metadata( + &codex_home, + up_to_date_thread_id, + codex_home.join("up-to-date"), + ); + up_to_date.created_at = eligible_newer_at; + up_to_date.updated_at = eligible_newer_at; + runtime + .upsert_thread(&up_to_date) + .await + .expect("upsert up-to-date thread"); + + let up_to_date_claim = runtime + .try_claim_stage1_job( + up_to_date_thread_id, + worker_id, + up_to_date.updated_at.timestamp(), + 3600, + 64, + ) + .await + .expect("claim up-to-date thread for seed"); + let up_to_date_token = match up_to_date_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected seed claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + up_to_date_thread_id, + up_to_date_token.as_str(), + up_to_date.updated_at.timestamp(), + "raw", + "summary", + None, + ) + .await + .expect("mark up-to-date thread succeeded"), + "seed stage1 success should complete for up-to-date thread" + ); + + let mut stale = + test_thread_metadata(&codex_home, stale_thread_id, codex_home.join("stale")); + stale.created_at = eligible_older_at; + stale.updated_at = eligible_older_at; + runtime + .upsert_thread(&stale) + .await + .expect("upsert stale thread"); + + let allowed_sources = vec!["cli".to_string()]; + let claims = runtime + .claim_stage1_jobs_for_startup( + current_thread_id, + Stage1StartupClaimParams { + scan_limit: 1, + max_claimed: 1, + max_age_days: 30, + min_rollout_idle_hours: 12, + allowed_sources: allowed_sources.as_slice(), + lease_seconds: 3600, + }, + ) + .await + .expect("claim stage1 startup jobs"); + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].thread.id, stale_thread_id); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn claim_stage1_jobs_enforces_global_running_cap() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let current_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + current_thread_id, + codex_home.join("current"), + )) + .await + .expect("upsert current"); + + let now = Utc::now(); + let started_at = now.timestamp(); + let lease_until = started_at + 3600; + let eligible_at = now - Duration::hours(13); + let existing_running = 10usize; + let total_candidates = 80usize; + + for idx in 0..total_candidates { + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let mut metadata = test_thread_metadata( + &codex_home, + thread_id, + codex_home.join(format!("thread-{idx}")), + ); + metadata.created_at = eligible_at - Duration::seconds(idx as i64); + metadata.updated_at = eligible_at - Duration::seconds(idx as i64); + runtime + .upsert_thread(&metadata) + .await + .expect("upsert thread"); + + if idx < existing_running { + sqlx::query( + r#" +INSERT INTO jobs ( + kind, + job_key, + status, + worker_id, + ownership_token, + started_at, + finished_at, + lease_until, + retry_at, + retry_remaining, + last_error, + input_watermark, + last_success_watermark +) VALUES (?, ?, 'running', ?, ?, ?, NULL, ?, NULL, ?, NULL, ?, NULL) + "#, + ) + .bind("memory_stage1") + .bind(thread_id.to_string()) + .bind(current_thread_id.to_string()) + .bind(Uuid::new_v4().to_string()) + .bind(started_at) + .bind(lease_until) + .bind(3) + .bind(metadata.updated_at.timestamp()) + .execute(runtime.pool.as_ref()) + .await + .expect("seed running stage1 job"); + } + } + + let allowed_sources = vec!["cli".to_string()]; + let claims = runtime + .claim_stage1_jobs_for_startup( + current_thread_id, + Stage1StartupClaimParams { + scan_limit: 200, + max_claimed: 64, + max_age_days: 30, + min_rollout_idle_hours: 12, + allowed_sources: allowed_sources.as_slice(), + lease_seconds: 3600, + }, + ) + .await + .expect("claim stage1 jobs"); + assert_eq!(claims.len(), 54); + + let running_count = sqlx::query( + r#" +SELECT COUNT(*) AS count +FROM jobs +WHERE kind = 'memory_stage1' + AND status = 'running' + AND lease_until IS NOT NULL + AND lease_until > ? + "#, + ) + .bind(Utc::now().timestamp()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("count running stage1 jobs") + .try_get::("count") + .expect("running count value"); + assert_eq!(running_count, 64); + + let more_claims = runtime + .claim_stage1_jobs_for_startup( + current_thread_id, + Stage1StartupClaimParams { + scan_limit: 200, + max_claimed: 64, + max_age_days: 30, + min_rollout_idle_hours: 12, + allowed_sources: allowed_sources.as_slice(), + lease_seconds: 3600, + }, + ) + .await + .expect("claim stage1 jobs with cap reached"); + assert_eq!(more_claims.len(), 0); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn claim_stage1_jobs_processes_two_full_batches_across_startup_passes() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let current_thread_id = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id"); + let mut current = + test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current")); + current.created_at = Utc::now(); + current.updated_at = Utc::now(); + runtime + .upsert_thread(¤t) + .await + .expect("upsert current"); + + let eligible_at = Utc::now() - Duration::hours(13); + for idx in 0..200 { + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let mut metadata = test_thread_metadata( + &codex_home, + thread_id, + codex_home.join(format!("thread-{idx}")), + ); + metadata.created_at = eligible_at - Duration::seconds(idx as i64); + metadata.updated_at = eligible_at - Duration::seconds(idx as i64); + runtime + .upsert_thread(&metadata) + .await + .expect("upsert eligible thread"); + } + + let allowed_sources = vec!["cli".to_string()]; + let first_claims = runtime + .claim_stage1_jobs_for_startup( + current_thread_id, + Stage1StartupClaimParams { + scan_limit: 5_000, + max_claimed: 64, + max_age_days: 30, + min_rollout_idle_hours: 12, + allowed_sources: allowed_sources.as_slice(), + lease_seconds: 3_600, + }, + ) + .await + .expect("first stage1 startup claim"); + assert_eq!(first_claims.len(), 64); + + for claim in first_claims { + assert!( + runtime + .mark_stage1_job_succeeded( + claim.thread.id, + claim.ownership_token.as_str(), + claim.thread.updated_at.timestamp(), + "raw", + "summary", + None, + ) + .await + .expect("mark first-batch stage1 success"), + "first batch stage1 completion should succeed" + ); + } + + let second_claims = runtime + .claim_stage1_jobs_for_startup( + current_thread_id, + Stage1StartupClaimParams { + scan_limit: 5_000, + max_claimed: 64, + max_age_days: 30, + min_rollout_idle_hours: 12, + allowed_sources: allowed_sources.as_slice(), + lease_seconds: 3_600, + }, + ) + .await + .expect("second stage1 startup claim"); + assert_eq!(second_claims.len(), 64); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn stage1_output_cascades_on_thread_delete() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let cwd = codex_home.join("workspace"); + runtime + .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd)) + .await + .expect("upsert thread"); + + let claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) + .await + .expect("claim stage1"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + ownership_token.as_str(), + 100, + "raw", + "sum", + None, + ) + .await + .expect("mark stage1 succeeded"), + "mark stage1 succeeded should write stage1_outputs" + ); + + let count_before = + sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("count before delete") + .try_get::("count") + .expect("count value"); + assert_eq!(count_before, 1); + + sqlx::query("DELETE FROM threads WHERE id = ?") + .bind(thread_id.to_string()) + .execute(runtime.pool.as_ref()) + .await + .expect("delete thread"); + + let count_after = + sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("count after delete") + .try_get::("count") + .expect("count value"); + assert_eq!(count_after, 0); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn mark_stage1_job_succeeded_no_output_skips_phase2_when_output_was_already_absent() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + let claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) + .await + .expect("claim stage1"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded_no_output(thread_id, ownership_token.as_str()) + .await + .expect("mark stage1 succeeded without output"), + "stage1 no-output success should complete the job" + ); + + let output_row_count = + sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load stage1 output count") + .try_get::("count") + .expect("stage1 output count"); + assert_eq!( + output_row_count, 0, + "stage1 no-output success should not persist empty stage1 outputs" + ); + + let up_to_date = runtime + .try_claim_stage1_job(thread_id, owner_b, 100, 3600, 64) + .await + .expect("claim stage1 up-to-date"); + assert_eq!(up_to_date, Stage1JobClaimOutcome::SkippedUpToDate); + + let global_job_row_count = sqlx::query("SELECT COUNT(*) AS count FROM jobs WHERE kind = ?") + .bind("memory_consolidate_global") + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load phase2 job row count") + .try_get::("count") + .expect("phase2 job row count"); + assert_eq!( + global_job_row_count, 0, + "no-output without an existing stage1 output should not enqueue phase2" + ); + + let claim_phase2 = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2"); + assert_eq!( + claim_phase2, + Phase2JobClaimOutcome::SkippedNotDirty, + "phase2 should remain clean when no-output deleted nothing" + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn mark_stage1_job_succeeded_no_output_enqueues_phase2_when_deleting_output() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + let first_claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) + .await + .expect("claim initial stage1"); + let first_token = match first_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected initial stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded(thread_id, first_token.as_str(), 100, "raw", "sum", None) + .await + .expect("mark initial stage1 succeeded"), + "initial stage1 success should create stage1 output" + ); + + let phase2_claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2 after initial output"); + let (phase2_token, phase2_input_watermark) = match phase2_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim after initial output: {other:?}"), + }; + assert_eq!(phase2_input_watermark, 100); + assert!( + runtime + .mark_global_phase2_job_succeeded( + phase2_token.as_str(), + phase2_input_watermark, + &[], + ) + .await + .expect("mark initial phase2 succeeded"), + "initial phase2 success should clear global dirty state" + ); + + let no_output_claim = runtime + .try_claim_stage1_job(thread_id, owner_b, 101, 3600, 64) + .await + .expect("claim stage1 for no-output delete"); + let no_output_token = match no_output_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected no-output stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded_no_output(thread_id, no_output_token.as_str()) + .await + .expect("mark stage1 no-output after existing output"), + "no-output should succeed when deleting an existing stage1 output" + ); + + let output_row_count = + sqlx::query("SELECT COUNT(*) AS count FROM stage1_outputs WHERE thread_id = ?") + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load stage1 output count after delete") + .try_get::("count") + .expect("stage1 output count"); + assert_eq!(output_row_count, 0); + + let claim_phase2 = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2 after no-output deletion"); + let (phase2_token, phase2_input_watermark) = match claim_phase2 { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim after no-output deletion: {other:?}"), + }; + assert_eq!(phase2_input_watermark, 101); + assert!( + runtime + .mark_global_phase2_job_succeeded( + phase2_token.as_str(), + phase2_input_watermark, + &[], + ) + .await + .expect("mark phase2 succeeded after no-output delete") + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn stage1_retry_exhaustion_does_not_block_newer_watermark() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + for attempt in 0..3 { + let claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64) + .await + .expect("claim stage1 for retry exhaustion"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!( + "attempt {} should claim stage1 before retries are exhausted: {other:?}", + attempt + 1 + ), + }; + assert!( + runtime + .mark_stage1_job_failed(thread_id, ownership_token.as_str(), "boom", 0) + .await + .expect("mark stage1 failed"), + "attempt {} should decrement retry budget", + attempt + 1 + ); + } + + let exhausted_claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64) + .await + .expect("claim stage1 after retry exhaustion"); + assert_eq!( + exhausted_claim, + Stage1JobClaimOutcome::SkippedRetryExhausted + ); + + let newer_source_claim = runtime + .try_claim_stage1_job(thread_id, owner, 101, 3_600, 64) + .await + .expect("claim stage1 with newer source watermark"); + assert!( + matches!(newer_source_claim, Stage1JobClaimOutcome::Claimed { .. }), + "newer source watermark should reset retry budget and be claimable" + ); + + let job_row = sqlx::query( + "SELECT retry_remaining, input_watermark FROM jobs WHERE kind = ? AND job_key = ?", + ) + .bind("memory_stage1") + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load stage1 job row after newer-source claim"); + assert_eq!( + job_row + .try_get::("retry_remaining") + .expect("retry_remaining"), + 3 + ); + assert_eq!( + job_row + .try_get::("input_watermark") + .expect("input_watermark"), + 101 + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn phase2_global_consolidation_reruns_when_watermark_advances() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + + runtime + .enqueue_global_consolidation(100) + .await + .expect("enqueue global consolidation"); + + let claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2"); + let (ownership_token, input_watermark) = match claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_global_phase2_job_succeeded(ownership_token.as_str(), input_watermark, &[],) + .await + .expect("mark phase2 succeeded"), + "phase2 success should finalize for current token" + ); + + let claim_up_to_date = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2 up-to-date"); + assert_eq!(claim_up_to_date, Phase2JobClaimOutcome::SkippedNotDirty); + + runtime + .enqueue_global_consolidation(101) + .await + .expect("enqueue global consolidation again"); + + let claim_rerun = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2 rerun"); + assert!( + matches!(claim_rerun, Phase2JobClaimOutcome::Claimed { .. }), + "advanced watermark should be claimable" + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn list_stage1_outputs_for_global_returns_latest_outputs() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let thread_id_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id_a, + codex_home.join("workspace-a"), + )) + .await + .expect("upsert thread a"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id_b, + codex_home.join("workspace-b"), + )) + .await + .expect("upsert thread b"); + + let claim = runtime + .try_claim_stage1_job(thread_id_a, owner, 100, 3600, 64) + .await + .expect("claim stage1 a"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id_a, + ownership_token.as_str(), + 100, + "raw memory a", + "summary a", + None, + ) + .await + .expect("mark stage1 succeeded a"), + "stage1 success should persist output a" + ); + + let claim = runtime + .try_claim_stage1_job(thread_id_b, owner, 101, 3600, 64) + .await + .expect("claim stage1 b"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id_b, + ownership_token.as_str(), + 101, + "raw memory b", + "summary b", + Some("rollout-b"), + ) + .await + .expect("mark stage1 succeeded b"), + "stage1 success should persist output b" + ); + + let outputs = runtime + .list_stage1_outputs_for_global(10) + .await + .expect("list stage1 outputs for global"); + assert_eq!(outputs.len(), 2); + assert_eq!(outputs[0].thread_id, thread_id_b); + assert_eq!(outputs[0].rollout_summary, "summary b"); + assert_eq!(outputs[0].rollout_slug.as_deref(), Some("rollout-b")); + assert_eq!(outputs[0].cwd, codex_home.join("workspace-b")); + assert_eq!(outputs[1].thread_id, thread_id_a); + assert_eq!(outputs[1].rollout_summary, "summary a"); + assert_eq!(outputs[1].rollout_slug, None); + assert_eq!(outputs[1].cwd, codex_home.join("workspace-a")); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn list_stage1_outputs_for_global_skips_empty_payloads() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id_non_empty = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let thread_id_empty = + ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id_non_empty, + codex_home.join("workspace-non-empty"), + )) + .await + .expect("upsert non-empty thread"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id_empty, + codex_home.join("workspace-empty"), + )) + .await + .expect("upsert empty thread"); + + sqlx::query( + r#" +INSERT INTO stage1_outputs (thread_id, source_updated_at, raw_memory, rollout_summary, generated_at) +VALUES (?, ?, ?, ?, ?) + "#, + ) + .bind(thread_id_non_empty.to_string()) + .bind(100_i64) + .bind("raw memory") + .bind("summary") + .bind(100_i64) + .execute(runtime.pool.as_ref()) + .await + .expect("insert non-empty stage1 output"); + sqlx::query( + r#" +INSERT INTO stage1_outputs (thread_id, source_updated_at, raw_memory, rollout_summary, generated_at) +VALUES (?, ?, ?, ?, ?) + "#, + ) + .bind(thread_id_empty.to_string()) + .bind(101_i64) + .bind("") + .bind("") + .bind(101_i64) + .execute(runtime.pool.as_ref()) + .await + .expect("insert empty stage1 output"); + + let outputs = runtime + .list_stage1_outputs_for_global(1) + .await + .expect("list stage1 outputs for global"); + assert_eq!(outputs.len(), 1); + assert_eq!(outputs[0].thread_id, thread_id_non_empty); + assert_eq!(outputs[0].rollout_summary, "summary"); + assert_eq!(outputs[0].cwd, codex_home.join("workspace-non-empty")); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn get_phase2_input_selection_reports_added_retained_and_removed_rows() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let thread_id_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let thread_id_c = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + + for (thread_id, workspace) in [ + (thread_id_a, "workspace-a"), + (thread_id_b, "workspace-b"), + (thread_id_c, "workspace-c"), + ] { + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join(workspace), + )) + .await + .expect("upsert thread"); + } + + for (thread_id, updated_at, slug) in [ + (thread_id_a, 100, Some("rollout-a")), + (thread_id_b, 101, Some("rollout-b")), + (thread_id_c, 102, Some("rollout-c")), + ] { + let claim = runtime + .try_claim_stage1_job(thread_id, owner, updated_at, 3600, 64) + .await + .expect("claim stage1"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + ownership_token.as_str(), + updated_at, + &format!("raw-{updated_at}"), + &format!("summary-{updated_at}"), + slug, + ) + .await + .expect("mark stage1 succeeded"), + "stage1 success should persist output" + ); + } + + let claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2"); + let (ownership_token, input_watermark) = match claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim outcome: {other:?}"), + }; + assert_eq!(input_watermark, 102); + let selected_outputs = runtime + .list_stage1_outputs_for_global(10) + .await + .expect("list stage1 outputs for global") + .into_iter() + .filter(|output| output.thread_id == thread_id_c || output.thread_id == thread_id_a) + .collect::>(); + assert!( + runtime + .mark_global_phase2_job_succeeded( + ownership_token.as_str(), + input_watermark, + &selected_outputs, + ) + .await + .expect("mark phase2 success with selection"), + "phase2 success should persist selected rows" + ); + + let selection = runtime + .get_phase2_input_selection(2) + .await + .expect("load phase2 input selection"); + + assert_eq!(selection.selected.len(), 2); + assert_eq!(selection.previous_selected.len(), 2); + assert_eq!(selection.selected[0].thread_id, thread_id_c); + assert_eq!( + selection.selected[0].rollout_path, + codex_home.join(format!("rollout-{thread_id_c}.jsonl")) + ); + assert_eq!(selection.selected[1].thread_id, thread_id_b); + assert_eq!(selection.retained_thread_ids, vec![thread_id_c]); + + assert_eq!(selection.removed.len(), 1); + assert_eq!(selection.removed[0].thread_id, thread_id_a); + assert_eq!( + selection.removed[0].rollout_slug.as_deref(), + Some("rollout-a") + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn get_phase2_input_selection_treats_regenerated_selected_rows_as_added() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + let first_claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) + .await + .expect("claim initial stage1"); + let first_token = match first_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + first_token.as_str(), + 100, + "raw-100", + "summary-100", + Some("rollout-100"), + ) + .await + .expect("mark initial stage1 success"), + "initial stage1 success should persist output" + ); + + let phase2_claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2"); + let (phase2_token, input_watermark) = match phase2_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim outcome: {other:?}"), + }; + let selected_outputs = runtime + .list_stage1_outputs_for_global(1) + .await + .expect("list selected outputs"); + assert!( + runtime + .mark_global_phase2_job_succeeded( + phase2_token.as_str(), + input_watermark, + &selected_outputs, + ) + .await + .expect("mark phase2 success"), + "phase2 success should persist selected rows" + ); + + let refreshed_claim = runtime + .try_claim_stage1_job(thread_id, owner, 101, 3600, 64) + .await + .expect("claim refreshed stage1"); + let refreshed_token = match refreshed_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + refreshed_token.as_str(), + 101, + "raw-101", + "summary-101", + Some("rollout-101"), + ) + .await + .expect("mark refreshed stage1 success"), + "refreshed stage1 success should persist output" + ); + + let selection = runtime + .get_phase2_input_selection(1) + .await + .expect("load phase2 input selection"); + assert_eq!(selection.selected.len(), 1); + assert_eq!(selection.previous_selected.len(), 1); + assert_eq!(selection.selected[0].thread_id, thread_id); + assert_eq!(selection.selected[0].source_updated_at.timestamp(), 101); + assert!(selection.retained_thread_ids.is_empty()); + assert!(selection.removed.is_empty()); + + let (selected_for_phase2, selected_for_phase2_source_updated_at) = + sqlx::query_as::<_, (i64, Option)>( + "SELECT selected_for_phase2, selected_for_phase2_source_updated_at FROM stage1_outputs WHERE thread_id = ?", + ) + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load selected_for_phase2"); + assert_eq!(selected_for_phase2, 1); + assert_eq!(selected_for_phase2_source_updated_at, Some(100)); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn get_phase2_input_selection_reports_regenerated_previous_selection_as_removed() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread a"); + let thread_id_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread b"); + let thread_id_c = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread c"); + let thread_id_d = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread d"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + + for (thread_id, workspace) in [ + (thread_id_a, "workspace-a"), + (thread_id_b, "workspace-b"), + (thread_id_c, "workspace-c"), + (thread_id_d, "workspace-d"), + ] { + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join(workspace), + )) + .await + .expect("upsert thread"); + } + + for (thread_id, updated_at, slug) in [ + (thread_id_a, 100, Some("rollout-a-100")), + (thread_id_b, 101, Some("rollout-b-101")), + (thread_id_c, 99, Some("rollout-c-99")), + (thread_id_d, 98, Some("rollout-d-98")), + ] { + let claim = runtime + .try_claim_stage1_job(thread_id, owner, updated_at, 3600, 64) + .await + .expect("claim initial stage1"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + ownership_token.as_str(), + updated_at, + &format!("raw-{updated_at}"), + &format!("summary-{updated_at}"), + slug, + ) + .await + .expect("mark stage1 succeeded"), + "stage1 success should persist output" + ); + } + + let phase2_claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2"); + let (phase2_token, input_watermark) = match phase2_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim outcome: {other:?}"), + }; + let selected_outputs = runtime + .list_stage1_outputs_for_global(2) + .await + .expect("list selected outputs"); + assert_eq!( + selected_outputs + .iter() + .map(|output| output.thread_id) + .collect::>(), + vec![thread_id_b, thread_id_a] + ); + assert!( + runtime + .mark_global_phase2_job_succeeded( + phase2_token.as_str(), + input_watermark, + &selected_outputs, + ) + .await + .expect("mark phase2 success"), + "phase2 success should persist selected rows" + ); + + for (thread_id, updated_at, slug) in [ + (thread_id_a, 102, Some("rollout-a-102")), + (thread_id_c, 103, Some("rollout-c-103")), + (thread_id_d, 104, Some("rollout-d-104")), + ] { + let claim = runtime + .try_claim_stage1_job(thread_id, owner, updated_at, 3600, 64) + .await + .expect("claim refreshed stage1"); + let ownership_token = match claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + ownership_token.as_str(), + updated_at, + &format!("raw-{updated_at}"), + &format!("summary-{updated_at}"), + slug, + ) + .await + .expect("mark refreshed stage1 success"), + "refreshed stage1 success should persist output" + ); + } + + let selection = runtime + .get_phase2_input_selection(2) + .await + .expect("load phase2 input selection"); + assert_eq!( + selection + .selected + .iter() + .map(|output| output.thread_id) + .collect::>(), + vec![thread_id_d, thread_id_c] + ); + assert_eq!( + selection + .previous_selected + .iter() + .map(|output| output.thread_id) + .collect::>(), + vec![thread_id_a, thread_id_b] + ); + assert!(selection.retained_thread_ids.is_empty()); + assert_eq!( + selection + .removed + .iter() + .map(|output| (output.thread_id, output.source_updated_at.timestamp())) + .collect::>(), + vec![(thread_id_a, 102), (thread_id_b, 101)] + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn mark_global_phase2_job_succeeded_updates_selected_snapshot_timestamp() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + let initial_claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) + .await + .expect("claim initial stage1"); + let initial_token = match initial_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + initial_token.as_str(), + 100, + "raw-100", + "summary-100", + Some("rollout-100"), + ) + .await + .expect("mark initial stage1 success"), + "initial stage1 success should persist output" + ); + + let first_phase2_claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim first phase2"); + let (first_phase2_token, first_input_watermark) = match first_phase2_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected first phase2 claim outcome: {other:?}"), + }; + let first_selected_outputs = runtime + .list_stage1_outputs_for_global(1) + .await + .expect("list first selected outputs"); + assert!( + runtime + .mark_global_phase2_job_succeeded( + first_phase2_token.as_str(), + first_input_watermark, + &first_selected_outputs, + ) + .await + .expect("mark first phase2 success"), + "first phase2 success should persist selected rows" + ); + + let refreshed_claim = runtime + .try_claim_stage1_job(thread_id, owner, 101, 3600, 64) + .await + .expect("claim refreshed stage1"); + let refreshed_token = match refreshed_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected refreshed stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + refreshed_token.as_str(), + 101, + "raw-101", + "summary-101", + Some("rollout-101"), + ) + .await + .expect("mark refreshed stage1 success"), + "refreshed stage1 success should persist output" + ); + + let second_phase2_claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim second phase2"); + let (second_phase2_token, second_input_watermark) = match second_phase2_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected second phase2 claim outcome: {other:?}"), + }; + let second_selected_outputs = runtime + .list_stage1_outputs_for_global(1) + .await + .expect("list second selected outputs"); + assert_eq!( + second_selected_outputs[0].source_updated_at.timestamp(), + 101 + ); + assert!( + runtime + .mark_global_phase2_job_succeeded( + second_phase2_token.as_str(), + second_input_watermark, + &second_selected_outputs, + ) + .await + .expect("mark second phase2 success"), + "second phase2 success should persist selected rows" + ); + + let selection = runtime + .get_phase2_input_selection(1) + .await + .expect("load phase2 input selection after refresh"); + assert_eq!(selection.retained_thread_ids, vec![thread_id]); + + let (selected_for_phase2, selected_for_phase2_source_updated_at) = + sqlx::query_as::<_, (i64, Option)>( + "SELECT selected_for_phase2, selected_for_phase2_source_updated_at FROM stage1_outputs WHERE thread_id = ?", + ) + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load selected snapshot after phase2"); + assert_eq!(selected_for_phase2, 1); + assert_eq!(selected_for_phase2_source_updated_at, Some(101)); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn mark_global_phase2_job_succeeded_only_marks_exact_selected_snapshots() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_id, + codex_home.join("workspace"), + )) + .await + .expect("upsert thread"); + + let initial_claim = runtime + .try_claim_stage1_job(thread_id, owner, 100, 3600, 64) + .await + .expect("claim initial stage1"); + let initial_token = match initial_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + initial_token.as_str(), + 100, + "raw-100", + "summary-100", + Some("rollout-100"), + ) + .await + .expect("mark initial stage1 success"), + "initial stage1 success should persist output" + ); + + let phase2_claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim phase2"); + let (phase2_token, input_watermark) = match phase2_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected phase2 claim outcome: {other:?}"), + }; + let selected_outputs = runtime + .list_stage1_outputs_for_global(1) + .await + .expect("list selected outputs"); + assert_eq!(selected_outputs[0].source_updated_at.timestamp(), 100); + + let refreshed_claim = runtime + .try_claim_stage1_job(thread_id, owner, 101, 3600, 64) + .await + .expect("claim refreshed stage1"); + let refreshed_token = match refreshed_claim { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_id, + refreshed_token.as_str(), + 101, + "raw-101", + "summary-101", + Some("rollout-101"), + ) + .await + .expect("mark refreshed stage1 success"), + "refreshed stage1 success should persist output" + ); + + assert!( + runtime + .mark_global_phase2_job_succeeded( + phase2_token.as_str(), + input_watermark, + &selected_outputs, + ) + .await + .expect("mark phase2 success"), + "phase2 success should still complete" + ); + + let (selected_for_phase2, selected_for_phase2_source_updated_at) = + sqlx::query_as::<_, (i64, Option)>( + "SELECT selected_for_phase2, selected_for_phase2_source_updated_at FROM stage1_outputs WHERE thread_id = ?", + ) + .bind(thread_id.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load selected_for_phase2"); + assert_eq!(selected_for_phase2, 0); + assert_eq!(selected_for_phase2_source_updated_at, None); + + let selection = runtime + .get_phase2_input_selection(1) + .await + .expect("load phase2 input selection"); + assert_eq!(selection.selected.len(), 1); + assert_eq!(selection.selected[0].source_updated_at.timestamp(), 101); + assert!(selection.retained_thread_ids.is_empty()); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn record_stage1_output_usage_updates_usage_metadata() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id a"); + let thread_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id b"); + let missing = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("missing id"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_a, + codex_home.join("workspace-a"), + )) + .await + .expect("upsert thread a"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_b, + codex_home.join("workspace-b"), + )) + .await + .expect("upsert thread b"); + + let claim_a = runtime + .try_claim_stage1_job(thread_a, owner, 100, 3600, 64) + .await + .expect("claim stage1 a"); + let token_a = match claim_a { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome for a: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded(thread_a, token_a.as_str(), 100, "raw a", "sum a", None) + .await + .expect("mark stage1 succeeded a") + ); + + let claim_b = runtime + .try_claim_stage1_job(thread_b, owner, 101, 3600, 64) + .await + .expect("claim stage1 b"); + let token_b = match claim_b { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome for b: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded(thread_b, token_b.as_str(), 101, "raw b", "sum b", None) + .await + .expect("mark stage1 succeeded b") + ); + + let updated_rows = runtime + .record_stage1_output_usage(&[thread_a, thread_a, thread_b, missing]) + .await + .expect("record stage1 output usage"); + assert_eq!(updated_rows, 3); + + let row_a = + sqlx::query("SELECT usage_count, last_usage FROM stage1_outputs WHERE thread_id = ?") + .bind(thread_a.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load stage1 usage row a"); + let row_b = + sqlx::query("SELECT usage_count, last_usage FROM stage1_outputs WHERE thread_id = ?") + .bind(thread_b.to_string()) + .fetch_one(runtime.pool.as_ref()) + .await + .expect("load stage1 usage row b"); + + assert_eq!( + row_a + .try_get::("usage_count") + .expect("usage_count a"), + 2 + ); + assert_eq!( + row_b + .try_get::("usage_count") + .expect("usage_count b"), + 1 + ); + + let last_usage_a = row_a.try_get::("last_usage").expect("last_usage a"); + let last_usage_b = row_b.try_get::("last_usage").expect("last_usage b"); + assert_eq!(last_usage_a, last_usage_b); + assert!(last_usage_a > 0); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn mark_stage1_job_succeeded_enqueues_global_consolidation() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + let thread_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id a"); + let thread_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id b"); + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id"); + + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_a, + codex_home.join("workspace-a"), + )) + .await + .expect("upsert thread a"); + runtime + .upsert_thread(&test_thread_metadata( + &codex_home, + thread_b, + codex_home.join("workspace-b"), + )) + .await + .expect("upsert thread b"); + + let claim_a = runtime + .try_claim_stage1_job(thread_a, owner, 100, 3600, 64) + .await + .expect("claim stage1 a"); + let token_a = match claim_a { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome for thread a: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_a, + token_a.as_str(), + 100, + "raw-a", + "summary-a", + None, + ) + .await + .expect("mark stage1 succeeded a"), + "stage1 success should persist output for thread a" + ); + + let claim_b = runtime + .try_claim_stage1_job(thread_b, owner, 101, 3600, 64) + .await + .expect("claim stage1 b"); + let token_b = match claim_b { + Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token, + other => panic!("unexpected stage1 claim outcome for thread b: {other:?}"), + }; + assert!( + runtime + .mark_stage1_job_succeeded( + thread_b, + token_b.as_str(), + 101, + "raw-b", + "summary-b", + None, + ) + .await + .expect("mark stage1 succeeded b"), + "stage1 success should persist output for thread b" + ); + + let claim = runtime + .try_claim_global_phase2_job(owner, 3600) + .await + .expect("claim global consolidation"); + let input_watermark = match claim { + Phase2JobClaimOutcome::Claimed { + input_watermark, .. + } => input_watermark, + other => panic!("unexpected global consolidation claim outcome: {other:?}"), + }; + assert_eq!(input_watermark, 101); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn phase2_global_lock_allows_only_one_fresh_runner() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + runtime + .enqueue_global_consolidation(200) + .await + .expect("enqueue global consolidation"); + + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b"); + + let running_claim = runtime + .try_claim_global_phase2_job(owner_a, 3600) + .await + .expect("claim global lock"); + assert!( + matches!(running_claim, Phase2JobClaimOutcome::Claimed { .. }), + "first owner should claim global lock" + ); + + let second_claim = runtime + .try_claim_global_phase2_job(owner_b, 3600) + .await + .expect("claim global lock from second owner"); + assert_eq!(second_claim, Phase2JobClaimOutcome::SkippedRunning); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn phase2_global_lock_stale_lease_allows_takeover() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + runtime + .enqueue_global_consolidation(300) + .await + .expect("enqueue global consolidation"); + + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a"); + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b"); + + let initial_claim = runtime + .try_claim_global_phase2_job(owner_a, 3600) + .await + .expect("claim initial global lock"); + let token_a = match initial_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, .. + } => ownership_token, + other => panic!("unexpected initial claim outcome: {other:?}"), + }; + + sqlx::query("UPDATE jobs SET lease_until = ? WHERE kind = ? AND job_key = ?") + .bind(Utc::now().timestamp() - 1) + .bind("memory_consolidate_global") + .bind("global") + .execute(runtime.pool.as_ref()) + .await + .expect("expire global consolidation lease"); + + let takeover_claim = runtime + .try_claim_global_phase2_job(owner_b, 3600) + .await + .expect("claim stale global lock"); + let (token_b, input_watermark) = match takeover_claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => (ownership_token, input_watermark), + other => panic!("unexpected takeover claim outcome: {other:?}"), + }; + assert_ne!(token_a, token_b); + assert_eq!(input_watermark, 300); + + assert_eq!( + runtime + .mark_global_phase2_job_succeeded(token_a.as_str(), 300, &[]) + .await + .expect("mark stale owner success result"), + false, + "stale owner should lose finalization ownership after takeover" + ); + assert!( + runtime + .mark_global_phase2_job_succeeded(token_b.as_str(), 300, &[]) + .await + .expect("mark takeover owner success"), + "takeover owner should finalize consolidation" + ); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn phase2_backfilled_inputs_below_last_success_still_become_dirty() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + runtime + .enqueue_global_consolidation(500) + .await + .expect("enqueue initial consolidation"); + let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a"); + let claim_a = runtime + .try_claim_global_phase2_job(owner_a, 3_600) + .await + .expect("claim initial consolidation"); + let token_a = match claim_a { + Phase2JobClaimOutcome::Claimed { + ownership_token, + input_watermark, + } => { + assert_eq!(input_watermark, 500); + ownership_token + } + other => panic!("unexpected initial phase2 claim outcome: {other:?}"), + }; + assert!( + runtime + .mark_global_phase2_job_succeeded(token_a.as_str(), 500, &[]) + .await + .expect("mark initial phase2 success"), + "initial phase2 success should finalize" + ); + + runtime + .enqueue_global_consolidation(400) + .await + .expect("enqueue backfilled consolidation"); + + let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b"); + let claim_b = runtime + .try_claim_global_phase2_job(owner_b, 3_600) + .await + .expect("claim backfilled consolidation"); + match claim_b { + Phase2JobClaimOutcome::Claimed { + input_watermark, .. + } => { + assert!( + input_watermark > 500, + "backfilled enqueue should advance dirty watermark beyond last success" + ); + } + other => panic!("unexpected backfilled phase2 claim outcome: {other:?}"), + } + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } + + #[tokio::test] + async fn phase2_failure_fallback_updates_unowned_running_job() { + let codex_home = unique_temp_dir(); + let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None) + .await + .expect("initialize runtime"); + + runtime + .enqueue_global_consolidation(400) + .await + .expect("enqueue global consolidation"); + + let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner"); + let claim = runtime + .try_claim_global_phase2_job(owner, 3_600) + .await + .expect("claim global consolidation"); + let ownership_token = match claim { + Phase2JobClaimOutcome::Claimed { + ownership_token, .. + } => ownership_token, + other => panic!("unexpected claim outcome: {other:?}"), + }; + + sqlx::query("UPDATE jobs SET ownership_token = NULL WHERE kind = ? AND job_key = ?") + .bind("memory_consolidate_global") + .bind("global") + .execute(runtime.pool.as_ref()) + .await + .expect("clear ownership token"); + + assert_eq!( + runtime + .mark_global_phase2_job_failed(ownership_token.as_str(), "lost", 3_600) + .await + .expect("mark phase2 failed with strict ownership"), + false, + "strict failure update should not match unowned running job" + ); + assert!( + runtime + .mark_global_phase2_job_failed_if_unowned(ownership_token.as_str(), "lost", 3_600) + .await + .expect("fallback failure update should match unowned running job"), + "fallback failure update should transition the unowned running job" + ); + + let claim = runtime + .try_claim_global_phase2_job(ThreadId::new(), 3_600) + .await + .expect("claim after fallback failure"); + assert_eq!(claim, Phase2JobClaimOutcome::SkippedNotDirty); + + let _ = tokio::fs::remove_dir_all(codex_home).await; + } +} diff --git a/codex-rs/state/src/runtime/test_support.rs b/codex-rs/state/src/runtime/test_support.rs new file mode 100644 index 000000000..d749fe2bf --- /dev/null +++ b/codex-rs/state/src/runtime/test_support.rs @@ -0,0 +1,64 @@ +#[cfg(test)] +use chrono::DateTime; +#[cfg(test)] +use chrono::Utc; +#[cfg(test)] +use codex_protocol::ThreadId; +#[cfg(test)] +use codex_protocol::protocol::AskForApproval; +#[cfg(test)] +use codex_protocol::protocol::SandboxPolicy; +#[cfg(test)] +use std::path::Path; +#[cfg(test)] +use std::path::PathBuf; +#[cfg(test)] +use std::time::SystemTime; +#[cfg(test)] +use std::time::UNIX_EPOCH; +#[cfg(test)] +use uuid::Uuid; + +#[cfg(test)] +use crate::ThreadMetadata; + +#[cfg(test)] +pub(super) fn unique_temp_dir() -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |duration| duration.as_nanos()); + std::env::temp_dir().join(format!( + "codex-state-runtime-test-{nanos}-{}", + Uuid::new_v4() + )) +} + +#[cfg(test)] +pub(super) fn test_thread_metadata( + codex_home: &Path, + thread_id: ThreadId, + cwd: PathBuf, +) -> ThreadMetadata { + let now = DateTime::::from_timestamp(1_700_000_000, 0).expect("timestamp"); + ThreadMetadata { + id: thread_id, + rollout_path: codex_home.join(format!("rollout-{thread_id}.jsonl")), + created_at: now, + updated_at: now, + source: "cli".to_string(), + agent_nickname: None, + agent_role: None, + model_provider: "test-provider".to_string(), + cwd, + cli_version: "0.0.0".to_string(), + title: String::new(), + sandbox_policy: crate::extract::enum_to_string(&SandboxPolicy::new_read_only_policy()), + approval_mode: crate::extract::enum_to_string(&AskForApproval::OnRequest), + tokens_used: 0, + first_user_message: Some("hello".to_string()), + archived_at: None, + git_sha: None, + git_branch: None, + git_origin_url: None, + } +} diff --git a/codex-rs/state/src/runtime/threads.rs b/codex-rs/state/src/runtime/threads.rs new file mode 100644 index 000000000..0eb64d6be --- /dev/null +++ b/codex-rs/state/src/runtime/threads.rs @@ -0,0 +1,496 @@ +use super::*; + +impl StateRuntime { + pub async fn get_thread(&self, id: ThreadId) -> anyhow::Result> { + let row = sqlx::query( + r#" +SELECT + id, + rollout_path, + created_at, + updated_at, + source, + agent_nickname, + agent_role, + model_provider, + cwd, + cli_version, + title, + sandbox_policy, + approval_mode, + tokens_used, + first_user_message, + archived_at, + git_sha, + git_branch, + git_origin_url +FROM threads +WHERE id = ? + "#, + ) + .bind(id.to_string()) + .fetch_optional(self.pool.as_ref()) + .await?; + row.map(|row| ThreadRow::try_from_row(&row).and_then(ThreadMetadata::try_from)) + .transpose() + } + + /// Get dynamic tools for a thread, if present. + pub async fn get_dynamic_tools( + &self, + thread_id: ThreadId, + ) -> anyhow::Result>> { + let rows = sqlx::query( + r#" +SELECT name, description, input_schema +FROM thread_dynamic_tools +WHERE thread_id = ? +ORDER BY position ASC + "#, + ) + .bind(thread_id.to_string()) + .fetch_all(self.pool.as_ref()) + .await?; + if rows.is_empty() { + return Ok(None); + } + let mut tools = Vec::with_capacity(rows.len()); + for row in rows { + let input_schema: String = row.try_get("input_schema")?; + let input_schema = serde_json::from_str::(input_schema.as_str())?; + tools.push(DynamicToolSpec { + name: row.try_get("name")?, + description: row.try_get("description")?, + input_schema, + }); + } + Ok(Some(tools)) + } + + /// Find a rollout path by thread id using the underlying database. + pub async fn find_rollout_path_by_id( + &self, + id: ThreadId, + archived_only: Option, + ) -> anyhow::Result> { + let mut builder = + QueryBuilder::::new("SELECT rollout_path FROM threads WHERE id = "); + builder.push_bind(id.to_string()); + match archived_only { + Some(true) => { + builder.push(" AND archived = 1"); + } + Some(false) => { + builder.push(" AND archived = 0"); + } + None => {} + } + let row = builder.build().fetch_optional(self.pool.as_ref()).await?; + Ok(row + .and_then(|r| r.try_get::("rollout_path").ok()) + .map(PathBuf::from)) + } + + /// List threads using the underlying database. + #[allow(clippy::too_many_arguments)] + pub async fn list_threads( + &self, + page_size: usize, + anchor: Option<&crate::Anchor>, + sort_key: crate::SortKey, + allowed_sources: &[String], + model_providers: Option<&[String]>, + archived_only: bool, + search_term: Option<&str>, + ) -> anyhow::Result { + let limit = page_size.saturating_add(1); + + let mut builder = QueryBuilder::::new( + r#" +SELECT + id, + rollout_path, + created_at, + updated_at, + source, + agent_nickname, + agent_role, + model_provider, + cwd, + cli_version, + title, + sandbox_policy, + approval_mode, + tokens_used, + first_user_message, + archived_at, + git_sha, + git_branch, + git_origin_url +FROM threads + "#, + ); + push_thread_filters( + &mut builder, + archived_only, + allowed_sources, + model_providers, + anchor, + sort_key, + search_term, + ); + push_thread_order_and_limit(&mut builder, sort_key, limit); + + let rows = builder.build().fetch_all(self.pool.as_ref()).await?; + let mut items = rows + .into_iter() + .map(|row| ThreadRow::try_from_row(&row).and_then(ThreadMetadata::try_from)) + .collect::, _>>()?; + let num_scanned_rows = items.len(); + let next_anchor = if items.len() > page_size { + items.pop(); + items + .last() + .and_then(|item| anchor_from_item(item, sort_key)) + } else { + None + }; + Ok(ThreadsPage { + items, + next_anchor, + num_scanned_rows, + }) + } + + /// List thread ids using the underlying database (no rollout scanning). + pub async fn list_thread_ids( + &self, + limit: usize, + anchor: Option<&crate::Anchor>, + sort_key: crate::SortKey, + allowed_sources: &[String], + model_providers: Option<&[String]>, + archived_only: bool, + ) -> anyhow::Result> { + let mut builder = QueryBuilder::::new("SELECT id FROM threads"); + push_thread_filters( + &mut builder, + archived_only, + allowed_sources, + model_providers, + anchor, + sort_key, + None, + ); + push_thread_order_and_limit(&mut builder, sort_key, limit); + + let rows = builder.build().fetch_all(self.pool.as_ref()).await?; + rows.into_iter() + .map(|row| { + let id: String = row.try_get("id")?; + Ok(ThreadId::try_from(id)?) + }) + .collect() + } + + /// Insert or replace thread metadata directly. + pub async fn upsert_thread(&self, metadata: &crate::ThreadMetadata) -> anyhow::Result<()> { + sqlx::query( + r#" +INSERT INTO threads ( + id, + rollout_path, + created_at, + updated_at, + source, + agent_nickname, + agent_role, + model_provider, + cwd, + cli_version, + title, + sandbox_policy, + approval_mode, + tokens_used, + first_user_message, + archived, + archived_at, + git_sha, + git_branch, + git_origin_url +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT(id) DO UPDATE SET + rollout_path = excluded.rollout_path, + created_at = excluded.created_at, + updated_at = excluded.updated_at, + source = excluded.source, + agent_nickname = excluded.agent_nickname, + agent_role = excluded.agent_role, + model_provider = excluded.model_provider, + cwd = excluded.cwd, + cli_version = excluded.cli_version, + title = excluded.title, + sandbox_policy = excluded.sandbox_policy, + approval_mode = excluded.approval_mode, + tokens_used = excluded.tokens_used, + first_user_message = excluded.first_user_message, + archived = excluded.archived, + archived_at = excluded.archived_at, + git_sha = excluded.git_sha, + git_branch = excluded.git_branch, + git_origin_url = excluded.git_origin_url + "#, + ) + .bind(metadata.id.to_string()) + .bind(metadata.rollout_path.display().to_string()) + .bind(datetime_to_epoch_seconds(metadata.created_at)) + .bind(datetime_to_epoch_seconds(metadata.updated_at)) + .bind(metadata.source.as_str()) + .bind(metadata.agent_nickname.as_deref()) + .bind(metadata.agent_role.as_deref()) + .bind(metadata.model_provider.as_str()) + .bind(metadata.cwd.display().to_string()) + .bind(metadata.cli_version.as_str()) + .bind(metadata.title.as_str()) + .bind(metadata.sandbox_policy.as_str()) + .bind(metadata.approval_mode.as_str()) + .bind(metadata.tokens_used) + .bind(metadata.first_user_message.as_deref().unwrap_or_default()) + .bind(metadata.archived_at.is_some()) + .bind(metadata.archived_at.map(datetime_to_epoch_seconds)) + .bind(metadata.git_sha.as_deref()) + .bind(metadata.git_branch.as_deref()) + .bind(metadata.git_origin_url.as_deref()) + .execute(self.pool.as_ref()) + .await?; + Ok(()) + } + + /// Persist dynamic tools for a thread if none have been stored yet. + /// + /// Dynamic tools are defined at thread start and should not change afterward. + /// This only writes the first time we see tools for a given thread. + pub async fn persist_dynamic_tools( + &self, + thread_id: ThreadId, + tools: Option<&[DynamicToolSpec]>, + ) -> anyhow::Result<()> { + let Some(tools) = tools else { + return Ok(()); + }; + if tools.is_empty() { + return Ok(()); + } + let thread_id = thread_id.to_string(); + let mut tx = self.pool.begin().await?; + for (idx, tool) in tools.iter().enumerate() { + let position = i64::try_from(idx).unwrap_or(i64::MAX); + let input_schema = serde_json::to_string(&tool.input_schema)?; + sqlx::query( + r#" +INSERT INTO thread_dynamic_tools ( + thread_id, + position, + name, + description, + input_schema +) VALUES (?, ?, ?, ?, ?) +ON CONFLICT(thread_id, position) DO NOTHING + "#, + ) + .bind(thread_id.as_str()) + .bind(position) + .bind(tool.name.as_str()) + .bind(tool.description.as_str()) + .bind(input_schema) + .execute(&mut *tx) + .await?; + } + tx.commit().await?; + Ok(()) + } + + /// Apply rollout items incrementally using the underlying database. + pub async fn apply_rollout_items( + &self, + builder: &ThreadMetadataBuilder, + items: &[RolloutItem], + otel: Option<&OtelManager>, + ) -> anyhow::Result<()> { + if items.is_empty() { + return Ok(()); + } + let mut metadata = self + .get_thread(builder.id) + .await? + .unwrap_or_else(|| builder.build(&self.default_provider)); + metadata.rollout_path = builder.rollout_path.clone(); + for item in items { + apply_rollout_item(&mut metadata, item, &self.default_provider); + } + if let Some(updated_at) = file_modified_time_utc(builder.rollout_path.as_path()).await { + metadata.updated_at = updated_at; + } + // Keep the thread upsert before dynamic tools to satisfy the foreign key constraint: + // thread_dynamic_tools.thread_id -> threads.id. + if let Err(err) = self.upsert_thread(&metadata).await { + if let Some(otel) = otel { + otel.counter(DB_ERROR_METRIC, 1, &[("stage", "apply_rollout_items")]); + } + return Err(err); + } + let dynamic_tools = extract_dynamic_tools(items); + if let Some(dynamic_tools) = dynamic_tools + && let Err(err) = self + .persist_dynamic_tools(builder.id, dynamic_tools.as_deref()) + .await + { + if let Some(otel) = otel { + otel.counter(DB_ERROR_METRIC, 1, &[("stage", "persist_dynamic_tools")]); + } + return Err(err); + } + Ok(()) + } + + /// Mark a thread as archived using the underlying database. + pub async fn mark_archived( + &self, + thread_id: ThreadId, + rollout_path: &Path, + archived_at: DateTime, + ) -> anyhow::Result<()> { + let Some(mut metadata) = self.get_thread(thread_id).await? else { + return Ok(()); + }; + metadata.archived_at = Some(archived_at); + metadata.rollout_path = rollout_path.to_path_buf(); + if let Some(updated_at) = file_modified_time_utc(rollout_path).await { + metadata.updated_at = updated_at; + } + if metadata.id != thread_id { + warn!( + "thread id mismatch during archive: expected {thread_id}, got {}", + metadata.id + ); + } + self.upsert_thread(&metadata).await + } + + /// Mark a thread as unarchived using the underlying database. + pub async fn mark_unarchived( + &self, + thread_id: ThreadId, + rollout_path: &Path, + ) -> anyhow::Result<()> { + let Some(mut metadata) = self.get_thread(thread_id).await? else { + return Ok(()); + }; + metadata.archived_at = None; + metadata.rollout_path = rollout_path.to_path_buf(); + if let Some(updated_at) = file_modified_time_utc(rollout_path).await { + metadata.updated_at = updated_at; + } + if metadata.id != thread_id { + warn!( + "thread id mismatch during unarchive: expected {thread_id}, got {}", + metadata.id + ); + } + self.upsert_thread(&metadata).await + } + + /// Delete a thread metadata row by id. + pub async fn delete_thread(&self, thread_id: ThreadId) -> anyhow::Result { + let result = sqlx::query("DELETE FROM threads WHERE id = ?") + .bind(thread_id.to_string()) + .execute(self.pool.as_ref()) + .await?; + Ok(result.rows_affected()) + } +} + +pub(super) fn extract_dynamic_tools(items: &[RolloutItem]) -> Option>> { + items.iter().find_map(|item| match item { + RolloutItem::SessionMeta(meta_line) => Some(meta_line.meta.dynamic_tools.clone()), + RolloutItem::ResponseItem(_) + | RolloutItem::Compacted(_) + | RolloutItem::TurnContext(_) + | RolloutItem::EventMsg(_) => None, + }) +} + +pub(super) fn push_thread_filters<'a>( + builder: &mut QueryBuilder<'a, Sqlite>, + archived_only: bool, + allowed_sources: &'a [String], + model_providers: Option<&'a [String]>, + anchor: Option<&crate::Anchor>, + sort_key: SortKey, + search_term: Option<&'a str>, +) { + builder.push(" WHERE 1 = 1"); + if archived_only { + builder.push(" AND archived = 1"); + } else { + builder.push(" AND archived = 0"); + } + builder.push(" AND first_user_message <> ''"); + if !allowed_sources.is_empty() { + builder.push(" AND source IN ("); + let mut separated = builder.separated(", "); + for source in allowed_sources { + separated.push_bind(source); + } + separated.push_unseparated(")"); + } + if let Some(model_providers) = model_providers + && !model_providers.is_empty() + { + builder.push(" AND model_provider IN ("); + let mut separated = builder.separated(", "); + for provider in model_providers { + separated.push_bind(provider); + } + separated.push_unseparated(")"); + } + if let Some(search_term) = search_term { + builder.push(" AND instr(title, "); + builder.push_bind(search_term); + builder.push(") > 0"); + } + if let Some(anchor) = anchor { + let anchor_ts = datetime_to_epoch_seconds(anchor.ts); + let column = match sort_key { + SortKey::CreatedAt => "created_at", + SortKey::UpdatedAt => "updated_at", + }; + builder.push(" AND ("); + builder.push(column); + builder.push(" < "); + builder.push_bind(anchor_ts); + builder.push(" OR ("); + builder.push(column); + builder.push(" = "); + builder.push_bind(anchor_ts); + builder.push(" AND id < "); + builder.push_bind(anchor.id.to_string()); + builder.push("))"); + } +} + +pub(super) fn push_thread_order_and_limit( + builder: &mut QueryBuilder<'_, Sqlite>, + sort_key: SortKey, + limit: usize, +) { + let order_column = match sort_key { + SortKey::CreatedAt => "created_at", + SortKey::UpdatedAt => "updated_at", + }; + builder.push(" ORDER BY "); + builder.push(order_column); + builder.push(" DESC, id DESC"); + builder.push(" LIMIT "); + builder.push_bind(limit as i64); +}