Files
codex/codex-rs/rollout/src/metadata.rs
Ruslan Nigmatullin 4d201e340e state: pass state db handles through consumers (#20561)
## Why

SQLite state was still being opened from consumer paths, including lazy
`OnceCell`-backed thread-store call sites. That let one process
construct multiple state DB connections for the same Codex home, which
makes SQLite lock contention and `database is locked` failures much
easier to hit.

State DB lifetime should be chosen by main-like entrypoints and tests,
then passed through explicitly. Consumers should use the supplied
`Option<StateDbHandle>` or `StateDbHandle` and keep their existing
filesystem fallback or error behavior when no handle is available.

The startup path also needs to keep the rollout crate in charge of
SQLite state initialization. Opening `codex_state::StateRuntime`
directly bypasses rollout metadata backfill, so entrypoints should
initialize through `codex_rollout::state_db` and receive a handle only
after required rollout backfills have completed.

## What Changed

- Initialize the state DB in main-like entrypoints for CLI, TUI,
app-server, exec, MCP server, and the thread-manager sample.
- Pass `Option<StateDbHandle>` through `ThreadManager`,
`LocalThreadStore`, app-server processors, TUI app wiring, rollout
listing/recording, personality migration, shell snapshot cleanup,
session-name lookup, and memory/device-key consumers.
- Remove the lazy local state DB wrapper from the thread store so
non-test consumers use only the supplied handle or their existing
fallback path.
- Make `codex_rollout::state_db::init` the local state startup path: it
opens/migrates SQLite, runs rollout metadata backfill when needed, waits
for concurrent backfill workers up to a bounded timeout, verifies
completion, and then returns the initialized handle.
- Keep optional/non-owning SQLite helpers, such as remote TUI local
reads, as open-only paths that do not run startup backfill.
- Switch app-server startup from direct
`codex_state::StateRuntime::init` to the rollout state initializer so
app-server cannot skip rollout backfill.
- Collapse split rollout lookup/list APIs so callers use the normal
methods with an optional state handle instead of `_with_state_db`
variants.
- Restore `getConversationSummary(ThreadId)` to delegate through
`ThreadStore::read_thread` instead of a LocalThreadStore-specific
rollout path special case.
- Keep DB-backed rollout path lookup keyed on the DB row and file
existence, without imposing the filesystem filename convention on
existing DB rows.
- Verify readable DB-backed rollout paths against `session_meta.id`
before returning them, so a stale SQLite row that points at another
thread's JSONL falls back to filesystem search and read-repairs the DB
row.
- Keep `debug prompt-input` filesystem-only so a one-off debug command
does not initialize or backfill SQLite state just to print prompt input.
- Keep goal-session test Codex homes alive only in the goal-specific
helper, rather than leaking tempdirs from the shared session test
helper.
- Update tests and call sites to pass explicit state handles where DB
behavior is expected and explicit `None` where filesystem-only behavior
is intended.

## Validation

- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo check -p
codex-rollout -p codex-thread-store -p codex-app-server -p codex-core -p
codex-tui -p codex-exec -p codex-cli --tests`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-rollout state_db_`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-rollout find_thread_path`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-rollout find_thread_path -- --nocapture`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-rollout try_init_ -- --nocapture`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-rollout`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo clippy -p
codex-rollout --lib -- -D warnings`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-thread-store
read_thread_falls_back_when_sqlite_path_points_to_another_thread --
--nocapture`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-thread-store`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p codex-core
shell_snapshot`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p codex-core
--test all personality_migration`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p codex-core
--test all rollout_list_find`
- `RUST_MIN_STACK=8388608 CODEX_SKIP_VENDORED_BWRAP=1
CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p codex-core
--test all rollout_list_find::find_prefers_sqlite_path_by_id --
--nocapture`
- `RUST_MIN_STACK=8388608 CODEX_SKIP_VENDORED_BWRAP=1
CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p codex-core
--test all rollout_list_find -- --nocapture`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p codex-core
interrupt_accounts_active_goal_before_pausing`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-app-server get_auth_status -- --test-threads=1`
- `CODEX_SKIP_VENDORED_BWRAP=1
CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo test -p
codex-app-server --lib`
- `CODEX_SKIP_VENDORED_BWRAP=1
CARGO_TARGET_DIR=/tmp/codex-target-state-db cargo check -p codex-rollout
-p codex-app-server --tests`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db just fix -p codex-rollout
-p codex-thread-store -p codex-core -p codex-app-server -p codex-tui -p
codex-exec -p codex-cli`
- `CODEX_SKIP_VENDORED_BWRAP=1
CARGO_TARGET_DIR=/tmp/codex-target-state-db just fix -p codex-rollout -p
codex-app-server`
- `CARGO_TARGET_DIR=/tmp/codex-target-state-db just fix -p
codex-rollout`
- `CODEX_SKIP_VENDORED_BWRAP=1
CARGO_TARGET_DIR=/tmp/codex-target-state-db just fix -p codex-core`
- `just argument-comment-lint -p codex-core`
- `just argument-comment-lint -p codex-rollout`

Focused coverage added in `codex-rollout`:

- `recorder::tests::state_db_init_backfills_before_returning` verifies
the rollout metadata row exists before startup init returns.
- `state_db::tests::try_init_waits_for_concurrent_startup_backfill`
verifies startup waits for another worker to finish backfill instead of
disabling the handle for the process.
- `state_db::tests::try_init_times_out_waiting_for_stuck_startup_backfill`
verifies startup does not hang indefinitely on a stuck backfill lease.
- `tests::find_thread_path_accepts_existing_state_db_path_without_canonical_filename`
verifies DB-backed lookup accepts valid existing rollout paths even when
the filename does not include the thread UUID.
- `tests::find_thread_path_falls_back_when_db_path_points_to_another_thread`
verifies DB-backed lookup ignores a stale row whose existing path
belongs to another thread and read-repairs the row after filesystem
fallback.

Focused coverage updated in `codex-core`:

- `rollout_list_find::find_prefers_sqlite_path_by_id` now uses a
DB-preferred rollout file with matching `session_meta.id`, so it still
verifies that valid SQLite paths win without depending on stale/empty
rollout contents.

`cargo test -p codex-app-server thread_list_respects_search_term_filter
-- --test-threads=1 --nocapture` was attempted locally but timed out
waiting for the app-server test harness `initialize` response before
reaching the changed thread-list code path.

`bazel test //codex-rs/thread-store:thread-store-unit-tests
--test_output=errors` was attempted locally after the thread-store fix,
but this container failed before target analysis while fetching `v8+`
through BuildBuddy/direct GitHub. The equivalent local crate coverage,
including `cargo test -p codex-thread-store`, passes.

A plain local `cargo check -p codex-rollout -p codex-app-server --tests`
also requires system `libcap.pc` for `codex-linux-sandbox`; the
follow-up app-server check above used `CODEX_SKIP_VENDORED_BWRAP=1` in
this container.
2026-05-04 11:46:03 -07:00

459 lines
16 KiB
Rust

use crate::ARCHIVED_SESSIONS_SUBDIR;
use crate::SESSIONS_SUBDIR;
use crate::list;
use crate::list::parse_timestamp_uuid_from_filename;
use crate::recorder::RolloutRecorder;
use crate::state_db::normalize_cwd_for_state_db;
use chrono::DateTime;
use chrono::NaiveDateTime;
use chrono::Timelike;
use chrono::Utc;
use codex_protocol::ThreadId;
use codex_protocol::protocol::AskForApproval;
use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::SandboxPolicy;
use codex_protocol::protocol::SessionMetaLine;
use codex_protocol::protocol::SessionSource;
use codex_state::BackfillState;
use codex_state::BackfillStats;
use codex_state::BackfillStatus;
use codex_state::DB_ERROR_METRIC;
use codex_state::DB_METRIC_BACKFILL;
use codex_state::DB_METRIC_BACKFILL_DURATION_MS;
use codex_state::ExtractionOutcome;
use codex_state::ThreadMetadataBuilder;
use codex_state::apply_rollout_item;
use std::path::Path;
use std::path::PathBuf;
use tracing::info;
use tracing::warn;
/// Filename prefix a file must carry to be treated as a rollout file in this module.
const ROLLOUT_PREFIX: &str = "rollout-";
/// Filename suffix a file must carry to be treated as a rollout file in this module.
const ROLLOUT_SUFFIX: &str = ".jsonl";
/// Number of rollout files processed between two backfill checkpoints.
const BACKFILL_BATCH_SIZE: usize = 200;
/// Lease value (seconds) that `backfill_sessions` forwards when claiming the
/// backfill worker slot. Non-test builds use 900 seconds.
#[cfg(not(test))]
const BACKFILL_LEASE_SECONDS: i64 = 900;
/// Test builds use a 1-second lease instead of the production value.
#[cfg(test)]
const BACKFILL_LEASE_SECONDS: i64 = 1;
/// Seeds a [`ThreadMetadataBuilder`] from a rollout's session-meta line.
///
/// The thread id, source, provider, agent fields, cwd and CLI version are
/// copied from `session_meta`; `rollout_path` is stored as the rollout
/// location. Sandbox policy and approval mode are initialised to the read-only
/// policy and `AskForApproval::OnRequest` respectively. Git details are copied
/// only when the session-meta line carries them.
///
/// Returns `None` when the session timestamp cannot be parsed by
/// `parse_timestamp_to_utc`.
pub(crate) fn builder_from_session_meta(
    session_meta: &SessionMetaLine,
    rollout_path: &Path,
) -> Option<ThreadMetadataBuilder> {
    let meta = &session_meta.meta;
    let created = parse_timestamp_to_utc(meta.timestamp.as_str())?;

    let mut seeded = ThreadMetadataBuilder::new(
        meta.id,
        rollout_path.to_path_buf(),
        created,
        meta.source.clone(),
    );

    // Fields taken verbatim from the session-meta payload.
    seeded.model_provider = meta.model_provider.clone();
    seeded.agent_nickname = meta.agent_nickname.clone();
    seeded.agent_role = meta.agent_role.clone();
    seeded.agent_path = meta.agent_path.clone();
    seeded.cwd = meta.cwd.clone();
    seeded.cli_version = Some(meta.cli_version.clone());

    // Fixed starting values; they are not read from the session-meta line.
    seeded.sandbox_policy = SandboxPolicy::new_read_only_policy();
    seeded.approval_mode = AskForApproval::OnRequest;

    if let Some(git_info) = &session_meta.git {
        seeded.git_sha = git_info.commit_hash.as_ref().map(|hash| hash.0.clone());
        seeded.git_branch = git_info.branch.clone();
        seeded.git_origin_url = git_info.repository_url.clone();
    }

    Some(seeded)
}
/// Derives a [`ThreadMetadataBuilder`] for a rollout from its parsed items,
/// falling back to the rollout filename.
///
/// The first `RolloutItem::SessionMeta` in `items` is tried first via
/// `builder_from_session_meta`. If there is none, or it yields no builder, the
/// filename of `rollout_path` is used instead: it must start with
/// `ROLLOUT_PREFIX`, end with `ROLLOUT_SUFFIX`, and be parseable by
/// `parse_timestamp_uuid_from_filename`. The fallback builder uses
/// `SessionSource::default()`.
///
/// Returns `None` when neither source produces a builder.
pub fn builder_from_items(
    items: &[RolloutItem],
    rollout_path: &Path,
) -> Option<ThreadMetadataBuilder> {
    let first_session_meta = items.iter().find_map(|item| match item {
        RolloutItem::SessionMeta(meta_line) => Some(meta_line),
        RolloutItem::ResponseItem(_)
        | RolloutItem::Compacted(_)
        | RolloutItem::TurnContext(_)
        | RolloutItem::EventMsg(_) => None,
    });
    let from_session_meta = first_session_meta
        .and_then(|meta_line| builder_from_session_meta(meta_line, rollout_path));
    if from_session_meta.is_some() {
        return from_session_meta;
    }

    // Fallback: recover the creation time and thread id from the filename.
    let file_name = rollout_path.file_name()?.to_str()?;
    let looks_like_rollout =
        file_name.starts_with(ROLLOUT_PREFIX) && file_name.ends_with(ROLLOUT_SUFFIX);
    if !looks_like_rollout {
        return None;
    }

    let (created_ts, uuid) = parse_timestamp_uuid_from_filename(file_name)?;
    let whole_seconds = DateTime::<Utc>::from_timestamp(created_ts.unix_timestamp(), 0)?;
    let created_at = whole_seconds.with_nanosecond(0)?;
    let thread_id = ThreadId::from_string(&uuid.to_string()).ok()?;

    Some(ThreadMetadataBuilder::new(
        thread_id,
        rollout_path.to_path_buf(),
        created_at,
        SessionSource::default(),
    ))
}
pub async fn extract_metadata_from_rollout(
rollout_path: &Path,
default_provider: &str,
) -> anyhow::Result<ExtractionOutcome> {
let (items, _thread_id, parse_errors) =
RolloutRecorder::load_rollout_items(rollout_path).await?;
if items.is_empty() {
return Err(anyhow::anyhow!(
"empty session file: {}",
rollout_path.display()
));
}
let builder = builder_from_items(items.as_slice(), rollout_path).ok_or_else(|| {
anyhow::anyhow!(
"rollout missing metadata builder: {}",
rollout_path.display()
)
})?;
let mut metadata = builder.build(default_provider);
for item in &items {
apply_rollout_item(&mut metadata, item, default_provider);
}
if let Some(updated_at) = file_modified_time_utc(rollout_path).await {
metadata.updated_at = updated_at;
}
Ok(ExtractionOutcome {
metadata,
memory_mode: items.iter().rev().find_map(|item| match item {
RolloutItem::SessionMeta(meta_line) => meta_line.meta.memory_mode.clone(),
RolloutItem::ResponseItem(_)
| RolloutItem::Compacted(_)
| RolloutItem::TurnContext(_)
| RolloutItem::EventMsg(_) => None,
}),
parse_errors,
})
}
/// Runs the rollout metadata backfill for `codex_home` using the module's
/// default lease (`BACKFILL_LEASE_SECONDS`).
///
/// This is a convenience wrapper around `backfill_sessions_with_lease`; it
/// returns nothing, and the callee reports failures through log output.
pub(crate) async fn backfill_sessions(
runtime: &codex_state::StateRuntime,
codex_home: &Path,
default_provider: &str,
) {
backfill_sessions_with_lease(
runtime,
codex_home,
default_provider,
BACKFILL_LEASE_SECONDS,
)
.await;
}
/// Scans the rollout files under `codex_home` and upserts their metadata into
/// the state DB, checkpointing progress as it goes.
///
/// Steps, in order:
/// 1. Return immediately if the stored backfill status is already `Complete`.
/// 2. Try to claim the backfill worker slot, passing `backfill_lease_seconds`
///    to `try_claim_backfill`; return if the claim fails or is not granted.
/// 3. Collect rollout files from the sessions and archived-sessions
///    directories, sort them by watermark, and drop those at or before the
///    stored `last_watermark`.
/// 4. Process the remaining files in batches of `BACKFILL_BATCH_SIZE`,
///    checkpointing after each batch.
/// 5. Mark the backfill complete, then log and emit counters/timer metrics.
///
/// All failures are logged with `warn!`; nothing is returned to the caller.
// NOTE(review): `backfill_lease_seconds` is presumably the lease duration in
// seconds enforced by `codex_state` — confirm against `try_claim_backfill`.
pub(crate) async fn backfill_sessions_with_lease(
runtime: &codex_state::StateRuntime,
codex_home: &Path,
default_provider: &str,
backfill_lease_seconds: i64,
) {
// Metrics are optional: both the client and the timer may be absent.
let metric_client = codex_otel::global();
let timer = metric_client
.as_ref()
.and_then(|otel| otel.start_timer(DB_METRIC_BACKFILL_DURATION_MS, &[]).ok());
// A failed state read is treated as the default state, so the claim below is still attempted.
let backfill_state = match runtime.get_backfill_state().await {
Ok(state) => state,
Err(err) => {
warn!(
"failed to read backfill state at {}: {err}",
codex_home.display()
);
BackfillState::default()
}
};
if backfill_state.status == BackfillStatus::Complete {
return;
}
let claimed = match runtime.try_claim_backfill(backfill_lease_seconds).await {
Ok(claimed) => claimed,
Err(err) => {
warn!(
"failed to claim backfill worker at {}: {err}",
codex_home.display()
);
return;
}
};
if !claimed {
info!(
"state db backfill already running at {}; skipping duplicate worker",
codex_home.display()
);
return;
}
// Re-read the state after the claim; if that fails, proceed as `Running` with default fields.
let mut backfill_state = match runtime.get_backfill_state().await {
Ok(state) => state,
Err(err) => {
warn!(
"failed to read claimed backfill state at {}: {err}",
codex_home.display()
);
BackfillState {
status: BackfillStatus::Running,
..Default::default()
}
}
};
if backfill_state.status != BackfillStatus::Running {
if let Err(err) = runtime.mark_backfill_running().await {
warn!(
"failed to mark backfill running at {}: {err}",
codex_home.display()
);
} else {
backfill_state.status = BackfillStatus::Running;
}
}
// Gather candidate files from both roots, remembering which root each came from.
let sessions_root = codex_home.join(SESSIONS_SUBDIR);
let archived_root = codex_home.join(ARCHIVED_SESSIONS_SUBDIR);
let mut rollout_paths: Vec<BackfillRolloutPath> = Vec::new();
for (root, archived) in [(sessions_root, false), (archived_root, true)] {
// A missing root (or a failed existence check) is skipped silently.
if !tokio::fs::try_exists(&root).await.unwrap_or(false) {
continue;
}
match collect_rollout_paths(&root).await {
Ok(paths) => {
rollout_paths.extend(paths.into_iter().map(|path| BackfillRolloutPath {
watermark: backfill_watermark_for_path(codex_home, &path),
path,
archived,
}));
}
Err(err) => {
warn!(
"failed to collect rollout paths under {}: {err}",
root.display()
);
}
}
}
// Process in watermark order and skip everything at or before the stored watermark.
rollout_paths.sort_by(|a, b| a.watermark.cmp(&b.watermark));
if let Some(last_watermark) = backfill_state.last_watermark.as_deref() {
rollout_paths.retain(|entry| entry.watermark.as_str() > last_watermark);
}
let mut stats = BackfillStats {
scanned: 0,
upserted: 0,
failed: 0,
};
let mut last_watermark = backfill_state.last_watermark.clone();
for batch in rollout_paths.chunks(BACKFILL_BATCH_SIZE) {
for rollout in batch {
stats.scanned = stats.scanned.saturating_add(1);
match extract_metadata_from_rollout(&rollout.path, default_provider).await {
Ok(outcome) => {
// Parse errors are reported as a metric but do not fail the rollout.
if outcome.parse_errors > 0
&& let Some(ref metric_client) = metric_client
{
let _ = metric_client.counter(
DB_ERROR_METRIC,
outcome.parse_errors as i64,
&[("stage", "backfill_sessions")],
);
}
let mut metadata = outcome.metadata;
metadata.cwd = normalize_cwd_for_state_db(&metadata.cwd);
// Rollouts without a recorded memory mode are stored as "enabled".
let memory_mode = outcome.memory_mode.unwrap_or_else(|| "enabled".to_string());
if let Ok(Some(existing_metadata)) = runtime.get_thread(metadata.id).await {
metadata.prefer_existing_git_info(&existing_metadata);
}
// Archived files with no archive time get the file mtime, else `updated_at`.
if rollout.archived && metadata.archived_at.is_none() {
let fallback_archived_at = metadata.updated_at;
metadata.archived_at = file_modified_time_utc(&rollout.path)
.await
.or(Some(fallback_archived_at));
}
if let Err(err) = runtime.upsert_thread(&metadata).await {
stats.failed = stats.failed.saturating_add(1);
warn!("failed to upsert rollout {}: {err}", rollout.path.display());
} else {
if let Err(err) = runtime
.set_thread_memory_mode(metadata.id, memory_mode.as_str())
.await
{
stats.failed = stats.failed.saturating_add(1);
warn!(
"failed to restore memory mode for {}: {err}",
rollout.path.display()
);
// Counted as failed; skip the upserted count and dynamic-tools step.
continue;
}
stats.upserted = stats.upserted.saturating_add(1);
// Dynamic tools are best-effort: failures here only warn and do not affect stats.
if let Ok(meta_line) = list::read_session_meta_line(&rollout.path).await {
if let Err(err) = runtime
.persist_dynamic_tools(
meta_line.meta.id,
meta_line.meta.dynamic_tools.as_deref(),
)
.await
{
warn!(
"failed to backfill dynamic tools {}: {err}",
rollout.path.display()
);
}
} else {
warn!(
"failed to read session meta for dynamic tools {}",
rollout.path.display()
);
}
}
}
Err(err) => {
stats.failed = stats.failed.saturating_add(1);
warn!(
"failed to extract rollout {}: {err}",
rollout.path.display()
);
}
}
}
// Checkpoint at the last entry of the batch; the local watermark advances only on success.
if let Some(last_entry) = batch.last() {
if let Err(err) = runtime
.checkpoint_backfill(last_entry.watermark.as_str())
.await
{
warn!(
"failed to checkpoint backfill at {}: {err}",
codex_home.display()
);
} else {
last_watermark = Some(last_entry.watermark.clone());
}
}
}
// Completion is recorded even when individual rollouts failed.
if let Err(err) = runtime
.mark_backfill_complete(last_watermark.as_deref())
.await
{
warn!(
"failed to mark backfill complete at {}: {err}",
codex_home.display()
);
}
info!(
"state db backfill scanned={}, upserted={}, failed={}",
stats.scanned, stats.upserted, stats.failed
);
if let Some(metric_client) = metric_client {
let _ = metric_client.counter(
DB_METRIC_BACKFILL,
stats.upserted as i64,
&[("status", "upserted")],
);
let _ = metric_client.counter(
DB_METRIC_BACKFILL,
stats.failed as i64,
&[("status", "failed")],
);
}
// Timer status: no failures -> success; failures with no upserts -> failed; otherwise partial.
if let Some(timer) = timer.as_ref() {
let status = if stats.failed == 0 {
"success"
} else if stats.upserted == 0 {
"failed"
} else {
"partial_failure"
};
let _ = timer.record(&[("status", status)]);
}
}
/// A rollout file discovered during backfill, paired with its ordering key.
#[derive(Debug, Clone)]
struct BackfillRolloutPath {
/// Sort and checkpoint key produced by `backfill_watermark_for_path`.
watermark: String,
/// Location of the rollout file as returned by `collect_rollout_paths`.
path: PathBuf,
/// `true` when the file was found under the archived-sessions directory.
archived: bool,
}
/// Produces the backfill watermark string for a rollout path.
///
/// The watermark is `path` relative to `codex_home` (or `path` unchanged when
/// it does not live under `codex_home`), converted lossily to a string with
/// every backslash replaced by a forward slash.
fn backfill_watermark_for_path(codex_home: &Path, path: &Path) -> String {
    let relative = match path.strip_prefix(codex_home) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let text = relative.to_string_lossy();
    text.replace('\\', "/")
}
async fn file_modified_time_utc(path: &Path) -> Option<DateTime<Utc>> {
let modified = tokio::fs::metadata(path).await.ok()?.modified().ok()?;
let updated_at: DateTime<Utc> = modified.into();
Some(updated_at)
}
/// Parses a session timestamp string into a UTC `DateTime`.
///
/// Two formats are accepted, tried in this order:
/// 1. the filename-style format `%Y-%m-%dT%H-%M-%S`, interpreted as UTC and
///    with the nanosecond field set to zero;
/// 2. RFC 3339, converted to UTC.
///
/// Returns `None` when neither format matches.
fn parse_timestamp_to_utc(ts: &str) -> Option<DateTime<Utc>> {
    const FILENAME_TS_FORMAT: &str = "%Y-%m-%dT%H-%M-%S";
    match NaiveDateTime::parse_from_str(ts, FILENAME_TS_FORMAT) {
        Ok(naive) => DateTime::<Utc>::from_naive_utc_and_offset(naive, Utc).with_nanosecond(0),
        Err(_) => DateTime::parse_from_rfc3339(ts)
            .ok()
            .map(|parsed| parsed.with_timezone(&Utc)),
    }
}
/// Collects every rollout file found anywhere under `root`.
///
/// The directory tree is walked iteratively with an explicit stack. A path is
/// returned when it is a regular file whose name starts with `ROLLOUT_PREFIX`
/// and ends with `ROLLOUT_SUFFIX`. No ordering is applied to the result.
///
/// The walk is best-effort: directories that cannot be opened, entries that
/// cannot be read, and entries whose file type cannot be determined are logged
/// with `warn!` and skipped. As a consequence this implementation always
/// returns `Ok`; the `io::Result` return type is kept for callers.
async fn collect_rollout_paths(root: &Path) -> std::io::Result<Vec<PathBuf>> {
    // Cap on back-to-back `next_entry` failures tolerated within one directory.
    // Isolated failures are skipped, but a sticky error would otherwise make
    // the inner loop retry forever, so the directory is abandoned at the cap.
    const MAX_CONSECUTIVE_ENTRY_ERRORS: u32 = 16;

    let mut pending_dirs = vec![root.to_path_buf()];
    let mut rollout_files = Vec::new();

    while let Some(dir) = pending_dirs.pop() {
        let mut read_dir = match tokio::fs::read_dir(&dir).await {
            Ok(read_dir) => read_dir,
            Err(err) => {
                warn!("failed to read directory {}: {err}", dir.display());
                continue;
            }
        };

        let mut consecutive_errors: u32 = 0;
        loop {
            let next_entry = match read_dir.next_entry().await {
                Ok(next_entry) => {
                    consecutive_errors = 0;
                    next_entry
                }
                Err(err) => {
                    warn!(
                        "failed to read directory entry under {}: {err}",
                        dir.display()
                    );
                    consecutive_errors += 1;
                    if consecutive_errors >= MAX_CONSECUTIVE_ENTRY_ERRORS {
                        warn!(
                            "giving up on directory {} after {consecutive_errors} consecutive read errors",
                            dir.display()
                        );
                        break;
                    }
                    continue;
                }
            };
            // `None` means the directory has been fully enumerated.
            let Some(entry) = next_entry else {
                break;
            };

            let path = entry.path();
            let file_type = match entry.file_type().await {
                Ok(file_type) => file_type,
                Err(err) => {
                    warn!("failed to read file type for {}: {err}", path.display());
                    continue;
                }
            };
            if file_type.is_dir() {
                pending_dirs.push(path);
                continue;
            }
            if !file_type.is_file() {
                continue;
            }

            // Names that are not valid UTF-8 cannot match the prefix/suffix check.
            let file_name = entry.file_name();
            let Some(name) = file_name.to_str() else {
                continue;
            };
            if name.starts_with(ROLLOUT_PREFIX) && name.ends_with(ROLLOUT_SUFFIX) {
                rollout_files.push(path);
            }
        }
    }

    Ok(rollout_files)
}
// Unit tests for this module are kept in the sibling `metadata_tests.rs` file
// and are compiled only in test builds.
#[cfg(test)]
#[path = "metadata_tests.rs"]
mod tests;