tui: recover local state db startup failures (#22734)

## Why

#22580 made app-server startup fail when the local SQLite state database
cannot be initialized. Embedded/local TUI startup still continued on the
permissive path, which left the CLI inconsistent and could hide a real
startup problem behind unrelated UI. This brings local TUI startup onto
the same fail-closed behavior while keeping recovery humane for the two
failure modes we are seeing in practice: damaged database files and
startup stalls caused by another process holding the database write
lock.

## What changed

- Embedded TUI startup now uses `state_db::try_init(...)` and returns a
typed `LocalStateDbStartupError` that preserves the affected database
path plus the underlying failure detail.
- CLI startup handles that failure before entering the interactive TUI:
- lock-contention failures tell users to quit other Codex processes and
try again
- failures consistent with a broken local database offer a safe repair
that backs up Codex-owned SQLite files, rebuilds local database files,
and retries startup once
- declined or unsuccessful repairs print concise guidance plus technical
details
- Shared startup error plumbing lives in `tui/src/startup_error.rs`,
while CLI recovery policy and focused recovery tests live in
`cli/src/state_db_recovery.rs`.

## Verification

- `cargo test -p codex-tui
embedded_state_db_failure_is_typed_for_cli_recovery`
- `cargo test -p codex-cli state_db_recovery`
- Manually held an exclusive SQLite lock on `state_5.sqlite` and
confirmed the CLI shows lock-specific guidance without offering repair.
- Manually exercised the repair path with a deliberately invalid
`sqlite_home` and confirmed it backs up the blocking path and resumes
startup.
This commit is contained in:
Eric Traut
2026-05-14 18:51:36 -07:00
committed by GitHub
parent 3c6d727810
commit 3a23e87e20
6 changed files with 310 additions and 16 deletions

View File

@@ -52,6 +52,7 @@ mod doctor;
mod marketplace_cmd;
mod mcp_cmd;
mod plugin_cmd;
mod state_db_recovery;
#[cfg(not(windows))]
mod wsl_paths;
@@ -59,6 +60,7 @@ use crate::mcp_cmd::McpCli;
use crate::plugin_cmd::PluginCli;
use crate::plugin_cmd::PluginSubcommand;
use doctor::DoctorCommand;
use state_db_recovery as local_state_db;
use codex_config::LoaderOverrides;
use codex_core::build_models_manager;
@@ -1980,13 +1982,47 @@ async fn run_interactive_tui(
};
*slot = Some(auth_token);
}
codex_tui::run_main(
interactive,
arg0_paths,
codex_config::LoaderOverrides::default(),
remote_endpoint,
)
.await
let start_tui = || {
codex_tui::run_main(
interactive.clone(),
arg0_paths.clone(),
codex_config::LoaderOverrides::default(),
remote_endpoint.clone(),
)
};
let mut attempted_repair = false;
loop {
let err = match start_tui().await {
Ok(exit_info) => return Ok(exit_info),
Err(err) => err,
};
let Some(startup_error) = local_state_db::startup_error(&err) else {
return Err(err);
};
if local_state_db::is_locked(startup_error.detail()) {
local_state_db::print_locked_guidance(startup_error);
return Ok(AppExitInfo::fatal(startup_error.to_string()));
}
if attempted_repair {
local_state_db::print_diagnostic_guidance(startup_error);
return Ok(AppExitInfo::fatal(startup_error.to_string()));
}
if !local_state_db::confirm_repair(startup_error)? {
local_state_db::print_diagnostic_guidance(startup_error);
return Ok(AppExitInfo::fatal(startup_error.to_string()));
}
match local_state_db::repair_files(startup_error).await {
Ok(backups) => local_state_db::print_repair_backups(&backups),
Err(repair_err) => {
local_state_db::print_diagnostic_guidance(startup_error);
return Ok(AppExitInfo::fatal(format!(
"failed to repair Codex local data automatically: {repair_err}"
)));
}
}
attempted_repair = true;
}
}
fn confirm(prompt: &str) -> std::io::Result<bool> {

View File

@@ -0,0 +1,183 @@
//! CLI recovery for local state database startup failures.
//!
//! This keeps user-facing repair and lock-contention handling out of the main
//! CLI dispatch path while preserving the TUI startup error as the boundary type.
use codex_tui::LocalStateDbStartupError;
use std::path::PathBuf;
pub(crate) fn startup_error(err: &std::io::Error) -> Option<&LocalStateDbStartupError> {
err.get_ref()
.and_then(|err| err.downcast_ref::<LocalStateDbStartupError>())
}
pub(crate) fn is_locked(detail: &str) -> bool {
let detail = detail.to_ascii_lowercase();
detail.contains("database is locked") || detail.contains("database is busy")
}
pub(crate) fn confirm_repair(startup_error: &LocalStateDbStartupError) -> std::io::Result<bool> {
eprintln!("Codex couldn't start because its local database appears to be damaged.");
eprintln!("Codex can try a safe repair by backing up those files and rebuilding them.");
print_technical_details(startup_error);
crate::confirm("Repair Codex local data now? [y/N]: ")
}
pub(crate) async fn repair_files(
startup_error: &LocalStateDbStartupError,
) -> std::io::Result<Vec<PathBuf>> {
let state_db_path = startup_error.state_db_path();
let sqlite_home = state_db_path.parent().ok_or_else(|| {
std::io::Error::other("state database path does not have a parent directory")
})?;
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map_or(0, |duration| duration.as_secs());
let repair_suffix = format!("codex-repair-{timestamp}");
let mut backups = Vec::new();
match tokio::fs::metadata(sqlite_home).await {
Ok(metadata) if metadata.is_dir() => {}
Ok(_) => {
backups.push(backup_path(sqlite_home, &repair_suffix).await?);
tokio::fs::create_dir_all(sqlite_home).await?;
}
Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
tokio::fs::create_dir_all(sqlite_home).await?;
}
Err(err) => return Err(err),
}
let logs_db_path = codex_state::logs_db_path(sqlite_home);
for path in sqlite_paths(state_db_path)
.into_iter()
.chain(sqlite_paths(logs_db_path.as_path()))
{
if tokio::fs::try_exists(path.as_path()).await? {
backups.push(backup_path(path.as_path(), &repair_suffix).await?);
}
}
if backups.is_empty() {
return Err(std::io::Error::other(
"no repairable Codex local data files were found",
));
}
Ok(backups)
}
pub(crate) fn print_repair_backups(backups: &[PathBuf]) {
eprintln!("Backed up Codex local data before repair:");
for backup in backups {
eprintln!(" {}", backup.display());
}
eprintln!("Retrying startup with rebuilt local data...");
}
pub(crate) fn print_diagnostic_guidance(startup_error: &LocalStateDbStartupError) {
eprintln!("Codex couldn't start because its local database appears to be damaged.");
eprintln!("Run `codex doctor` to check your setup and get next-step guidance.");
eprintln!("If this keeps happening, share the technical details below when asking for help.");
print_technical_details(startup_error);
}
pub(crate) fn print_locked_guidance(startup_error: &LocalStateDbStartupError) {
eprintln!("Codex couldn't start because another Codex process is using its local data.");
eprintln!("Quit any other copies of Codex that may still be running, then try again.");
print_technical_details(startup_error);
}
fn sqlite_paths(db_path: &std::path::Path) -> Vec<PathBuf> {
let mut wal_path = db_path.as_os_str().to_os_string();
wal_path.push("-wal");
let mut shm_path = db_path.as_os_str().to_os_string();
shm_path.push("-shm");
vec![
db_path.to_path_buf(),
PathBuf::from(wal_path),
PathBuf::from(shm_path),
]
}
async fn backup_path(path: &std::path::Path, repair_suffix: &str) -> std::io::Result<PathBuf> {
let file_name = path.file_name().ok_or_else(|| {
std::io::Error::other(format!(
"cannot create a repair backup name for {}",
path.display()
))
})?;
let mut sequence = 0;
loop {
let mut backup_name = file_name.to_os_string();
backup_name.push(format!(".{repair_suffix}.{sequence}.bak"));
let backup_path = path.with_file_name(backup_name);
if !tokio::fs::try_exists(backup_path.as_path()).await? {
tokio::fs::rename(path, backup_path.as_path()).await?;
return Ok(backup_path);
}
sequence += 1;
}
}
fn print_technical_details(startup_error: &LocalStateDbStartupError) {
eprintln!("Technical details:");
eprintln!(" Location: {}", startup_error.state_db_path().display());
eprintln!(" Cause: {}", startup_error.detail());
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
use tempfile::TempDir;
#[tokio::test]
async fn repair_backs_up_owned_database_files() -> std::io::Result<()> {
let temp_dir = TempDir::new()?;
let state_path = codex_state::state_db_path(temp_dir.path());
let logs_path = codex_state::logs_db_path(temp_dir.path());
let state_sidecars = sqlite_paths(state_path.as_path());
tokio::fs::write(state_path.as_path(), b"state").await?;
tokio::fs::write(state_sidecars[1].as_path(), b"state-wal").await?;
tokio::fs::write(logs_path.as_path(), b"logs").await?;
let startup_error =
LocalStateDbStartupError::new(state_path.clone(), "corrupt".to_string());
let backups = repair_files(&startup_error).await?;
assert_eq!(backups.len(), 3);
assert!(!tokio::fs::try_exists(state_path.as_path()).await?);
assert!(!tokio::fs::try_exists(state_sidecars[1].as_path()).await?);
assert!(!tokio::fs::try_exists(logs_path.as_path()).await?);
for backup in backups {
assert!(tokio::fs::try_exists(backup.as_path()).await?);
}
Ok(())
}
#[tokio::test]
async fn repair_replaces_blocking_sqlite_home_file() -> std::io::Result<()> {
let temp_dir = TempDir::new()?;
let sqlite_home = temp_dir.path().join("sqlite-home");
tokio::fs::write(sqlite_home.as_path(), b"not-a-directory").await?;
let startup_error = LocalStateDbStartupError::new(
codex_state::state_db_path(sqlite_home.as_path()),
"File exists".to_string(),
);
let backups = repair_files(&startup_error).await?;
assert_eq!(backups.len(), 1);
assert!(tokio::fs::metadata(sqlite_home.as_path()).await?.is_dir());
assert!(tokio::fs::try_exists(backups[0].as_path()).await?);
Ok(())
}
#[test]
fn lock_failures_skip_repair() {
assert!(is_locked("database is locked"));
assert!(is_locked("database is busy"));
assert!(!is_locked("database disk image is malformed"));
}
}

View File

@@ -5,7 +5,7 @@ use codex_utils_cli::ApprovalModeCliArg;
use codex_utils_cli::CliConfigOverrides;
use codex_utils_cli::SharedCliOptions;
#[derive(Parser, Debug)]
#[derive(Parser, Clone, Debug)]
#[command(version)]
pub struct Cli {
/// Optional user prompt to start the session.
@@ -89,7 +89,7 @@ impl std::ops::DerefMut for Cli {
}
}
#[derive(Debug, Default)]
#[derive(Clone, Debug, Default)]
pub struct TuiSharedCliOptions(SharedCliOptions);
impl TuiSharedCliOptions {

View File

@@ -14,6 +14,7 @@ use crate::legacy_core::format_exec_policy_error_with_source;
use crate::legacy_core::windows_sandbox::WindowsSandboxLevelExt;
use crate::session_resume::ResolveCwdOutcome;
use crate::session_resume::resolve_cwd_for_resume_or_fork;
pub use crate::startup_error::LocalStateDbStartupError;
use additional_dirs::add_dir_warning_message;
use app::App;
pub use app::AppExitInfo;
@@ -167,6 +168,7 @@ mod session_state;
mod shimmer;
mod skills_helpers;
mod slash_command;
mod startup_error;
mod startup_hooks_review;
mod status;
mod status_indicator_widget;
@@ -312,6 +314,21 @@ pub(crate) enum AppServerTarget {
Remote { endpoint: RemoteAppServerEndpoint },
}
async fn init_state_db_for_app_server_target(
config: &Config,
app_server_target: &AppServerTarget,
) -> std::io::Result<Option<StateDbHandle>> {
match app_server_target {
AppServerTarget::Embedded => state_db::try_init(config).await.map(Some).map_err(|err| {
std::io::Error::other(LocalStateDbStartupError::new(
codex_state::state_db_path(config.sqlite_home.as_path()),
err.to_string(),
))
}),
AppServerTarget::Remote { .. } => Ok(state_db::get_state_db(config).await),
}
}
fn remote_addr_has_explicit_port(addr: &str, parsed: &Url) -> bool {
let Some(host) = parsed.host_str() else {
return false;
@@ -509,7 +526,7 @@ pub(crate) async fn start_app_server_for_picker(
pub(crate) async fn start_embedded_app_server_for_picker(
config: &Config,
) -> color_eyre::Result<AppServerSession> {
let state_db = state_db::init(config).await;
let state_db = init_state_db_for_app_server_target(config, &AppServerTarget::Embedded).await?;
start_app_server_for_picker(
config,
&AppServerTarget::Embedded,
@@ -989,10 +1006,7 @@ pub async fn run_main(
otel.as_ref(),
otel_originator.as_str(),
);
let state_db = match &app_server_target {
AppServerTarget::Embedded => state_db::init(&config).await,
AppServerTarget::Remote { .. } => state_db::get_state_db(&config).await,
};
let state_db = init_state_db_for_app_server_target(&config, &app_server_target).await?;
let effective_toml = config.config_layer_stack.effective_config();
match effective_toml.try_into() {
@@ -1823,7 +1837,8 @@ mod tests {
async fn start_test_embedded_app_server(
config: Config,
) -> color_eyre::Result<InProcessAppServerClient> {
let state_db = state_db::init(&config).await;
let state_db =
init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await?;
start_embedded_app_server(
Arg0DispatchPaths::default(),
config,
@@ -2416,6 +2431,37 @@ mod tests {
);
Ok(())
}
#[tokio::test]
async fn embedded_state_db_failure_is_typed_for_cli_recovery() -> color_eyre::Result<()> {
let temp_dir = TempDir::new()?;
let mut config = build_config(&temp_dir).await?;
let occupied_sqlite_home = temp_dir.path().join("sqlite-home");
std::fs::write(&occupied_sqlite_home, "occupied")?;
config.sqlite_home = occupied_sqlite_home.clone();
let err =
match init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await {
Ok(_) => panic!("embedded startup should surface state db init failures"),
Err(err) => err,
};
let startup_error = err
.get_ref()
.and_then(|err| err.downcast_ref::<LocalStateDbStartupError>())
.expect("state db startup failure should retain its typed context");
assert_eq!(
startup_error.state_db_path(),
codex_state::state_db_path(occupied_sqlite_home.as_path()).as_path()
);
assert!(
startup_error
.detail()
.contains("failed to initialize state runtime"),
"startup error should preserve the underlying state db failure"
);
Ok(())
}
#[tokio::test]
#[serial]
async fn windows_shows_trust_prompt_with_sandbox() -> std::io::Result<()> {

View File

@@ -0,0 +1,29 @@
use std::path::Path;
use std::path::PathBuf;
#[derive(Debug, thiserror::Error)]
#[error(
"failed to initialize sqlite state db at {}: {detail}",
state_db_path.display()
)]
pub struct LocalStateDbStartupError {
state_db_path: PathBuf,
detail: String,
}
impl LocalStateDbStartupError {
pub fn new(state_db_path: PathBuf, detail: String) -> Self {
Self {
state_db_path,
detail,
}
}
pub fn state_db_path(&self) -> &Path {
self.state_db_path.as_path()
}
pub fn detail(&self) -> &str {
self.detail.as_str()
}
}

View File

@@ -5,7 +5,7 @@ use clap::Args;
use codex_protocol::config_types::ProfileV2Name;
use std::path::PathBuf;
#[derive(Args, Debug, Default)]
#[derive(Args, Clone, Debug, Default)]
pub struct SharedCliOptions {
/// Optional image(s) to attach to the initial prompt.
#[arg(