diff --git a/codex-rs/cli/src/main.rs b/codex-rs/cli/src/main.rs index de5f52f4e7..87ac5d6676 100644 --- a/codex-rs/cli/src/main.rs +++ b/codex-rs/cli/src/main.rs @@ -52,6 +52,7 @@ mod doctor; mod marketplace_cmd; mod mcp_cmd; mod plugin_cmd; +mod state_db_recovery; #[cfg(not(windows))] mod wsl_paths; @@ -59,6 +60,7 @@ use crate::mcp_cmd::McpCli; use crate::plugin_cmd::PluginCli; use crate::plugin_cmd::PluginSubcommand; use doctor::DoctorCommand; +use state_db_recovery as local_state_db; use codex_config::LoaderOverrides; use codex_core::build_models_manager; @@ -1980,13 +1982,47 @@ async fn run_interactive_tui( }; *slot = Some(auth_token); } - codex_tui::run_main( - interactive, - arg0_paths, - codex_config::LoaderOverrides::default(), - remote_endpoint, - ) - .await + let start_tui = || { + codex_tui::run_main( + interactive.clone(), + arg0_paths.clone(), + codex_config::LoaderOverrides::default(), + remote_endpoint.clone(), + ) + }; + let mut attempted_repair = false; + loop { + let err = match start_tui().await { + Ok(exit_info) => return Ok(exit_info), + Err(err) => err, + }; + let Some(startup_error) = local_state_db::startup_error(&err) else { + return Err(err); + }; + if local_state_db::is_locked(startup_error.detail()) { + local_state_db::print_locked_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + if attempted_repair { + local_state_db::print_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + if !local_state_db::confirm_repair(startup_error)? { + local_state_db::print_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + + match local_state_db::repair_files(startup_error).await { + Ok(backups) => local_state_db::print_repair_backups(&backups), + Err(repair_err) => { + local_state_db::print_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(format!( + "failed to repair Codex local data automatically: {repair_err}" + ))); + } + } + attempted_repair = true; + } } fn confirm(prompt: &str) -> std::io::Result { diff --git a/codex-rs/cli/src/state_db_recovery.rs b/codex-rs/cli/src/state_db_recovery.rs new file mode 100644 index 0000000000..8db134540a --- /dev/null +++ b/codex-rs/cli/src/state_db_recovery.rs @@ -0,0 +1,183 @@ +//! CLI recovery for local state database startup failures. +//! +//! This keeps user-facing repair and lock-contention handling out of the main +//! CLI dispatch path while preserving the TUI startup error as the boundary type. + +use codex_tui::LocalStateDbStartupError; +use std::path::PathBuf; + +pub(crate) fn startup_error(err: &std::io::Error) -> Option<&LocalStateDbStartupError> { + err.get_ref() + .and_then(|err| err.downcast_ref::()) +} + +pub(crate) fn is_locked(detail: &str) -> bool { + let detail = detail.to_ascii_lowercase(); + detail.contains("database is locked") || detail.contains("database is busy") +} + +pub(crate) fn confirm_repair(startup_error: &LocalStateDbStartupError) -> std::io::Result { + eprintln!("Codex couldn't start because its local database appears to be damaged."); + eprintln!("Codex can try a safe repair by backing up those files and rebuilding them."); + print_technical_details(startup_error); + crate::confirm("Repair Codex local data now? [y/N]: ") +} + +pub(crate) async fn repair_files( + startup_error: &LocalStateDbStartupError, +) -> std::io::Result> { + let state_db_path = startup_error.state_db_path(); + let sqlite_home = state_db_path.parent().ok_or_else(|| { + std::io::Error::other("state database path does not have a parent directory") + })?; + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_or(0, |duration| duration.as_secs()); + let repair_suffix = format!("codex-repair-{timestamp}"); + let mut backups = Vec::new(); + + match tokio::fs::metadata(sqlite_home).await { + Ok(metadata) if metadata.is_dir() => {} + Ok(_) => { + backups.push(backup_path(sqlite_home, &repair_suffix).await?); + tokio::fs::create_dir_all(sqlite_home).await?; + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + tokio::fs::create_dir_all(sqlite_home).await?; + } + Err(err) => return Err(err), + } + + let logs_db_path = codex_state::logs_db_path(sqlite_home); + for path in sqlite_paths(state_db_path) + .into_iter() + .chain(sqlite_paths(logs_db_path.as_path())) + { + if tokio::fs::try_exists(path.as_path()).await? { + backups.push(backup_path(path.as_path(), &repair_suffix).await?); + } + } + + if backups.is_empty() { + return Err(std::io::Error::other( + "no repairable Codex local data files were found", + )); + } + + Ok(backups) +} + +pub(crate) fn print_repair_backups(backups: &[PathBuf]) { + eprintln!("Backed up Codex local data before repair:"); + for backup in backups { + eprintln!(" {}", backup.display()); + } + eprintln!("Retrying startup with rebuilt local data..."); +} + +pub(crate) fn print_diagnostic_guidance(startup_error: &LocalStateDbStartupError) { + eprintln!("Codex couldn't start because its local database appears to be damaged."); + eprintln!("Run `codex doctor` to check your setup and get next-step guidance."); + eprintln!("If this keeps happening, share the technical details below when asking for help."); + print_technical_details(startup_error); +} + +pub(crate) fn print_locked_guidance(startup_error: &LocalStateDbStartupError) { + eprintln!("Codex couldn't start because another Codex process is using its local data."); + eprintln!("Quit any other copies of Codex that may still be running, then try again."); + print_technical_details(startup_error); +} + +fn sqlite_paths(db_path: &std::path::Path) -> Vec { + let mut wal_path = db_path.as_os_str().to_os_string(); + wal_path.push("-wal"); + let mut shm_path = db_path.as_os_str().to_os_string(); + shm_path.push("-shm"); + vec![ + db_path.to_path_buf(), + PathBuf::from(wal_path), + PathBuf::from(shm_path), + ] +} + +async fn backup_path(path: &std::path::Path, repair_suffix: &str) -> std::io::Result { + let file_name = path.file_name().ok_or_else(|| { + std::io::Error::other(format!( + "cannot create a repair backup name for {}", + path.display() + )) + })?; + let mut sequence = 0; + loop { + let mut backup_name = file_name.to_os_string(); + backup_name.push(format!(".{repair_suffix}.{sequence}.bak")); + let backup_path = path.with_file_name(backup_name); + if !tokio::fs::try_exists(backup_path.as_path()).await? { + tokio::fs::rename(path, backup_path.as_path()).await?; + return Ok(backup_path); + } + sequence += 1; + } +} + +fn print_technical_details(startup_error: &LocalStateDbStartupError) { + eprintln!("Technical details:"); + eprintln!(" Location: {}", startup_error.state_db_path().display()); + eprintln!(" Cause: {}", startup_error.detail()); +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + use tempfile::TempDir; + + #[tokio::test] + async fn repair_backs_up_owned_database_files() -> std::io::Result<()> { + let temp_dir = TempDir::new()?; + let state_path = codex_state::state_db_path(temp_dir.path()); + let logs_path = codex_state::logs_db_path(temp_dir.path()); + let state_sidecars = sqlite_paths(state_path.as_path()); + tokio::fs::write(state_path.as_path(), b"state").await?; + tokio::fs::write(state_sidecars[1].as_path(), b"state-wal").await?; + tokio::fs::write(logs_path.as_path(), b"logs").await?; + + let startup_error = + LocalStateDbStartupError::new(state_path.clone(), "corrupt".to_string()); + let backups = repair_files(&startup_error).await?; + + assert_eq!(backups.len(), 3); + assert!(!tokio::fs::try_exists(state_path.as_path()).await?); + assert!(!tokio::fs::try_exists(state_sidecars[1].as_path()).await?); + assert!(!tokio::fs::try_exists(logs_path.as_path()).await?); + for backup in backups { + assert!(tokio::fs::try_exists(backup.as_path()).await?); + } + Ok(()) + } + + #[tokio::test] + async fn repair_replaces_blocking_sqlite_home_file() -> std::io::Result<()> { + let temp_dir = TempDir::new()?; + let sqlite_home = temp_dir.path().join("sqlite-home"); + tokio::fs::write(sqlite_home.as_path(), b"not-a-directory").await?; + let startup_error = LocalStateDbStartupError::new( + codex_state::state_db_path(sqlite_home.as_path()), + "File exists".to_string(), + ); + + let backups = repair_files(&startup_error).await?; + + assert_eq!(backups.len(), 1); + assert!(tokio::fs::metadata(sqlite_home.as_path()).await?.is_dir()); + assert!(tokio::fs::try_exists(backups[0].as_path()).await?); + Ok(()) + } + + #[test] + fn lock_failures_skip_repair() { + assert!(is_locked("database is locked")); + assert!(is_locked("database is busy")); + assert!(!is_locked("database disk image is malformed")); + } +} diff --git a/codex-rs/tui/src/cli.rs b/codex-rs/tui/src/cli.rs index 4ec69bf26f..13ddffb14a 100644 --- a/codex-rs/tui/src/cli.rs +++ b/codex-rs/tui/src/cli.rs @@ -5,7 +5,7 @@ use codex_utils_cli::ApprovalModeCliArg; use codex_utils_cli::CliConfigOverrides; use codex_utils_cli::SharedCliOptions; -#[derive(Parser, Debug)] +#[derive(Parser, Clone, Debug)] #[command(version)] pub struct Cli { /// Optional user prompt to start the session. @@ -89,7 +89,7 @@ impl std::ops::DerefMut for Cli { } } -#[derive(Debug, Default)] +#[derive(Clone, Debug, Default)] pub struct TuiSharedCliOptions(SharedCliOptions); impl TuiSharedCliOptions { diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs index c3ec776206..69b6bec0f3 100644 --- a/codex-rs/tui/src/lib.rs +++ b/codex-rs/tui/src/lib.rs @@ -14,6 +14,7 @@ use crate::legacy_core::format_exec_policy_error_with_source; use crate::legacy_core::windows_sandbox::WindowsSandboxLevelExt; use crate::session_resume::ResolveCwdOutcome; use crate::session_resume::resolve_cwd_for_resume_or_fork; +pub use crate::startup_error::LocalStateDbStartupError; use additional_dirs::add_dir_warning_message; use app::App; pub use app::AppExitInfo; @@ -167,6 +168,7 @@ mod session_state; mod shimmer; mod skills_helpers; mod slash_command; +mod startup_error; mod startup_hooks_review; mod status; mod status_indicator_widget; @@ -312,6 +314,21 @@ pub(crate) enum AppServerTarget { Remote { endpoint: RemoteAppServerEndpoint }, } +async fn init_state_db_for_app_server_target( + config: &Config, + app_server_target: &AppServerTarget, +) -> std::io::Result> { + match app_server_target { + AppServerTarget::Embedded => state_db::try_init(config).await.map(Some).map_err(|err| { + std::io::Error::other(LocalStateDbStartupError::new( + codex_state::state_db_path(config.sqlite_home.as_path()), + err.to_string(), + )) + }), + AppServerTarget::Remote { .. } => Ok(state_db::get_state_db(config).await), + } +} + fn remote_addr_has_explicit_port(addr: &str, parsed: &Url) -> bool { let Some(host) = parsed.host_str() else { return false; @@ -509,7 +526,7 @@ pub(crate) async fn start_app_server_for_picker( pub(crate) async fn start_embedded_app_server_for_picker( config: &Config, ) -> color_eyre::Result { - let state_db = state_db::init(config).await; + let state_db = init_state_db_for_app_server_target(config, &AppServerTarget::Embedded).await?; start_app_server_for_picker( config, &AppServerTarget::Embedded, @@ -989,10 +1006,7 @@ pub async fn run_main( otel.as_ref(), otel_originator.as_str(), ); - let state_db = match &app_server_target { - AppServerTarget::Embedded => state_db::init(&config).await, - AppServerTarget::Remote { .. } => state_db::get_state_db(&config).await, - }; + let state_db = init_state_db_for_app_server_target(&config, &app_server_target).await?; let effective_toml = config.config_layer_stack.effective_config(); match effective_toml.try_into() { @@ -1823,7 +1837,8 @@ mod tests { async fn start_test_embedded_app_server( config: Config, ) -> color_eyre::Result { - let state_db = state_db::init(&config).await; + let state_db = + init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await?; start_embedded_app_server( Arg0DispatchPaths::default(), config, @@ -2416,6 +2431,37 @@ mod tests { ); Ok(()) } + + #[tokio::test] + async fn embedded_state_db_failure_is_typed_for_cli_recovery() -> color_eyre::Result<()> { + let temp_dir = TempDir::new()?; + let mut config = build_config(&temp_dir).await?; + let occupied_sqlite_home = temp_dir.path().join("sqlite-home"); + std::fs::write(&occupied_sqlite_home, "occupied")?; + config.sqlite_home = occupied_sqlite_home.clone(); + + let err = + match init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await { + Ok(_) => panic!("embedded startup should surface state db init failures"), + Err(err) => err, + }; + let startup_error = err + .get_ref() + .and_then(|err| err.downcast_ref::()) + .expect("state db startup failure should retain its typed context"); + + assert_eq!( + startup_error.state_db_path(), + codex_state::state_db_path(occupied_sqlite_home.as_path()).as_path() + ); + assert!( + startup_error + .detail() + .contains("failed to initialize state runtime"), + "startup error should preserve the underlying state db failure" + ); + Ok(()) + } #[tokio::test] #[serial] async fn windows_shows_trust_prompt_with_sandbox() -> std::io::Result<()> { diff --git a/codex-rs/tui/src/startup_error.rs b/codex-rs/tui/src/startup_error.rs new file mode 100644 index 0000000000..55b691d836 --- /dev/null +++ b/codex-rs/tui/src/startup_error.rs @@ -0,0 +1,29 @@ +use std::path::Path; +use std::path::PathBuf; + +#[derive(Debug, thiserror::Error)] +#[error( + "failed to initialize sqlite state db at {}: {detail}", + state_db_path.display() +)] +pub struct LocalStateDbStartupError { + state_db_path: PathBuf, + detail: String, +} + +impl LocalStateDbStartupError { + pub fn new(state_db_path: PathBuf, detail: String) -> Self { + Self { + state_db_path, + detail, + } + } + + pub fn state_db_path(&self) -> &Path { + self.state_db_path.as_path() + } + + pub fn detail(&self) -> &str { + self.detail.as_str() + } +} diff --git a/codex-rs/utils/cli/src/shared_options.rs b/codex-rs/utils/cli/src/shared_options.rs index 8735cb6db8..b59a3b5c85 100644 --- a/codex-rs/utils/cli/src/shared_options.rs +++ b/codex-rs/utils/cli/src/shared_options.rs @@ -5,7 +5,7 @@ use clap::Args; use codex_protocol::config_types::ProfileV2Name; use std::path::PathBuf; -#[derive(Args, Debug, Default)] +#[derive(Args, Clone, Debug, Default)] pub struct SharedCliOptions { /// Optional image(s) to attach to the initial prompt. #[arg(