mirror of
https://github.com/openai/codex.git
synced 2026-05-15 16:53:05 +00:00
tui: recover local state db startup failures (#22734)
## Why #22580 made app-server startup fail when the local SQLite state database cannot be initialized. Embedded/local TUI startup still continued on the permissive path, which left the CLI inconsistent and could hide a real startup problem behind unrelated UI. This brings local TUI startup onto the same fail-closed behavior while keeping recovery humane for the two failure modes we are seeing in practice: damaged database files and startup stalls caused by another process holding the database write lock. ## What changed - Embedded TUI startup now uses `state_db::try_init(...)` and returns a typed `LocalStateDbStartupError` that preserves the affected database path plus the underlying failure detail. - CLI startup handles that failure before entering the interactive TUI: - lock-contention failures tell users to quit other Codex processes and try again - failures consistent with a broken local database offer a safe repair that backs up Codex-owned SQLite files, rebuilds local database files, and retries startup once - declined or unsuccessful repairs print concise guidance plus technical details - Shared startup error plumbing lives in `tui/src/startup_error.rs`, while CLI recovery policy and focused recovery tests live in `cli/src/state_db_recovery.rs`. ## Verification - `cargo test -p codex-tui embedded_state_db_failure_is_typed_for_cli_recovery` - `cargo test -p codex-cli state_db_recovery` - Manually held an exclusive SQLite lock on `state_5.sqlite` and confirmed the CLI shows lock-specific guidance without offering repair. - Manually exercised the repair path with a deliberately invalid `sqlite_home` and confirmed it backs up the blocking path and resumes startup.
This commit is contained in:
@@ -52,6 +52,7 @@ mod doctor;
|
||||
mod marketplace_cmd;
|
||||
mod mcp_cmd;
|
||||
mod plugin_cmd;
|
||||
mod state_db_recovery;
|
||||
#[cfg(not(windows))]
|
||||
mod wsl_paths;
|
||||
|
||||
@@ -59,6 +60,7 @@ use crate::mcp_cmd::McpCli;
|
||||
use crate::plugin_cmd::PluginCli;
|
||||
use crate::plugin_cmd::PluginSubcommand;
|
||||
use doctor::DoctorCommand;
|
||||
use state_db_recovery as local_state_db;
|
||||
|
||||
use codex_config::LoaderOverrides;
|
||||
use codex_core::build_models_manager;
|
||||
@@ -1980,13 +1982,47 @@ async fn run_interactive_tui(
|
||||
};
|
||||
*slot = Some(auth_token);
|
||||
}
|
||||
codex_tui::run_main(
|
||||
interactive,
|
||||
arg0_paths,
|
||||
codex_config::LoaderOverrides::default(),
|
||||
remote_endpoint,
|
||||
)
|
||||
.await
|
||||
let start_tui = || {
|
||||
codex_tui::run_main(
|
||||
interactive.clone(),
|
||||
arg0_paths.clone(),
|
||||
codex_config::LoaderOverrides::default(),
|
||||
remote_endpoint.clone(),
|
||||
)
|
||||
};
|
||||
let mut attempted_repair = false;
|
||||
loop {
|
||||
let err = match start_tui().await {
|
||||
Ok(exit_info) => return Ok(exit_info),
|
||||
Err(err) => err,
|
||||
};
|
||||
let Some(startup_error) = local_state_db::startup_error(&err) else {
|
||||
return Err(err);
|
||||
};
|
||||
if local_state_db::is_locked(startup_error.detail()) {
|
||||
local_state_db::print_locked_guidance(startup_error);
|
||||
return Ok(AppExitInfo::fatal(startup_error.to_string()));
|
||||
}
|
||||
if attempted_repair {
|
||||
local_state_db::print_diagnostic_guidance(startup_error);
|
||||
return Ok(AppExitInfo::fatal(startup_error.to_string()));
|
||||
}
|
||||
if !local_state_db::confirm_repair(startup_error)? {
|
||||
local_state_db::print_diagnostic_guidance(startup_error);
|
||||
return Ok(AppExitInfo::fatal(startup_error.to_string()));
|
||||
}
|
||||
|
||||
match local_state_db::repair_files(startup_error).await {
|
||||
Ok(backups) => local_state_db::print_repair_backups(&backups),
|
||||
Err(repair_err) => {
|
||||
local_state_db::print_diagnostic_guidance(startup_error);
|
||||
return Ok(AppExitInfo::fatal(format!(
|
||||
"failed to repair Codex local data automatically: {repair_err}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
attempted_repair = true;
|
||||
}
|
||||
}
|
||||
|
||||
fn confirm(prompt: &str) -> std::io::Result<bool> {
|
||||
|
||||
183
codex-rs/cli/src/state_db_recovery.rs
Normal file
183
codex-rs/cli/src/state_db_recovery.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
//! CLI recovery for local state database startup failures.
|
||||
//!
|
||||
//! This keeps user-facing repair and lock-contention handling out of the main
|
||||
//! CLI dispatch path while preserving the TUI startup error as the boundary type.
|
||||
|
||||
use codex_tui::LocalStateDbStartupError;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub(crate) fn startup_error(err: &std::io::Error) -> Option<&LocalStateDbStartupError> {
|
||||
err.get_ref()
|
||||
.and_then(|err| err.downcast_ref::<LocalStateDbStartupError>())
|
||||
}
|
||||
|
||||
pub(crate) fn is_locked(detail: &str) -> bool {
|
||||
let detail = detail.to_ascii_lowercase();
|
||||
detail.contains("database is locked") || detail.contains("database is busy")
|
||||
}
|
||||
|
||||
pub(crate) fn confirm_repair(startup_error: &LocalStateDbStartupError) -> std::io::Result<bool> {
|
||||
eprintln!("Codex couldn't start because its local database appears to be damaged.");
|
||||
eprintln!("Codex can try a safe repair by backing up those files and rebuilding them.");
|
||||
print_technical_details(startup_error);
|
||||
crate::confirm("Repair Codex local data now? [y/N]: ")
|
||||
}
|
||||
|
||||
pub(crate) async fn repair_files(
|
||||
startup_error: &LocalStateDbStartupError,
|
||||
) -> std::io::Result<Vec<PathBuf>> {
|
||||
let state_db_path = startup_error.state_db_path();
|
||||
let sqlite_home = state_db_path.parent().ok_or_else(|| {
|
||||
std::io::Error::other("state database path does not have a parent directory")
|
||||
})?;
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.map_or(0, |duration| duration.as_secs());
|
||||
let repair_suffix = format!("codex-repair-{timestamp}");
|
||||
let mut backups = Vec::new();
|
||||
|
||||
match tokio::fs::metadata(sqlite_home).await {
|
||||
Ok(metadata) if metadata.is_dir() => {}
|
||||
Ok(_) => {
|
||||
backups.push(backup_path(sqlite_home, &repair_suffix).await?);
|
||||
tokio::fs::create_dir_all(sqlite_home).await?;
|
||||
}
|
||||
Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
|
||||
tokio::fs::create_dir_all(sqlite_home).await?;
|
||||
}
|
||||
Err(err) => return Err(err),
|
||||
}
|
||||
|
||||
let logs_db_path = codex_state::logs_db_path(sqlite_home);
|
||||
for path in sqlite_paths(state_db_path)
|
||||
.into_iter()
|
||||
.chain(sqlite_paths(logs_db_path.as_path()))
|
||||
{
|
||||
if tokio::fs::try_exists(path.as_path()).await? {
|
||||
backups.push(backup_path(path.as_path(), &repair_suffix).await?);
|
||||
}
|
||||
}
|
||||
|
||||
if backups.is_empty() {
|
||||
return Err(std::io::Error::other(
|
||||
"no repairable Codex local data files were found",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(backups)
|
||||
}
|
||||
|
||||
pub(crate) fn print_repair_backups(backups: &[PathBuf]) {
|
||||
eprintln!("Backed up Codex local data before repair:");
|
||||
for backup in backups {
|
||||
eprintln!(" {}", backup.display());
|
||||
}
|
||||
eprintln!("Retrying startup with rebuilt local data...");
|
||||
}
|
||||
|
||||
pub(crate) fn print_diagnostic_guidance(startup_error: &LocalStateDbStartupError) {
|
||||
eprintln!("Codex couldn't start because its local database appears to be damaged.");
|
||||
eprintln!("Run `codex doctor` to check your setup and get next-step guidance.");
|
||||
eprintln!("If this keeps happening, share the technical details below when asking for help.");
|
||||
print_technical_details(startup_error);
|
||||
}
|
||||
|
||||
pub(crate) fn print_locked_guidance(startup_error: &LocalStateDbStartupError) {
|
||||
eprintln!("Codex couldn't start because another Codex process is using its local data.");
|
||||
eprintln!("Quit any other copies of Codex that may still be running, then try again.");
|
||||
print_technical_details(startup_error);
|
||||
}
|
||||
|
||||
fn sqlite_paths(db_path: &std::path::Path) -> Vec<PathBuf> {
|
||||
let mut wal_path = db_path.as_os_str().to_os_string();
|
||||
wal_path.push("-wal");
|
||||
let mut shm_path = db_path.as_os_str().to_os_string();
|
||||
shm_path.push("-shm");
|
||||
vec![
|
||||
db_path.to_path_buf(),
|
||||
PathBuf::from(wal_path),
|
||||
PathBuf::from(shm_path),
|
||||
]
|
||||
}
|
||||
|
||||
async fn backup_path(path: &std::path::Path, repair_suffix: &str) -> std::io::Result<PathBuf> {
|
||||
let file_name = path.file_name().ok_or_else(|| {
|
||||
std::io::Error::other(format!(
|
||||
"cannot create a repair backup name for {}",
|
||||
path.display()
|
||||
))
|
||||
})?;
|
||||
let mut sequence = 0;
|
||||
loop {
|
||||
let mut backup_name = file_name.to_os_string();
|
||||
backup_name.push(format!(".{repair_suffix}.{sequence}.bak"));
|
||||
let backup_path = path.with_file_name(backup_name);
|
||||
if !tokio::fs::try_exists(backup_path.as_path()).await? {
|
||||
tokio::fs::rename(path, backup_path.as_path()).await?;
|
||||
return Ok(backup_path);
|
||||
}
|
||||
sequence += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn print_technical_details(startup_error: &LocalStateDbStartupError) {
|
||||
eprintln!("Technical details:");
|
||||
eprintln!(" Location: {}", startup_error.state_db_path().display());
|
||||
eprintln!(" Cause: {}", startup_error.detail());
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn repair_backs_up_owned_database_files() -> std::io::Result<()> {
|
||||
let temp_dir = TempDir::new()?;
|
||||
let state_path = codex_state::state_db_path(temp_dir.path());
|
||||
let logs_path = codex_state::logs_db_path(temp_dir.path());
|
||||
let state_sidecars = sqlite_paths(state_path.as_path());
|
||||
tokio::fs::write(state_path.as_path(), b"state").await?;
|
||||
tokio::fs::write(state_sidecars[1].as_path(), b"state-wal").await?;
|
||||
tokio::fs::write(logs_path.as_path(), b"logs").await?;
|
||||
|
||||
let startup_error =
|
||||
LocalStateDbStartupError::new(state_path.clone(), "corrupt".to_string());
|
||||
let backups = repair_files(&startup_error).await?;
|
||||
|
||||
assert_eq!(backups.len(), 3);
|
||||
assert!(!tokio::fs::try_exists(state_path.as_path()).await?);
|
||||
assert!(!tokio::fs::try_exists(state_sidecars[1].as_path()).await?);
|
||||
assert!(!tokio::fs::try_exists(logs_path.as_path()).await?);
|
||||
for backup in backups {
|
||||
assert!(tokio::fs::try_exists(backup.as_path()).await?);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn repair_replaces_blocking_sqlite_home_file() -> std::io::Result<()> {
|
||||
let temp_dir = TempDir::new()?;
|
||||
let sqlite_home = temp_dir.path().join("sqlite-home");
|
||||
tokio::fs::write(sqlite_home.as_path(), b"not-a-directory").await?;
|
||||
let startup_error = LocalStateDbStartupError::new(
|
||||
codex_state::state_db_path(sqlite_home.as_path()),
|
||||
"File exists".to_string(),
|
||||
);
|
||||
|
||||
let backups = repair_files(&startup_error).await?;
|
||||
|
||||
assert_eq!(backups.len(), 1);
|
||||
assert!(tokio::fs::metadata(sqlite_home.as_path()).await?.is_dir());
|
||||
assert!(tokio::fs::try_exists(backups[0].as_path()).await?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lock_failures_skip_repair() {
|
||||
assert!(is_locked("database is locked"));
|
||||
assert!(is_locked("database is busy"));
|
||||
assert!(!is_locked("database disk image is malformed"));
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ use codex_utils_cli::ApprovalModeCliArg;
|
||||
use codex_utils_cli::CliConfigOverrides;
|
||||
use codex_utils_cli::SharedCliOptions;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[derive(Parser, Clone, Debug)]
|
||||
#[command(version)]
|
||||
pub struct Cli {
|
||||
/// Optional user prompt to start the session.
|
||||
@@ -89,7 +89,7 @@ impl std::ops::DerefMut for Cli {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct TuiSharedCliOptions(SharedCliOptions);
|
||||
|
||||
impl TuiSharedCliOptions {
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::legacy_core::format_exec_policy_error_with_source;
|
||||
use crate::legacy_core::windows_sandbox::WindowsSandboxLevelExt;
|
||||
use crate::session_resume::ResolveCwdOutcome;
|
||||
use crate::session_resume::resolve_cwd_for_resume_or_fork;
|
||||
pub use crate::startup_error::LocalStateDbStartupError;
|
||||
use additional_dirs::add_dir_warning_message;
|
||||
use app::App;
|
||||
pub use app::AppExitInfo;
|
||||
@@ -167,6 +168,7 @@ mod session_state;
|
||||
mod shimmer;
|
||||
mod skills_helpers;
|
||||
mod slash_command;
|
||||
mod startup_error;
|
||||
mod startup_hooks_review;
|
||||
mod status;
|
||||
mod status_indicator_widget;
|
||||
@@ -312,6 +314,21 @@ pub(crate) enum AppServerTarget {
|
||||
Remote { endpoint: RemoteAppServerEndpoint },
|
||||
}
|
||||
|
||||
async fn init_state_db_for_app_server_target(
|
||||
config: &Config,
|
||||
app_server_target: &AppServerTarget,
|
||||
) -> std::io::Result<Option<StateDbHandle>> {
|
||||
match app_server_target {
|
||||
AppServerTarget::Embedded => state_db::try_init(config).await.map(Some).map_err(|err| {
|
||||
std::io::Error::other(LocalStateDbStartupError::new(
|
||||
codex_state::state_db_path(config.sqlite_home.as_path()),
|
||||
err.to_string(),
|
||||
))
|
||||
}),
|
||||
AppServerTarget::Remote { .. } => Ok(state_db::get_state_db(config).await),
|
||||
}
|
||||
}
|
||||
|
||||
fn remote_addr_has_explicit_port(addr: &str, parsed: &Url) -> bool {
|
||||
let Some(host) = parsed.host_str() else {
|
||||
return false;
|
||||
@@ -509,7 +526,7 @@ pub(crate) async fn start_app_server_for_picker(
|
||||
pub(crate) async fn start_embedded_app_server_for_picker(
|
||||
config: &Config,
|
||||
) -> color_eyre::Result<AppServerSession> {
|
||||
let state_db = state_db::init(config).await;
|
||||
let state_db = init_state_db_for_app_server_target(config, &AppServerTarget::Embedded).await?;
|
||||
start_app_server_for_picker(
|
||||
config,
|
||||
&AppServerTarget::Embedded,
|
||||
@@ -989,10 +1006,7 @@ pub async fn run_main(
|
||||
otel.as_ref(),
|
||||
otel_originator.as_str(),
|
||||
);
|
||||
let state_db = match &app_server_target {
|
||||
AppServerTarget::Embedded => state_db::init(&config).await,
|
||||
AppServerTarget::Remote { .. } => state_db::get_state_db(&config).await,
|
||||
};
|
||||
let state_db = init_state_db_for_app_server_target(&config, &app_server_target).await?;
|
||||
|
||||
let effective_toml = config.config_layer_stack.effective_config();
|
||||
match effective_toml.try_into() {
|
||||
@@ -1823,7 +1837,8 @@ mod tests {
|
||||
async fn start_test_embedded_app_server(
|
||||
config: Config,
|
||||
) -> color_eyre::Result<InProcessAppServerClient> {
|
||||
let state_db = state_db::init(&config).await;
|
||||
let state_db =
|
||||
init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await?;
|
||||
start_embedded_app_server(
|
||||
Arg0DispatchPaths::default(),
|
||||
config,
|
||||
@@ -2416,6 +2431,37 @@ mod tests {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn embedded_state_db_failure_is_typed_for_cli_recovery() -> color_eyre::Result<()> {
|
||||
let temp_dir = TempDir::new()?;
|
||||
let mut config = build_config(&temp_dir).await?;
|
||||
let occupied_sqlite_home = temp_dir.path().join("sqlite-home");
|
||||
std::fs::write(&occupied_sqlite_home, "occupied")?;
|
||||
config.sqlite_home = occupied_sqlite_home.clone();
|
||||
|
||||
let err =
|
||||
match init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await {
|
||||
Ok(_) => panic!("embedded startup should surface state db init failures"),
|
||||
Err(err) => err,
|
||||
};
|
||||
let startup_error = err
|
||||
.get_ref()
|
||||
.and_then(|err| err.downcast_ref::<LocalStateDbStartupError>())
|
||||
.expect("state db startup failure should retain its typed context");
|
||||
|
||||
assert_eq!(
|
||||
startup_error.state_db_path(),
|
||||
codex_state::state_db_path(occupied_sqlite_home.as_path()).as_path()
|
||||
);
|
||||
assert!(
|
||||
startup_error
|
||||
.detail()
|
||||
.contains("failed to initialize state runtime"),
|
||||
"startup error should preserve the underlying state db failure"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
#[tokio::test]
|
||||
#[serial]
|
||||
async fn windows_shows_trust_prompt_with_sandbox() -> std::io::Result<()> {
|
||||
|
||||
29
codex-rs/tui/src/startup_error.rs
Normal file
29
codex-rs/tui/src/startup_error.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error(
|
||||
"failed to initialize sqlite state db at {}: {detail}",
|
||||
state_db_path.display()
|
||||
)]
|
||||
pub struct LocalStateDbStartupError {
|
||||
state_db_path: PathBuf,
|
||||
detail: String,
|
||||
}
|
||||
|
||||
impl LocalStateDbStartupError {
|
||||
pub fn new(state_db_path: PathBuf, detail: String) -> Self {
|
||||
Self {
|
||||
state_db_path,
|
||||
detail,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn state_db_path(&self) -> &Path {
|
||||
self.state_db_path.as_path()
|
||||
}
|
||||
|
||||
pub fn detail(&self) -> &str {
|
||||
self.detail.as_str()
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ use clap::Args;
|
||||
use codex_protocol::config_types::ProfileV2Name;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Args, Debug, Default)]
|
||||
#[derive(Args, Clone, Debug, Default)]
|
||||
pub struct SharedCliOptions {
|
||||
/// Optional image(s) to attach to the initial prompt.
|
||||
#[arg(
|
||||
|
||||
Reference in New Issue
Block a user