feat: drop discrepancy metrics (#13753)

This commit is contained in:
jif-oai
2026-03-06 17:32:25 +00:00
committed by GitHub
parent fa16c26908
commit f891f516a5
4 changed files with 11 additions and 29 deletions

View File

@@ -1203,7 +1203,9 @@ async fn find_thread_path_by_id_str_in_subdir(
"state db returned stale rollout path for thread {id_str}: {}",
db_path.display()
);
state_db::record_discrepancy("find_thread_path_by_id_str_in_subdir", "stale_db_path");
tracing::warn!(
"state db discrepancy during find_thread_path_by_id_str_in_subdir: stale_db_path"
);
}
let mut root = codex_home.to_path_buf();
@@ -1227,7 +1229,9 @@ async fn find_thread_path_by_id_str_in_subdir(
let found = results.matches.into_iter().next().map(|m| m.full_path());
if let Some(found_path) = found.as_ref() {
tracing::debug!("state db missing rollout path for thread {id_str}");
state_db::record_discrepancy("find_thread_path_by_id_str_in_subdir", "falling_back");
tracing::warn!(
"state db discrepancy during find_thread_path_by_id_str_in_subdir: falling_back"
);
state_db::read_repair_rollout_path(
state_db_ctx.as_deref(),
thread_id,

View File

@@ -290,7 +290,7 @@ impl RolloutRecorder {
}
// If SQLite listing still fails, return the filesystem page rather than failing the list.
tracing::error!("Falling back on rollout system");
state_db::record_discrepancy("list_threads_with_db_fallback", "falling_back");
tracing::warn!("state db discrepancy during list_threads_with_db_fallback: falling_back");
Ok(truncate_fs_page(fs_page, page_size, sort_key))
}

View File

@@ -12,9 +12,7 @@ use codex_protocol::ThreadId;
use codex_protocol::dynamic_tools::DynamicToolSpec;
use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::SessionSource;
use codex_state::DB_METRIC_COMPARE_ERROR;
pub use codex_state::LogEntry;
use codex_state::STATE_DB_VERSION;
use codex_state::ThreadMetadataBuilder;
use serde_json::Value;
use std::path::Path;
@@ -267,7 +265,7 @@ pub async fn list_threads_db(
item.id,
item.rollout_path.display()
);
record_discrepancy("list_threads_db", "stale_db_path_dropped");
warn!("state db discrepancy during list_threads_db: stale_db_path_dropped");
let _ = ctx.delete_thread(item.id).await;
}
}
@@ -459,7 +457,7 @@ pub async fn read_repair_rollout_path(
if repaired == metadata {
return;
}
record_discrepancy("read_repair_rollout_path", "upsert_needed");
warn!("state db discrepancy during read_repair_rollout_path: upsert_needed (fast path)");
if let Err(err) = ctx.upsert_thread(&repaired).await {
warn!(
"state db read-repair upsert failed for {}: {err}",
@@ -473,7 +471,7 @@ pub async fn read_repair_rollout_path(
// Slow path: when the row is missing/unreadable (or direct upsert failed),
// rebuild metadata from rollout contents and reconcile it into SQLite.
if !saw_existing_metadata {
record_discrepancy("read_repair_rollout_path", "upsert_needed");
warn!("state db discrepancy during read_repair_rollout_path: upsert_needed (slow path)");
}
let default_provider = crate::rollout::list::read_session_meta_line(rollout_path)
.await
@@ -514,7 +512,7 @@ pub async fn apply_rollout_items(
"state db apply_rollout_items missing builder during {stage}: {}",
rollout_path.display()
);
record_discrepancy(stage, "missing_builder");
warn!("state db discrepancy during apply_rollout_items: {stage}, missing_builder");
return;
}
},
@@ -532,24 +530,6 @@ pub async fn apply_rollout_items(
}
}
/// Report a state-db discrepancy, labelled with the `stage` it occurred in and the `reason`.
///
/// A warning is always logged. When a global metrics handle is available, the
/// `DB_METRIC_COMPARE_ERROR` counter is also incremented by one, tagged with
/// `stage`, `reason`, and the current `STATE_DB_VERSION`.
pub fn record_discrepancy(stage: &str, reason: &str) {
    tracing::warn!("state db record_discrepancy: {stage}, {reason}");

    // Call sites might not have access to the broader OtelManager, so the
    // global metrics handle is used instead; with no handle there is nothing more to do.
    let Some(metrics) = codex_otel::metrics::global() else {
        return;
    };

    let version = STATE_DB_VERSION.to_string();
    // The counter's return value is intentionally discarded.
    let _ = metrics.counter(
        DB_METRIC_COMPARE_ERROR,
        1,
        &[
            ("stage", stage),
            ("reason", reason),
            ("version", version.as_str()),
        ],
    );
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -60,5 +60,3 @@ pub const DB_ERROR_METRIC: &str = "codex.db.error";
pub const DB_METRIC_BACKFILL: &str = "codex.db.backfill";
/// Metrics on backfill duration. Tags: [status]
pub const DB_METRIC_BACKFILL_DURATION_MS: &str = "codex.db.backfill.duration_ms";
/// Metrics on errors during comparison between DB and rollout file. Tags: [stage, reason, version]
pub const DB_METRIC_COMPARE_ERROR: &str = "codex.db.compare_error";