fix: db stuff mem (#11575)

* Documenting DB functions * Fixing 1 nit where stage-2 was sorting the stage 1 in the wrong direction * Added some tests
2026-05-02 02:17:22 +00:00 · 2026-02-12 12:53:47 +00:00
parent adad23f743
commit 19ab038488
2 changed files with 463 additions and 23 deletions
--- a/codex-rs/state/src/runtime.rs
+++ b/codex-rs/state/src/runtime.rs
@@ -1432,6 +1432,104 @@ WHERE id = 1
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }

+    #[tokio::test]
+    async fn claim_stage1_jobs_prefilters_threads_with_up_to_date_memory() {
+        let codex_home = unique_temp_dir();
+        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
+            .await
+            .expect("initialize runtime");
+
+        let now = Utc::now();
+        let eligible_newer_at = now - Duration::hours(13);
+        let eligible_older_at = now - Duration::hours(14);
+
+        let current_thread_id =
+            ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id");
+        let up_to_date_thread_id =
+            ThreadId::from_string(&Uuid::new_v4().to_string()).expect("up-to-date thread id");
+        let stale_thread_id =
+            ThreadId::from_string(&Uuid::new_v4().to_string()).expect("stale thread id");
+        let worker_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("worker id");
+
+        let mut current =
+            test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current"));
+        current.created_at = now;
+        current.updated_at = now;
+        runtime
+            .upsert_thread(&current)
+            .await
+            .expect("upsert current thread");
+
+        let mut up_to_date = test_thread_metadata(
+            &codex_home,
+            up_to_date_thread_id,
+            codex_home.join("up-to-date"),
+        );
+        up_to_date.created_at = eligible_newer_at;
+        up_to_date.updated_at = eligible_newer_at;
+        runtime
+            .upsert_thread(&up_to_date)
+            .await
+            .expect("upsert up-to-date thread");
+
+        let up_to_date_claim = runtime
+            .try_claim_stage1_job(
+                up_to_date_thread_id,
+                worker_id,
+                up_to_date.updated_at.timestamp(),
+                3600,
+                64,
+            )
+            .await
+            .expect("claim up-to-date thread for seed");
+        let up_to_date_token = match up_to_date_claim {
+            Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
+            other => panic!("unexpected seed claim outcome: {other:?}"),
+        };
+        assert!(
+            runtime
+                .mark_stage1_job_succeeded(
+                    up_to_date_thread_id,
+                    up_to_date_token.as_str(),
+                    up_to_date.updated_at.timestamp(),
+                    "raw",
+                    "summary",
+                )
+                .await
+                .expect("mark up-to-date thread succeeded"),
+            "seed stage1 success should complete for up-to-date thread"
+        );
+
+        let mut stale =
+            test_thread_metadata(&codex_home, stale_thread_id, codex_home.join("stale"));
+        stale.created_at = eligible_older_at;
+        stale.updated_at = eligible_older_at;
+        runtime
+            .upsert_thread(&stale)
+            .await
+            .expect("upsert stale thread");
+
+        let allowed_sources = vec!["cli".to_string()];
+        let claims = runtime
+            .claim_stage1_jobs_for_startup(
+                current_thread_id,
+                Stage1StartupClaimParams {
+                    scan_limit: 1,
+                    max_claimed: 1,
+                    max_age_days: 30,
+                    min_rollout_idle_hours: 12,
+                    allowed_sources: allowed_sources.as_slice(),
+                    lease_seconds: 3600,
+                },
+            )
+            .await
+            .expect("claim stage1 startup jobs");
+        assert_eq!(claims.len(), 1);
+        assert_eq!(claims[0].thread.id, stale_thread_id);
+
+        let _ = tokio::fs::remove_dir_all(codex_home).await;
+    }
+
    #[tokio::test]
    async fn claim_stage1_jobs_enforces_global_running_cap() {
        let codex_home = unique_temp_dir();
@@ -1559,6 +1657,92 @@ WHERE kind = 'memory_stage1'
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }

+    #[tokio::test]
+    async fn claim_stage1_jobs_processes_two_full_batches_across_startup_passes() {
+        let codex_home = unique_temp_dir();
+        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
+            .await
+            .expect("initialize runtime");
+
+        let current_thread_id =
+            ThreadId::from_string(&Uuid::new_v4().to_string()).expect("current thread id");
+        let mut current =
+            test_thread_metadata(&codex_home, current_thread_id, codex_home.join("current"));
+        current.created_at = Utc::now();
+        current.updated_at = Utc::now();
+        runtime
+            .upsert_thread(&current)
+            .await
+            .expect("upsert current");
+
+        let eligible_at = Utc::now() - Duration::hours(13);
+        for idx in 0..200 {
+            let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
+            let mut metadata = test_thread_metadata(
+                &codex_home,
+                thread_id,
+                codex_home.join(format!("thread-{idx}")),
+            );
+            metadata.created_at = eligible_at - Duration::seconds(idx as i64);
+            metadata.updated_at = eligible_at - Duration::seconds(idx as i64);
+            runtime
+                .upsert_thread(&metadata)
+                .await
+                .expect("upsert eligible thread");
+        }
+
+        let allowed_sources = vec!["cli".to_string()];
+        let first_claims = runtime
+            .claim_stage1_jobs_for_startup(
+                current_thread_id,
+                Stage1StartupClaimParams {
+                    scan_limit: 5_000,
+                    max_claimed: 64,
+                    max_age_days: 30,
+                    min_rollout_idle_hours: 12,
+                    allowed_sources: allowed_sources.as_slice(),
+                    lease_seconds: 3_600,
+                },
+            )
+            .await
+            .expect("first stage1 startup claim");
+        assert_eq!(first_claims.len(), 64);
+
+        for claim in first_claims {
+            assert!(
+                runtime
+                    .mark_stage1_job_succeeded(
+                        claim.thread.id,
+                        claim.ownership_token.as_str(),
+                        claim.thread.updated_at.timestamp(),
+                        "raw",
+                        "summary",
+                    )
+                    .await
+                    .expect("mark first-batch stage1 success"),
+                "first batch stage1 completion should succeed"
+            );
+        }
+
+        let second_claims = runtime
+            .claim_stage1_jobs_for_startup(
+                current_thread_id,
+                Stage1StartupClaimParams {
+                    scan_limit: 5_000,
+                    max_claimed: 64,
+                    max_age_days: 30,
+                    min_rollout_idle_hours: 12,
+                    allowed_sources: allowed_sources.as_slice(),
+                    lease_seconds: 3_600,
+                },
+            )
+            .await
+            .expect("second stage1 startup claim");
+        assert_eq!(second_claims.len(), 64);
+
+        let _ = tokio::fs::remove_dir_all(codex_home).await;
+    }
+
    #[tokio::test]
    async fn stage1_output_cascades_on_thread_delete() {
        let codex_home = unique_temp_dir();
@@ -1695,6 +1879,88 @@ WHERE kind = 'memory_stage1'
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }

+    #[tokio::test]
+    async fn stage1_retry_exhaustion_does_not_block_newer_watermark() {
+        let codex_home = unique_temp_dir();
+        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
+            .await
+            .expect("initialize runtime");
+
+        let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
+        let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
+        runtime
+            .upsert_thread(&test_thread_metadata(
+                &codex_home,
+                thread_id,
+                codex_home.join("workspace"),
+            ))
+            .await
+            .expect("upsert thread");
+
+        for attempt in 0..3 {
+            let claim = runtime
+                .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64)
+                .await
+                .expect("claim stage1 for retry exhaustion");
+            let ownership_token = match claim {
+                Stage1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
+                other => panic!(
+                    "attempt {} should claim stage1 before retries are exhausted: {other:?}",
+                    attempt + 1
+                ),
+            };
+            assert!(
+                runtime
+                    .mark_stage1_job_failed(thread_id, ownership_token.as_str(), "boom", 0)
+                    .await
+                    .expect("mark stage1 failed"),
+                "attempt {} should decrement retry budget",
+                attempt + 1
+            );
+        }
+
+        let exhausted_claim = runtime
+            .try_claim_stage1_job(thread_id, owner, 100, 3_600, 64)
+            .await
+            .expect("claim stage1 after retry exhaustion");
+        assert_eq!(
+            exhausted_claim,
+            Stage1JobClaimOutcome::SkippedRetryExhausted
+        );
+
+        let newer_source_claim = runtime
+            .try_claim_stage1_job(thread_id, owner, 101, 3_600, 64)
+            .await
+            .expect("claim stage1 with newer source watermark");
+        assert!(
+            matches!(newer_source_claim, Stage1JobClaimOutcome::Claimed { .. }),
+            "newer source watermark should reset retry budget and be claimable"
+        );
+
+        let job_row = sqlx::query(
+            "SELECT retry_remaining, input_watermark FROM jobs WHERE kind = ? AND job_key = ?",
+        )
+        .bind("memory_stage1")
+        .bind(thread_id.to_string())
+        .fetch_one(runtime.pool.as_ref())
+        .await
+        .expect("load stage1 job row after newer-source claim");
+        assert_eq!(
+            job_row
+                .try_get::<i64, _>("retry_remaining")
+                .expect("retry_remaining"),
+            3
+        );
+        assert_eq!(
+            job_row
+                .try_get::<i64, _>("input_watermark")
+                .expect("input_watermark"),
+            101
+        );
+
+        let _ = tokio::fs::remove_dir_all(codex_home).await;
+    }
+
    #[tokio::test]
    async fn phase2_global_consolidation_reruns_when_watermark_advances() {
        let codex_home = unique_temp_dir();
@@ -2078,6 +2344,65 @@ VALUES (?, ?, ?, ?, ?)
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }

+    #[tokio::test]
+    async fn phase2_backfilled_inputs_below_last_success_still_become_dirty() {
+        let codex_home = unique_temp_dir();
+        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
+            .await
+            .expect("initialize runtime");
+
+        runtime
+            .enqueue_global_consolidation(500)
+            .await
+            .expect("enqueue initial consolidation");
+        let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner a");
+        let claim_a = runtime
+            .try_claim_global_phase2_job(owner_a, 3_600)
+            .await
+            .expect("claim initial consolidation");
+        let token_a = match claim_a {
+            Phase2JobClaimOutcome::Claimed {
+                ownership_token,
+                input_watermark,
+            } => {
+                assert_eq!(input_watermark, 500);
+                ownership_token
+            }
+            other => panic!("unexpected initial phase2 claim outcome: {other:?}"),
+        };
+        assert!(
+            runtime
+                .mark_global_phase2_job_succeeded(token_a.as_str(), 500)
+                .await
+                .expect("mark initial phase2 success"),
+            "initial phase2 success should finalize"
+        );
+
+        runtime
+            .enqueue_global_consolidation(400)
+            .await
+            .expect("enqueue backfilled consolidation");
+
+        let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner b");
+        let claim_b = runtime
+            .try_claim_global_phase2_job(owner_b, 3_600)
+            .await
+            .expect("claim backfilled consolidation");
+        match claim_b {
+            Phase2JobClaimOutcome::Claimed {
+                input_watermark, ..
+            } => {
+                assert!(
+                    input_watermark > 500,
+                    "backfilled enqueue should advance dirty watermark beyond last success"
+                );
+            }
+            other => panic!("unexpected backfilled phase2 claim outcome: {other:?}"),
+        }
+
+        let _ = tokio::fs::remove_dir_all(codex_home).await;
+    }
+
    #[tokio::test]
    async fn phase2_failure_fallback_updates_unowned_running_job() {
        let codex_home = unique_temp_dir();