feat: align memory phase 1 and make it stronger (#11300)

## Align with the new phase-1 design

Basically, we now run phase 1 in parallel, considering:
* Max 64 rollouts
* Max 1 month old
* Consider the most recent first

This PR also adds stronger parallelization capabilities: stale-job
detection, retry policies, and ownership of computation to prevent
duplicate work.
This commit is contained in:
This commit is contained in:
jif-oai
2026-02-10 13:42:09 +00:00
committed by GitHub
parent 223fadc760
commit 1d5eba0090
10 changed files with 1553 additions and 259 deletions

View File

@@ -36,10 +36,13 @@ use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tracing::warn;
use uuid::Uuid;
pub const STATE_DB_FILENAME: &str = "state";
pub const STATE_DB_VERSION: u32 = 4;
const MEMORY_SCOPE_KIND_CWD: &str = "cwd";
const METRIC_DB_INIT: &str = "codex.db.init";
#[derive(Clone)]
@@ -49,6 +52,14 @@ pub struct StateRuntime {
pool: Arc<sqlx::SqlitePool>,
}
/// Outcome of [`StateRuntime::try_claim_phase1_job`] for a `(thread, scope)` pair.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Phase1JobClaimOutcome {
    /// The caller won the claim. `ownership_token` must be presented to all
    /// subsequent owner-guarded operations (finalize, lease renewal, guarded
    /// memory upsert); a stolen job invalidates the previous token.
    Claimed { ownership_token: String },
    /// A prior run failed; failed jobs are terminal and never retried.
    SkippedTerminalFailure,
    /// A prior run succeeded against the same or newer source data.
    SkippedUpToDate,
    /// Another session holds a fresh (non-stale) running claim, or the
    /// caller lost the compare-and-swap race while trying to steal one.
    SkippedRunning,
}
impl StateRuntime {
/// Initialize the state runtime using the provided Codex home and default provider.
///
@@ -237,9 +248,21 @@ ORDER BY position ASC
) -> anyhow::Result<Option<ThreadMemory>> {
let row = sqlx::query(
r#"
SELECT thread_id, trace_summary AS raw_memory, memory_summary, updated_at
SELECT
thread_id,
scope_kind,
scope_key,
raw_memory,
memory_summary,
updated_at,
last_used_at,
used_count,
invalidated_at,
invalid_reason
FROM thread_memory
WHERE thread_id = ?
ORDER BY updated_at DESC, scope_kind DESC, scope_key DESC
LIMIT 1
"#,
)
.bind(thread_id.to_string())
@@ -506,7 +529,7 @@ ON CONFLICT(id) DO UPDATE SET
Ok(())
}
/// Insert or update memory summaries for a thread.
/// Insert or update memory summaries for a thread in the cwd scope.
///
/// This method always advances `updated_at`, even if summaries are unchanged.
pub async fn upsert_thread_memory(
@@ -514,6 +537,29 @@ ON CONFLICT(id) DO UPDATE SET
thread_id: ThreadId,
raw_memory: &str,
memory_summary: &str,
) -> anyhow::Result<ThreadMemory> {
let Some(thread) = self.get_thread(thread_id).await? else {
return Err(anyhow::anyhow!("thread not found: {thread_id}"));
};
let scope_key = thread.cwd.display().to_string();
self.upsert_thread_memory_for_scope(
thread_id,
MEMORY_SCOPE_KIND_CWD,
scope_key.as_str(),
raw_memory,
memory_summary,
)
.await
}
/// Insert or update memory summaries for a thread in an explicit scope.
pub async fn upsert_thread_memory_for_scope(
&self,
thread_id: ThreadId,
scope_kind: &str,
scope_key: &str,
raw_memory: &str,
memory_summary: &str,
) -> anyhow::Result<ThreadMemory> {
if self.get_thread(thread_id).await?.is_none() {
return Err(anyhow::anyhow!("thread not found: {thread_id}"));
@@ -524,12 +570,14 @@ ON CONFLICT(id) DO UPDATE SET
r#"
INSERT INTO thread_memory (
thread_id,
trace_summary,
scope_kind,
scope_key,
raw_memory,
memory_summary,
updated_at
) VALUES (?, ?, ?, ?)
ON CONFLICT(thread_id) DO UPDATE SET
trace_summary = excluded.trace_summary,
) VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(thread_id, scope_kind, scope_key) DO UPDATE SET
raw_memory = excluded.raw_memory,
memory_summary = excluded.memory_summary,
updated_at = CASE
WHEN excluded.updated_at <= thread_memory.updated_at THEN thread_memory.updated_at + 1
@@ -538,22 +586,149 @@ ON CONFLICT(thread_id) DO UPDATE SET
"#,
)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(raw_memory)
.bind(memory_summary)
.bind(updated_at)
.execute(self.pool.as_ref())
.await?;
self.get_thread_memory(thread_id)
.await?
let row = sqlx::query(
r#"
SELECT
thread_id,
scope_kind,
scope_key,
raw_memory,
memory_summary,
updated_at,
last_used_at,
used_count,
invalidated_at,
invalid_reason
FROM thread_memory
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
"#,
)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.fetch_optional(self.pool.as_ref())
.await?;
row.map(|row| ThreadMemoryRow::try_from_row(&row).and_then(ThreadMemory::try_from))
.transpose()?
.ok_or_else(|| anyhow::anyhow!("failed to load upserted thread memory: {thread_id}"))
}
/// Insert or update memory summaries for a thread/scope only if the caller
/// still owns the corresponding phase-1 running job.
pub async fn upsert_thread_memory_for_scope_if_phase1_owner(
&self,
thread_id: ThreadId,
scope_kind: &str,
scope_key: &str,
ownership_token: &str,
raw_memory: &str,
memory_summary: &str,
) -> anyhow::Result<Option<ThreadMemory>> {
if self.get_thread(thread_id).await?.is_none() {
return Err(anyhow::anyhow!("thread not found: {thread_id}"));
}
let updated_at = Utc::now().timestamp();
let rows_affected = sqlx::query(
r#"
INSERT INTO thread_memory (
thread_id,
scope_kind,
scope_key,
raw_memory,
memory_summary,
updated_at
)
SELECT ?, ?, ?, ?, ?, ?
WHERE EXISTS (
SELECT 1
FROM memory_phase1_jobs
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
AND status = 'running' AND ownership_token = ?
)
ON CONFLICT(thread_id, scope_kind, scope_key) DO UPDATE SET
raw_memory = excluded.raw_memory,
memory_summary = excluded.memory_summary,
updated_at = CASE
WHEN excluded.updated_at <= thread_memory.updated_at THEN thread_memory.updated_at + 1
ELSE excluded.updated_at
END
"#,
)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(raw_memory)
.bind(memory_summary)
.bind(updated_at)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(ownership_token)
.execute(self.pool.as_ref())
.await?
.rows_affected();
if rows_affected == 0 {
return Ok(None);
}
let row = sqlx::query(
r#"
SELECT
thread_id,
scope_kind,
scope_key,
raw_memory,
memory_summary,
updated_at,
last_used_at,
used_count,
invalidated_at,
invalid_reason
FROM thread_memory
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
"#,
)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.fetch_optional(self.pool.as_ref())
.await?;
row.map(|row| ThreadMemoryRow::try_from_row(&row).and_then(ThreadMemory::try_from))
.transpose()
}
/// Get the last `n` memories for threads with an exact cwd match.
pub async fn get_last_n_thread_memories_for_cwd(
&self,
cwd: &Path,
n: usize,
) -> anyhow::Result<Vec<ThreadMemory>> {
self.get_last_n_thread_memories_for_scope(
MEMORY_SCOPE_KIND_CWD,
&cwd.display().to_string(),
n,
)
.await
}
/// Get the last `n` memories for a specific memory scope.
pub async fn get_last_n_thread_memories_for_scope(
&self,
scope_kind: &str,
scope_key: &str,
n: usize,
) -> anyhow::Result<Vec<ThreadMemory>> {
if n == 0 {
return Ok(Vec::new());
@@ -562,18 +737,24 @@ ON CONFLICT(thread_id) DO UPDATE SET
let rows = sqlx::query(
r#"
SELECT
m.thread_id,
m.trace_summary AS raw_memory,
m.memory_summary,
m.updated_at
FROM thread_memory AS m
INNER JOIN threads AS t ON t.id = m.thread_id
WHERE t.cwd = ?
ORDER BY m.updated_at DESC, m.thread_id DESC
thread_id,
scope_kind,
scope_key,
raw_memory,
memory_summary,
updated_at,
last_used_at,
used_count,
invalidated_at,
invalid_reason
FROM thread_memory
WHERE scope_kind = ? AND scope_key = ? AND invalidated_at IS NULL
ORDER BY updated_at DESC, thread_id DESC
LIMIT ?
"#,
)
.bind(cwd.display().to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(n as i64)
.fetch_all(self.pool.as_ref())
.await?;
@@ -583,6 +764,282 @@ LIMIT ?
.collect()
}
    /// Try to claim a phase-1 memory extraction job for `(thread, scope)`.
    ///
    /// Decision table, evaluated inside one transaction:
    /// * no job row yet -> insert a fresh `running` row and claim it;
    /// * `failed` -> terminal, never retried;
    /// * `succeeded` with source data at least as new as ours -> up to date;
    /// * `running` with a lease younger than `lease_seconds` -> owned elsewhere;
    /// * otherwise (stale running job, or outdated success) -> steal it via a
    ///   compare-and-swap UPDATE keyed on the snapshot read below.
    ///
    /// On `Claimed`, the returned `ownership_token` proves ownership to the
    /// finalize/renew/guarded-upsert APIs.
    pub async fn try_claim_phase1_job(
        &self,
        thread_id: ThreadId,
        scope_kind: &str,
        scope_key: &str,
        owner_session_id: ThreadId,
        source_updated_at: i64,
        lease_seconds: i64,
    ) -> anyhow::Result<Phase1JobClaimOutcome> {
        let now = Utc::now().timestamp();
        // A running job whose `started_at` is at or before this cutoff is
        // stale and may be stolen; `max(0)` guards against a negative lease.
        let stale_cutoff = now.saturating_sub(lease_seconds.max(0));
        // Fresh token per attempt; it is only persisted if we win the claim.
        let ownership_token = Uuid::new_v4().to_string();
        let thread_id = thread_id.to_string();
        let owner_session_id = owner_session_id.to_string();
        let mut tx = self.pool.begin().await?;
        // Snapshot the current job row (if any); the steal path below only
        // applies when the row still matches this snapshot.
        let existing = sqlx::query(
            r#"
SELECT status, source_updated_at, started_at
FROM memory_phase1_jobs
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
"#,
        )
        .bind(thread_id.as_str())
        .bind(scope_kind)
        .bind(scope_key)
        .fetch_optional(&mut *tx)
        .await?;
        let Some(existing) = existing else {
            // No prior job: create it directly in the `running` state.
            sqlx::query(
                r#"
INSERT INTO memory_phase1_jobs (
thread_id,
scope_kind,
scope_key,
status,
owner_session_id,
started_at,
finished_at,
failure_reason,
source_updated_at,
raw_memory_path,
summary_hash,
ownership_token
) VALUES (?, ?, ?, 'running', ?, ?, NULL, NULL, ?, NULL, NULL, ?)
"#,
            )
            .bind(thread_id.as_str())
            .bind(scope_kind)
            .bind(scope_key)
            .bind(owner_session_id.as_str())
            .bind(now)
            .bind(source_updated_at)
            .bind(ownership_token.as_str())
            .execute(&mut *tx)
            .await?;
            tx.commit().await?;
            return Ok(Phase1JobClaimOutcome::Claimed { ownership_token });
        };
        let status: String = existing.try_get("status")?;
        let existing_source_updated_at: i64 = existing.try_get("source_updated_at")?;
        let existing_started_at: Option<i64> = existing.try_get("started_at")?;
        // Failures are terminal for this (thread, scope): never retried.
        if status == "failed" {
            tx.commit().await?;
            return Ok(Phase1JobClaimOutcome::SkippedTerminalFailure);
        }
        // Already computed from the same or newer source data: nothing to do.
        if status == "succeeded" && existing_source_updated_at >= source_updated_at {
            tx.commit().await?;
            return Ok(Phase1JobClaimOutcome::SkippedUpToDate);
        }
        // A running job inside its lease window is owned by another session.
        if status == "running" && existing_started_at.is_some_and(|started| started > stale_cutoff)
        {
            tx.commit().await?;
            return Ok(Phase1JobClaimOutcome::SkippedRunning);
        }
        // Steal path (stale running job or outdated success): compare-and-swap
        // on the snapshot (status, source_updated_at, started_at) so two
        // concurrent stealers cannot both win. Two SQL variants are needed
        // because a NULL `started_at` cannot be matched with `=`.
        let rows_affected = if let Some(existing_started_at) = existing_started_at {
            sqlx::query(
                r#"
UPDATE memory_phase1_jobs
SET
status = 'running',
owner_session_id = ?,
started_at = ?,
finished_at = NULL,
failure_reason = NULL,
source_updated_at = ?,
raw_memory_path = NULL,
summary_hash = NULL,
ownership_token = ?
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
AND status = ? AND source_updated_at = ? AND started_at = ?
"#,
            )
            .bind(owner_session_id.as_str())
            .bind(now)
            .bind(source_updated_at)
            .bind(ownership_token.as_str())
            .bind(thread_id.as_str())
            .bind(scope_kind)
            .bind(scope_key)
            .bind(status.as_str())
            .bind(existing_source_updated_at)
            .bind(existing_started_at)
            .execute(&mut *tx)
            .await?
            .rows_affected()
        } else {
            sqlx::query(
                r#"
UPDATE memory_phase1_jobs
SET
status = 'running',
owner_session_id = ?,
started_at = ?,
finished_at = NULL,
failure_reason = NULL,
source_updated_at = ?,
raw_memory_path = NULL,
summary_hash = NULL,
ownership_token = ?
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
AND status = ? AND source_updated_at = ? AND started_at IS NULL
"#,
            )
            .bind(owner_session_id.as_str())
            .bind(now)
            .bind(source_updated_at)
            .bind(ownership_token.as_str())
            .bind(thread_id.as_str())
            .bind(scope_kind)
            .bind(scope_key)
            .bind(status.as_str())
            .bind(existing_source_updated_at)
            .execute(&mut *tx)
            .await?
            .rows_affected()
        };
        tx.commit().await?;
        if rows_affected == 0 {
            // Lost the CAS race: the row changed since our snapshot, so
            // someone else is (or just finished) running this job.
            Ok(Phase1JobClaimOutcome::SkippedRunning)
        } else {
            Ok(Phase1JobClaimOutcome::Claimed { ownership_token })
        }
    }
/// Finalize a claimed phase-1 job as succeeded.
pub async fn mark_phase1_job_succeeded(
&self,
thread_id: ThreadId,
scope_kind: &str,
scope_key: &str,
ownership_token: &str,
raw_memory_path: &str,
summary_hash: &str,
) -> anyhow::Result<bool> {
let now = Utc::now().timestamp();
let rows_affected = sqlx::query(
r#"
UPDATE memory_phase1_jobs
SET
status = 'succeeded',
finished_at = ?,
failure_reason = NULL,
raw_memory_path = ?,
summary_hash = ?
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
AND status = 'running' AND ownership_token = ?
"#,
)
.bind(now)
.bind(raw_memory_path)
.bind(summary_hash)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(ownership_token)
.execute(self.pool.as_ref())
.await?
.rows_affected();
Ok(rows_affected > 0)
}
/// Finalize a claimed phase-1 job as failed.
pub async fn mark_phase1_job_failed(
&self,
thread_id: ThreadId,
scope_kind: &str,
scope_key: &str,
ownership_token: &str,
failure_reason: &str,
) -> anyhow::Result<bool> {
let now = Utc::now().timestamp();
let rows_affected = sqlx::query(
r#"
UPDATE memory_phase1_jobs
SET
status = 'failed',
finished_at = ?,
failure_reason = ?
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
AND status = 'running' AND ownership_token = ?
"#,
)
.bind(now)
.bind(failure_reason)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(ownership_token)
.execute(self.pool.as_ref())
.await?
.rows_affected();
Ok(rows_affected > 0)
}
/// Refresh lease timestamp for a claimed phase-1 job.
///
/// Returns `true` only when the current owner token still matches.
pub async fn renew_phase1_job_lease(
&self,
thread_id: ThreadId,
scope_kind: &str,
scope_key: &str,
ownership_token: &str,
) -> anyhow::Result<bool> {
let now = Utc::now().timestamp();
let rows_affected = sqlx::query(
r#"
UPDATE memory_phase1_jobs
SET started_at = ?
WHERE thread_id = ? AND scope_kind = ? AND scope_key = ?
AND status = 'running' AND ownership_token = ?
"#,
)
.bind(now)
.bind(thread_id.to_string())
.bind(scope_kind)
.bind(scope_key)
.bind(ownership_token)
.execute(self.pool.as_ref())
.await?
.rows_affected();
Ok(rows_affected > 0)
}
/// Mark a memory scope as dirty/clean for phase-2 consolidation scheduling.
pub async fn mark_memory_scope_dirty(
&self,
scope_kind: &str,
scope_key: &str,
dirty: bool,
) -> anyhow::Result<()> {
let now = Utc::now().timestamp();
sqlx::query(
r#"
INSERT INTO memory_scope_dirty (scope_kind, scope_key, dirty, updated_at)
VALUES (?, ?, ?, ?)
ON CONFLICT(scope_kind, scope_key) DO UPDATE SET
dirty = excluded.dirty,
updated_at = excluded.updated_at
"#,
)
.bind(scope_kind)
.bind(scope_key)
.bind(dirty)
.bind(now)
.execute(self.pool.as_ref())
.await?;
Ok(())
}
/// Try to acquire or renew the per-cwd memory consolidation lock.
///
/// Returns `true` when the lock is acquired/renewed for `working_thread_id`.
@@ -1020,6 +1477,7 @@ fn push_thread_order_and_limit(
#[cfg(test)]
mod tests {
use super::Phase1JobClaimOutcome;
use super::STATE_DB_FILENAME;
use super::STATE_DB_VERSION;
use super::StateRuntime;
@@ -1471,6 +1929,272 @@ mod tests {
let _ = tokio::fs::remove_dir_all(codex_home).await;
}
    // Finalizing a phase-1 job must require the exact token handed out at
    // claim time; any other token is a silent no-op (returns false).
    #[tokio::test]
    async fn phase1_job_claim_and_success_require_current_owner_token() {
        // Fresh runtime against a throwaway Codex home directory.
        let codex_home = unique_temp_dir();
        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
            .await
            .expect("initialize runtime");
        let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
        let owner = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let cwd = codex_home.join("workspace");
        runtime
            .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd))
            .await
            .expect("upsert thread");
        // First claim on an empty jobs table must succeed and issue a token.
        let claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner, 100, 3600)
            .await
            .expect("claim phase1 job");
        let ownership_token = match claim {
            Phase1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
            other => panic!("unexpected claim outcome: {other:?}"),
        };
        // A token that was never issued must not finalize the job.
        assert!(
            !runtime
                .mark_phase1_job_succeeded(
                    thread_id,
                    "cwd",
                    "scope",
                    "wrong-token",
                    "/tmp/path",
                    "summary-hash"
                )
                .await
                .expect("mark succeeded wrong token should fail"),
            "wrong token should not finalize the job"
        );
        // The token from the claim is the only one allowed to finalize.
        assert!(
            runtime
                .mark_phase1_job_succeeded(
                    thread_id,
                    "cwd",
                    "scope",
                    ownership_token.as_str(),
                    "/tmp/path",
                    "summary-hash"
                )
                .await
                .expect("mark succeeded with current token"),
            "current token should finalize the job"
        );
        // Best-effort cleanup of the temporary home directory.
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }
    // A running job inside its lease window blocks other sessions, while an
    // expired lease makes the same job stealable.
    #[tokio::test]
    async fn phase1_job_running_stale_can_be_stolen_but_fresh_running_is_skipped() {
        let codex_home = unique_temp_dir();
        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
            .await
            .expect("initialize runtime");
        let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
        let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let cwd = codex_home.join("workspace");
        runtime
            .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd))
            .await
            .expect("upsert thread");
        // Owner A claims first with a 1-hour lease.
        let first_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_a, 100, 3600)
            .await
            .expect("first claim");
        assert!(
            matches!(first_claim, Phase1JobClaimOutcome::Claimed { .. }),
            "first claim should acquire"
        );
        // Within the lease window, a second session must back off.
        let fresh_second_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_b, 100, 3600)
            .await
            .expect("fresh second claim");
        assert_eq!(fresh_second_claim, Phase1JobClaimOutcome::SkippedRunning);
        // A zero-second lease makes A's claim immediately stale, so B steals it.
        let stale_second_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_b, 100, 0)
            .await
            .expect("stale second claim");
        assert!(
            matches!(stale_second_claim, Phase1JobClaimOutcome::Claimed { .. }),
            "stale running job should be stealable"
        );
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }
    // After a steal, the previous owner's token is dead: it can no longer
    // renew the lease, while the new owner's token can.
    #[tokio::test]
    async fn phase1_job_lease_renewal_requires_current_owner_token() {
        let codex_home = unique_temp_dir();
        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
            .await
            .expect("initialize runtime");
        let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
        let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let cwd = codex_home.join("workspace");
        runtime
            .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd))
            .await
            .expect("upsert thread");
        // Owner A claims the job and receives a token.
        let first_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_a, 100, 3600)
            .await
            .expect("first claim");
        let owner_a_token = match first_claim {
            Phase1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
            other => panic!("unexpected claim outcome: {other:?}"),
        };
        // Owner B steals the job by claiming with a zero-second lease, which
        // renders A's claim stale immediately.
        let stolen_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_b, 100, 0)
            .await
            .expect("stolen claim");
        let owner_b_token = match stolen_claim {
            Phase1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
            other => panic!("unexpected claim outcome: {other:?}"),
        };
        assert!(
            !runtime
                .renew_phase1_job_lease(thread_id, "cwd", "scope", owner_a_token.as_str())
                .await
                .expect("old owner lease renewal should fail"),
            "stale owner token should not renew lease"
        );
        assert!(
            runtime
                .renew_phase1_job_lease(thread_id, "cwd", "scope", owner_b_token.as_str())
                .await
                .expect("current owner lease renewal should succeed"),
            "current owner token should renew lease"
        );
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }
    // The owner-guarded memory upsert must silently drop writes from an
    // owner whose claim was stolen, and accept writes from the current owner.
    #[tokio::test]
    async fn phase1_owner_guarded_upsert_rejects_stale_owner() {
        let codex_home = unique_temp_dir();
        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
            .await
            .expect("initialize runtime");
        let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
        let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let cwd = codex_home.join("workspace");
        runtime
            .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd))
            .await
            .expect("upsert thread");
        // A claims first ...
        let first_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_a, 100, 3600)
            .await
            .expect("first claim");
        let owner_a_token = match first_claim {
            Phase1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
            other => panic!("unexpected claim outcome: {other:?}"),
        };
        // ... then B steals via a zero-second lease, invalidating A's token.
        let stolen_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_b, 100, 0)
            .await
            .expect("stolen claim");
        let owner_b_token = match stolen_claim {
            Phase1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
            other => panic!("unexpected claim outcome: {other:?}"),
        };
        // A's write is rejected (None) — no memory row is touched.
        let stale_upsert = runtime
            .upsert_thread_memory_for_scope_if_phase1_owner(
                thread_id,
                "cwd",
                "scope",
                owner_a_token.as_str(),
                "stale raw memory",
                "stale summary",
            )
            .await
            .expect("stale owner upsert");
        assert!(
            stale_upsert.is_none(),
            "stale owner token should not upsert thread memory"
        );
        // B's write lands and the stored values are returned.
        let current_upsert = runtime
            .upsert_thread_memory_for_scope_if_phase1_owner(
                thread_id,
                "cwd",
                "scope",
                owner_b_token.as_str(),
                "fresh raw memory",
                "fresh summary",
            )
            .await
            .expect("current owner upsert");
        let current_upsert = current_upsert.expect("current owner should upsert");
        assert_eq!(current_upsert.raw_memory, "fresh raw memory");
        assert_eq!(current_upsert.memory_summary, "fresh summary");
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }
    // A failed phase-1 job is terminal: even a later claim with newer source
    // data must be refused with SkippedTerminalFailure.
    #[tokio::test]
    async fn phase1_job_failed_is_terminal() {
        let codex_home = unique_temp_dir();
        let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
            .await
            .expect("initialize runtime");
        let thread_id = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("thread id");
        let owner_a = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let owner_b = ThreadId::from_string(&Uuid::new_v4().to_string()).expect("owner id");
        let cwd = codex_home.join("workspace");
        runtime
            .upsert_thread(&test_thread_metadata(&codex_home, thread_id, cwd))
            .await
            .expect("upsert thread");
        // A claims and then fails the job with its own token.
        let claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_a, 100, 3600)
            .await
            .expect("claim");
        let ownership_token = match claim {
            Phase1JobClaimOutcome::Claimed { ownership_token } => ownership_token,
            other => panic!("unexpected claim outcome: {other:?}"),
        };
        assert!(
            runtime
                .mark_phase1_job_failed(
                    thread_id,
                    "cwd",
                    "scope",
                    ownership_token.as_str(),
                    "prompt failed"
                )
                .await
                .expect("mark failed"),
            "owner token should be able to fail job"
        );
        // B retries with newer source data (101 > 100) but is still refused.
        let second_claim = runtime
            .try_claim_phase1_job(thread_id, "cwd", "scope", owner_b, 101, 3600)
            .await
            .expect("second claim");
        assert_eq!(second_claim, Phase1JobClaimOutcome::SkippedTerminalFailure);
        let _ = tokio::fs::remove_dir_all(codex_home).await;
    }
#[tokio::test]
async fn deleting_thread_cascades_thread_memory() {
let codex_home = unique_temp_dir();