feat: use git-backed workspace diffs for memory consolidation (#18982)

## Why

This PR makes the `morpheus` agent (memory phase 2) use a git diff to
start its consolidation. The workflow is the following:
1. The agent acquires a lock
2. If `.codex/memories` does not exist or is not a git root, initialize
everything (and make a first empty commit)
3. Update `raw_memories.md` and `rollout_summaries/` as before:
we select at most N phase 1 memories based on a given policy
4. We use git (`gix`) to get a diff between the current state of
`.codex/memories` and the last commit.
5. Dump the diff in `phase2_workspace_diff.md`
6. Spawn `morpheus` and point it to `phase2_workspace_diff.md`
7. Wait for `morpheus` to be done
8. Re-create a new `.git` and make one single commit on it. We do this
because we don't want to preserve history through `.git` and this is
cheap anyway
9. We release the lock
On top of this, we keep the existing retry policies and related behavior.

The goals of this new workflow are:
* Better support for memory extensions such as `chronicle`
* Allow the user to manually edit memories and have those edits
considered by the phase 2 agent
 
As a follow-up, we will need to add support for user edits made while
`morpheus` is running.

## What Changed

- Added memory workspace helpers that prepare the git baseline, compute
the diff, write `phase2_workspace_diff.md`, and reset the baseline after
successful consolidation.
- Updated Phase 2 to sync current inputs into `raw_memories.md` and
`rollout_summaries/`, prune old extension resources, skip clean
workspaces, and run the consolidation subagent only when the workspace
has changes.
- Tightened Phase 2 job ownership around long-running consolidation with
heartbeats and an ownership check before resetting the baseline.
- Simplified the prompt and state APIs so DB watermarks are bookkeeping,
while workspace dirtiness decides whether consolidation work exists.
- Updated the memory pipeline README and tests for workspace diffs,
extension-resource cleanup, pollution-driven forgetting, selection
ranking, and baseline persistence.

## Verification

- Added/updated coverage in `core/src/memories/tests.rs`,
`core/src/memories/workspace_tests.rs`, `state/src/runtime/memories.rs`,
and `core/tests/suite/memories.rs`.

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
jif-oai
2026-04-27 14:32:44 +02:00
committed by GitHub
parent f8c527e529
commit 01ab25dbb5
21 changed files with 1079 additions and 1058 deletions

View File

@@ -2,6 +2,8 @@ use anyhow::Result;
use chrono::Duration as ChronoDuration;
use chrono::Utc;
use codex_features::Feature;
use codex_git_utils::diff_since_latest_init;
use codex_git_utils::reset_git_repository;
use codex_protocol::ThreadId;
use codex_protocol::protocol::EventMsg;
use codex_protocol::protocol::Op;
@@ -11,9 +13,7 @@ use core_test_support::responses::ResponsesRequest;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_response_created;
use core_test_support::responses::ev_web_search_call_done;
use core_test_support::responses::mount_sse_once;
use core_test_support::responses::mount_sse_sequence;
use core_test_support::responses::sse;
use core_test_support::responses::start_mock_server;
use core_test_support::test_codex::TestCodex;
@@ -27,13 +27,14 @@ use tokio::time::Duration;
use tokio::time::Instant;
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn memories_startup_phase2_tracks_added_and_removed_inputs_across_runs() -> Result<()> {
async fn memories_startup_phase2_tracks_workspace_diff_across_runs() -> Result<()> {
let server = start_mock_server().await;
let home = Arc::new(TempDir::new()?);
let db = init_state_db(&home).await?;
let memory_root = home.path().join("memories");
let now = Utc::now();
let thread_a = seed_stage1_output(
let _thread_a = seed_stage1_output(
db.as_ref(),
home.path(),
now - ChronoDuration::hours(2),
@@ -43,53 +44,21 @@ async fn memories_startup_phase2_tracks_added_and_removed_inputs_across_runs() -
)
.await?;
let first_phase2 = mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-phase2-1"),
ev_assistant_message("msg-phase2-1", "phase2 complete"),
ev_completed("resp-phase2-1"),
]),
let rollout_summaries_root = memory_root.join("rollout_summaries");
tokio::fs::create_dir_all(&rollout_summaries_root).await?;
tokio::fs::write(
memory_root.join("raw_memories.md"),
"# Raw Memories\n\nraw memory A\n",
)
.await;
.await?;
tokio::fs::write(
rollout_summaries_root.join("rollout-a.md"),
"git_branch: branch-rollout-a\n\nrollout summary A\n",
)
.await?;
reset_git_repository(&memory_root).await?;
let first = build_test_codex(&server, home.clone()).await?;
let first_request = wait_for_single_request(&first_phase2).await;
let first_prompt = phase2_prompt_text(&first_request);
assert!(
first_prompt.contains("- selected inputs this run: 1"),
"expected selected count in first prompt: {first_prompt}"
);
assert!(
first_prompt.contains("- newly added since the last successful Phase 2 run: 1"),
"expected added count in first prompt: {first_prompt}"
);
assert!(
first_prompt.contains("- removed from the last successful Phase 2 run: 0"),
"expected removed count in first prompt: {first_prompt}"
);
assert!(
first_prompt.contains(&format!("- [added] thread_id={thread_a},")),
"expected thread A to be marked added: {first_prompt}"
);
assert!(
first_prompt.contains("Removed from the last successful Phase 2 selection:\n- none"),
"expected no removed items in first prompt: {first_prompt}"
);
wait_for_phase2_success(db.as_ref(), thread_a).await?;
let memory_root = home.path().join("memories");
let raw_memories = tokio::fs::read_to_string(memory_root.join("raw_memories.md")).await?;
assert!(raw_memories.contains("raw memory A"));
assert!(!raw_memories.contains("raw memory B"));
let rollout_summaries = read_rollout_summary_bodies(&memory_root).await?;
assert_eq!(rollout_summaries.len(), 1);
assert!(rollout_summaries[0].contains("rollout summary A"));
assert!(rollout_summaries[0].contains("git_branch: branch-rollout-a"));
shutdown_test_codex(&first).await?;
let thread_b = seed_stage1_output(
let _thread_b = seed_stage1_output(
db.as_ref(),
home.path(),
now - ChronoDuration::hours(1),
@@ -99,46 +68,30 @@ async fn memories_startup_phase2_tracks_added_and_removed_inputs_across_runs() -
)
.await?;
let second_phase2 = mount_sse_once(
let phase2 = mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-phase2-2"),
ev_assistant_message("msg-phase2-2", "phase2 complete"),
ev_completed("resp-phase2-2"),
ev_response_created("resp-phase2"),
ev_assistant_message("msg-phase2", "phase2 complete"),
ev_completed("resp-phase2"),
]),
)
.await;
let second = build_test_codex(&server, home.clone()).await?;
let second_request = wait_for_single_request(&second_phase2).await;
let second_prompt = phase2_prompt_text(&second_request);
let codex = build_test_codex(&server, home.clone()).await?;
let request = wait_for_single_request(&phase2).await;
let prompt = phase2_prompt_text(&request);
assert!(
second_prompt.contains("- selected inputs this run: 1"),
"expected selected count in second prompt: {second_prompt}"
);
assert!(
second_prompt.contains("- newly added since the last successful Phase 2 run: 1"),
"expected added count in second prompt: {second_prompt}"
);
assert!(
second_prompt.contains("- removed from the last successful Phase 2 run: 1"),
"expected removed count in second prompt: {second_prompt}"
);
assert!(
second_prompt.contains(&format!("- [added] thread_id={thread_b},")),
"expected thread B to be marked added: {second_prompt}"
);
assert!(
second_prompt.contains(&format!("- thread_id={thread_a},")),
"expected thread A to be marked removed: {second_prompt}"
prompt.contains("phase2_workspace_diff.md"),
"expected workspace diff file in prompt: {prompt}"
);
wait_for_phase2_success(db.as_ref(), thread_b).await?;
wait_for_phase2_workspace_reset(&memory_root).await?;
let raw_memories = tokio::fs::read_to_string(memory_root.join("raw_memories.md")).await?;
assert!(raw_memories.contains("raw memory B"));
assert!(raw_memories.contains("raw memory A"));
assert!(!raw_memories.contains("raw memory A"));
let rollout_summaries = read_rollout_summary_bodies(&memory_root).await?;
assert_eq!(rollout_summaries.len(), 2);
assert_eq!(rollout_summaries.len(), 1);
assert!(
rollout_summaries
.iter()
@@ -152,20 +105,20 @@ async fn memories_startup_phase2_tracks_added_and_removed_inputs_across_runs() -
assert!(
rollout_summaries
.iter()
.any(|summary| summary.contains("rollout summary A"))
.all(|summary| !summary.contains("rollout summary A"))
);
shutdown_test_codex(&second).await?;
shutdown_test_codex(&codex).await?;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn memories_startup_phase2_prunes_old_extension_resources_and_reports_them() -> Result<()> {
async fn memories_startup_phase2_prunes_old_extension_resources() -> Result<()> {
let server = start_mock_server().await;
let home = Arc::new(TempDir::new()?);
let db = init_state_db(&home).await?;
let now = Utc::now();
let thread_id = seed_stage1_output(
let _thread_id = seed_stage1_output(
db.as_ref(),
home.path(),
now - ChronoDuration::hours(1),
@@ -175,11 +128,11 @@ async fn memories_startup_phase2_prunes_old_extension_resources_and_reports_them
)
.await?;
let chronicle_resources = home.path().join("memories_extensions/chronicle/resources");
let chronicle_resources = home.path().join("memories/extensions/chronicle/resources");
tokio::fs::create_dir_all(&chronicle_resources).await?;
tokio::fs::write(
home.path()
.join("memories_extensions/chronicle/instructions.md"),
.join("memories/extensions/chronicle/instructions.md"),
"instructions",
)
.await?;
@@ -210,23 +163,11 @@ async fn memories_startup_phase2_prunes_old_extension_resources_and_reports_them
let prompt = phase2_prompt_text(&request);
assert!(
prompt.contains("Memory extension resources removed by retention pruning:"),
"expected extension resource prune report in prompt: {prompt}"
);
assert!(
prompt.contains("- retention window: 7 days"),
"expected retention window in prompt: {prompt}"
);
assert!(
prompt.contains("- extension: chronicle"),
"expected extension name in prompt: {prompt}"
);
assert!(
prompt.contains(&format!(" - resources/{old_file_name}")),
"expected old resource in prompt: {prompt}"
prompt.contains("phase2_workspace_diff.md"),
"expected workspace diff file in prompt: {prompt}"
);
wait_for_phase2_success(db.as_ref(), thread_id).await?;
wait_for_phase2_workspace_reset(&home.path().join("memories")).await?;
wait_for_file_removed(&old_file).await?;
assert!(
!tokio::fs::try_exists(&old_file).await?,
@@ -242,8 +183,8 @@ async fn memories_startup_phase2_prunes_old_extension_resources_and_reports_them
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn memories_startup_phase2_processes_old_extension_resources_without_stage1_input()
-> Result<()> {
async fn memories_startup_phase2_prunes_old_extension_resources_without_stage1_input() -> Result<()>
{
let server = start_mock_server().await;
let home = Arc::new(TempDir::new()?);
let db = init_state_db(&home).await?;
@@ -251,11 +192,11 @@ async fn memories_startup_phase2_processes_old_extension_resources_without_stage
.await?;
let now = Utc::now();
let chronicle_resources = home.path().join("memories_extensions/chronicle/resources");
let chronicle_resources = home.path().join("memories/extensions/chronicle/resources");
tokio::fs::create_dir_all(&chronicle_resources).await?;
tokio::fs::write(
home.path()
.join("memories_extensions/chronicle/instructions.md"),
.join("memories/extensions/chronicle/instructions.md"),
"instructions",
)
.await?;
@@ -281,189 +222,16 @@ async fn memories_startup_phase2_processes_old_extension_resources_without_stage
let prompt = phase2_prompt_text(&request);
assert!(
prompt.contains("- selected inputs this run: 0"),
"expected no selected raw inputs in prompt: {prompt}"
);
assert!(
prompt.contains("Memory extension resources removed by retention pruning:"),
"expected extension resource prune report in prompt: {prompt}"
);
assert!(
prompt.contains(&format!(" - resources/{old_file_name}")),
"expected old resource in prompt: {prompt}"
prompt.contains("phase2_workspace_diff.md"),
"expected workspace diff file in prompt: {prompt}"
);
wait_for_file_removed(&old_file).await?;
wait_for_phase2_workspace_reset(&home.path().join("memories")).await?;
shutdown_test_codex(&codex).await?;
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn web_search_pollution_moves_selected_thread_into_removed_phase2_inputs() -> Result<()> {
let server = start_mock_server().await;
let home = Arc::new(TempDir::new()?);
let db = init_state_db(&home).await?;
let mut initial_builder = test_codex().with_home(home.clone()).with_config(|config| {
config
.features
.enable(Feature::Sqlite)
.expect("test config should allow feature update");
config
.features
.enable(Feature::MemoryTool)
.expect("test config should allow feature update");
config.memories.max_raw_memories_for_consolidation = 1;
config.memories.disable_on_external_context = true;
});
let initial = initial_builder.build(&server).await?;
mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-initial-1"),
ev_assistant_message("msg-initial-1", "initial turn complete"),
ev_completed("resp-initial-1"),
]),
)
.await;
initial.submit_turn("hello before memories").await?;
let rollout_path = initial
.session_configured
.rollout_path
.clone()
.expect("rollout path");
let thread_id = initial.session_configured.session_id;
let updated_at = {
let deadline = Instant::now() + Duration::from_secs(10);
loop {
if let Some(metadata) = db.get_thread(thread_id).await? {
break metadata.updated_at;
}
assert!(
Instant::now() < deadline,
"timed out waiting for thread metadata for {thread_id}"
);
tokio::time::sleep(Duration::from_millis(50)).await;
}
};
seed_stage1_output_for_existing_thread(
db.as_ref(),
thread_id,
updated_at.timestamp(),
"raw memory seeded for web search pollution",
"rollout summary seeded for web search pollution",
Some("pollution-rollout"),
)
.await?;
shutdown_test_codex(&initial).await?;
let responses = mount_sse_sequence(
&server,
vec![
sse(vec![
ev_response_created("resp-phase2-1"),
ev_assistant_message("msg-phase2-1", "phase2 complete"),
ev_completed("resp-phase2-1"),
]),
sse(vec![
ev_response_created("resp-web-1"),
ev_web_search_call_done("ws-1", "completed", "weather seattle"),
ev_completed("resp-web-1"),
]),
],
)
.await;
let mut resumed_builder = test_codex().with_home(home.clone()).with_config(|config| {
config
.features
.enable(Feature::Sqlite)
.expect("test config should allow feature update");
config
.features
.enable(Feature::MemoryTool)
.expect("test config should allow feature update");
config.memories.max_raw_memories_for_consolidation = 1;
config.memories.disable_on_external_context = true;
});
let resumed = resumed_builder
.resume(&server, home.clone(), rollout_path.clone())
.await?;
let first_phase2_request = wait_for_request(&responses, /*expected_count*/ 1)
.await
.remove(0);
let first_phase2_prompt = phase2_prompt_text(&first_phase2_request);
assert!(
first_phase2_prompt.contains("- selected inputs this run: 1"),
"expected seeded thread to be selected before pollution: {first_phase2_prompt}"
);
assert!(
first_phase2_prompt.contains("- newly added since the last successful Phase 2 run: 1"),
"expected seeded thread to be added before pollution: {first_phase2_prompt}"
);
assert!(
first_phase2_prompt.contains(&format!("- [added] thread_id={thread_id},")),
"expected selected thread in first phase2 prompt: {first_phase2_prompt}"
);
wait_for_phase2_success(db.as_ref(), thread_id).await?;
resumed
.submit_turn("search the web for weather seattle")
.await?;
assert_eq!(
{
let deadline = Instant::now() + Duration::from_secs(10);
loop {
let memory_mode = db.get_thread_memory_mode(thread_id).await?;
if memory_mode.as_deref() == Some("polluted") {
break memory_mode;
}
assert!(
Instant::now() < deadline,
"timed out waiting for polluted memory mode for {thread_id}"
);
tokio::time::sleep(Duration::from_millis(50)).await;
}
}
.as_deref(),
Some("polluted")
);
let selection = {
let deadline = Instant::now() + Duration::from_secs(10);
loop {
let selection = db
.get_phase2_input_selection(/*n*/ 1, /*max_unused_days*/ 30)
.await?;
if selection.selected.is_empty()
&& selection.retained_thread_ids.is_empty()
&& selection.removed.len() == 1
&& selection.removed[0].thread_id == thread_id
{
break selection;
}
assert!(
Instant::now() < deadline,
"timed out waiting for polluted thread to move into removed phase2 inputs: \
{selection:?}"
);
tokio::time::sleep(Duration::from_millis(50)).await;
}
};
assert_eq!(responses.requests().len(), 2);
assert!(selection.selected.is_empty());
assert_eq!(selection.retained_thread_ids, Vec::<ThreadId>::new());
assert_eq!(selection.removed.len(), 1);
assert_eq!(selection.removed[0].thread_id, thread_id);
shutdown_test_codex(&resumed).await?;
Ok(())
}
async fn build_test_codex(server: &wiremock::MockServer, home: Arc<TempDir>) -> Result<TestCodex> {
#[allow(clippy::expect_used)]
let mut builder = test_codex().with_home(home).with_config(|config| {
@@ -560,30 +328,22 @@ fn phase2_prompt_text(request: &ResponsesRequest) -> String {
request
.message_input_texts("user")
.into_iter()
.find(|text| text.contains("Current selected Phase 1 inputs:"))
.find(|text| text.contains("Memory workspace diff:"))
.expect("phase2 prompt text")
}
async fn wait_for_phase2_success(
db: &codex_state::StateRuntime,
expected_thread_id: ThreadId,
) -> Result<()> {
async fn wait_for_phase2_workspace_reset(memory_root: &Path) -> Result<()> {
wait_for_file_removed(&memory_root.join("phase2_workspace_diff.md")).await?;
let deadline = Instant::now() + Duration::from_secs(10);
loop {
let selection = db
.get_phase2_input_selection(/*n*/ 1, /*max_unused_days*/ 30)
.await?;
if selection.selected.len() == 1
&& selection.selected[0].thread_id == expected_thread_id
&& selection.retained_thread_ids == vec![expected_thread_id]
&& selection.removed.is_empty()
if let Ok(diff) = diff_since_latest_init(memory_root).await
&& !diff.has_changes()
{
return Ok(());
}
assert!(
Instant::now() < deadline,
"timed out waiting for phase2 success for {expected_thread_id}"
"timed out waiting for clean memory workspace baseline"
);
tokio::time::sleep(Duration::from_millis(50)).await;
}