feat: warning large commits (#6838)

This commit is contained in:
jif-oai
2025-11-19 10:22:10 +00:00
committed by GitHub
parent 73488657cb
commit 956d3bfac6
3 changed files with 257 additions and 1 deletions

View File

@@ -1,10 +1,14 @@
use crate::codex::TurnContext;
use crate::protocol::EventMsg;
use crate::protocol::WarningEvent;
use crate::state::TaskKind;
use crate::tasks::SessionTask;
use crate::tasks::SessionTaskContext;
use async_trait::async_trait;
use codex_git::CreateGhostCommitOptions;
use codex_git::GhostSnapshotReport;
use codex_git::GitToolingError;
use codex_git::capture_ghost_snapshot_report;
use codex_git::create_ghost_commit;
use codex_protocol::models::ResponseItem;
use codex_protocol::user_input::UserInput;
@@ -39,6 +43,27 @@ impl SessionTask for GhostSnapshotTask {
_ = cancellation_token.cancelled() => true,
_ = async {
let repo_path = ctx_for_task.cwd.clone();
// First, compute a snapshot report so we can warn about
// large untracked directories before running the heavier
// snapshot logic.
if let Ok(Ok(report)) = tokio::task::spawn_blocking({
let repo_path = repo_path.clone();
move || {
let options = CreateGhostCommitOptions::new(&repo_path);
capture_ghost_snapshot_report(&options)
}
})
.await
&& let Some(message) = format_large_untracked_warning(&report) {
session
.session
.send_event(
&ctx_for_task,
EventMsg::Warning(WarningEvent { message }),
)
.await;
}
// Required to run in a dedicated blocking pool.
match tokio::task::spawn_blocking(move || {
let options = CreateGhostCommitOptions::new(&repo_path);
@@ -103,3 +128,22 @@ impl GhostSnapshotTask {
Self { token }
}
}
/// Builds the user-facing warning text for a snapshot report.
///
/// Returns `None` when the report lists no large untracked directories. Otherwise the message
/// names at most `MAX_DIRS` directories with their file counts, followed by an "N more" entry
/// when additional directories were left out.
fn format_large_untracked_warning(report: &GhostSnapshotReport) -> Option<String> {
    const MAX_DIRS: usize = 3;

    let dirs = &report.large_untracked_dirs;
    if dirs.is_empty() {
        return None;
    }

    // Directories beyond the first `MAX_DIRS` are only summarized by their count.
    let remaining = dirs.len().saturating_sub(MAX_DIRS);
    let overflow = (remaining > 0).then(|| format!("{remaining} more"));

    let parts: Vec<String> = dirs
        .iter()
        .take(MAX_DIRS)
        .map(|dir| format!("{} ({} files)", dir.path.display(), dir.file_count))
        .chain(overflow)
        .collect();

    Some(format!(
        "Repository snapshot encountered large untracked directories: {}. This can slow Codex; consider adding these paths to .gitignore or disabling undo in your config.",
        parts.join(", ")
    ))
}

View File

@@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::ffi::OsString;
use std::fs;
@@ -21,6 +22,8 @@ use crate::operations::run_git_for_stdout_all;
/// Default commit message used for ghost commits when none is provided.
const DEFAULT_COMMIT_MESSAGE: &str = "codex snapshot";
/// Minimum number of untracked files attributed to a single directory for that directory to be
/// reported as large. The comparison is inclusive: a directory with exactly this many files is
/// reported.
const LARGE_UNTRACKED_WARNING_THRESHOLD: usize = 200;
/// Options to control ghost commit creation.
pub struct CreateGhostCommitOptions<'a> {
@@ -29,6 +32,19 @@ pub struct CreateGhostCommitOptions<'a> {
pub force_include: Vec<PathBuf>,
}
/// Summary produced alongside a ghost snapshot.
///
/// Returned by `capture_ghost_snapshot_report` (report only) and by
/// `create_ghost_commit_with_report` (together with the created commit).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct GhostSnapshotReport {
/// Directories found to hold a large number of untracked files. When filled in by this
/// module the entries are ordered by descending file count, then by path.
pub large_untracked_dirs: Vec<LargeUntrackedDir>,
}
/// Directory containing a large amount of untracked content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LargeUntrackedDir {
/// Path of the directory. When produced by this module the repository sub-directory prefix
/// has been stripped where possible — presumably making it relative to the session's working
/// directory; confirm against `repo_subdir`.
pub path: PathBuf,
/// Number of untracked files that were attributed to this directory.
pub file_count: usize,
}
impl<'a> CreateGhostCommitOptions<'a> {
/// Creates options scoped to the provided repository path.
pub fn new(repo_path: &'a Path) -> Self {
@@ -64,10 +80,94 @@ impl<'a> CreateGhostCommitOptions<'a> {
}
}
/// Groups untracked files by directory and returns the groups large enough to warn about.
///
/// Each path in `files` is attributed to the deepest entry of `dirs` that is a component-wise
/// prefix of it. A file that is not under any entry of `dirs` is attributed to its own parent
/// directory instead.
///
/// `Path::parent` yields an empty path (not `None`) for a bare relative file name such as
/// `notes.txt`, so top-level files would otherwise be grouped under an empty path that renders
/// as a blank directory name. Any empty group path is therefore reported as `.`.
///
/// Returns one [`LargeUntrackedDir`] per group whose file count is at least
/// `LARGE_UNTRACKED_WARNING_THRESHOLD`, ordered by descending file count and then by path.
fn detect_large_untracked_dirs(files: &[PathBuf], dirs: &[PathBuf]) -> Vec<LargeUntrackedDir> {
    // Deepest directories first (ties broken by path) so that the first prefix match found
    // for a file is also the most specific one. The depth is computed once per directory.
    let mut dirs_by_depth: Vec<&PathBuf> = dirs.iter().collect();
    dirs_by_depth.sort_by_cached_key(|dir| (std::cmp::Reverse(dir.components().count()), *dir));

    let mut counts: BTreeMap<PathBuf, usize> = BTreeMap::new();
    for file in files {
        let group = dirs_by_depth
            .iter()
            .find(|dir| file.starts_with(dir.as_path()))
            .map(|dir| dir.to_path_buf())
            // Not inside any listed directory: fall back to the file's own parent.
            .or_else(|| file.parent().map(PathBuf::from))
            // An empty path means "the top level"; spell it out so it is never blank.
            .filter(|dir| !dir.as_os_str().is_empty())
            .unwrap_or_else(|| PathBuf::from("."));
        *counts.entry(group).or_insert(0) += 1;
    }

    let mut large: Vec<LargeUntrackedDir> = counts
        .into_iter()
        .filter(|(_, count)| *count >= LARGE_UNTRACKED_WARNING_THRESHOLD)
        .map(|(path, file_count)| LargeUntrackedDir { path, file_count })
        .collect();
    large.sort_by(|a, b| {
        b.file_count
            .cmp(&a.file_count)
            .then_with(|| a.path.cmp(&b.path))
    });
    large
}
/// Strips `repo_prefix` from the front of `path` when possible.
///
/// Returns an owned copy of `path` unchanged when no prefix is supplied or when `path` does
/// not start with the prefix.
fn to_session_relative_path(path: &Path, repo_prefix: Option<&Path>) -> PathBuf {
    repo_prefix
        .and_then(|prefix| path.strip_prefix(prefix).ok())
        .unwrap_or(path)
        .to_path_buf()
}
/// Create a ghost commit capturing the current state of the repository's working tree.
///
/// Delegates to [`create_ghost_commit_with_report`] and discards the accompanying report.
///
/// # Errors
///
/// Returns whatever error [`create_ghost_commit_with_report`] reports.
pub fn create_ghost_commit(
    options: &CreateGhostCommitOptions<'_>,
) -> Result<GhostCommit, GitToolingError> {
    let (commit, _report) = create_ghost_commit_with_report(options)?;
    Ok(commit)
}
/// Compute a report describing the working tree for a ghost snapshot without creating a commit.
///
/// Collects the existing untracked files and directories, strips the repository sub-directory
/// prefix from each path where possible, and records which directories hold a large number of
/// untracked files.
///
/// # Errors
///
/// Propagates failures from checking that `options.repo_path` is a git repository, from
/// resolving the repository root, and from listing the untracked entries.
pub fn capture_ghost_snapshot_report(
    options: &CreateGhostCommitOptions<'_>,
) -> Result<GhostSnapshotReport, GitToolingError> {
    ensure_git_repository(options.repo_path)?;
    let repo_root = resolve_repository_root(options.repo_path)?;
    let repo_prefix = repo_subdir(repo_root.as_path(), options.repo_path);
    let prefix = repo_prefix.as_deref();

    let untracked = capture_existing_untracked(repo_root.as_path(), prefix)?;

    let relative_files: Vec<_> = untracked
        .files
        .iter()
        .map(|file| to_session_relative_path(file, prefix))
        .collect();
    let relative_dirs: Vec<_> = untracked
        .dirs
        .iter()
        .map(|dir| to_session_relative_path(dir, prefix))
        .collect();

    let large_untracked_dirs = detect_large_untracked_dirs(&relative_files, &relative_dirs);
    Ok(GhostSnapshotReport {
        large_untracked_dirs,
    })
}
/// Create a ghost commit capturing the current state of the repository's working tree along with a report.
pub fn create_ghost_commit_with_report(
options: &CreateGhostCommitOptions<'_>,
) -> Result<(GhostCommit, GhostSnapshotReport), GitToolingError> {
ensure_git_repository(options.repo_path)?;
let repo_root = resolve_repository_root(options.repo_path)?;
@@ -76,6 +176,18 @@ pub fn create_ghost_commit(
let existing_untracked =
capture_existing_untracked(repo_root.as_path(), repo_prefix.as_deref())?;
let warning_files = existing_untracked
.files
.iter()
.map(|path| to_session_relative_path(path, repo_prefix.as_deref()))
.collect::<Vec<_>>();
let warning_dirs = existing_untracked
.dirs
.iter()
.map(|path| to_session_relative_path(path, repo_prefix.as_deref()))
.collect::<Vec<_>>();
let large_untracked_dirs = detect_large_untracked_dirs(&warning_files, &warning_dirs);
let normalized_force = options
.force_include
.iter()
@@ -143,11 +255,18 @@ pub fn create_ghost_commit(
Some(commit_env.as_slice()),
)?;
Ok(GhostCommit::new(
let ghost_commit = GhostCommit::new(
commit_id,
parent,
existing_untracked.files,
existing_untracked.dirs,
);
Ok((
ghost_commit,
GhostSnapshotReport {
large_untracked_dirs,
},
))
}
@@ -460,6 +579,95 @@ mod tests {
Ok(())
}
#[test]
/// Ensures a top-level untracked directory holding more files than the warning threshold is
/// reported together with its exact file count.
fn create_snapshot_reports_large_untracked_dirs() -> Result<(), GitToolingError> {
    let temp = tempfile::tempdir()?;
    let repo = temp.path();
    init_test_repo(repo);

    // Commit one tracked file so the ghost commit is created on top of an existing HEAD.
    std::fs::write(repo.join("tracked.txt"), "contents\n")?;
    run_git_in(repo, &["add", "tracked.txt"]);
    let commit_args = [
        "-c",
        "user.name=Tester",
        "-c",
        "user.email=test@example.com",
        "commit",
        "-m",
        "initial",
    ];
    run_git_in(repo, &commit_args);

    // Populate an untracked directory with one file more than the threshold.
    let expected_count = LARGE_UNTRACKED_WARNING_THRESHOLD + 1;
    let models = repo.join("models");
    std::fs::create_dir(&models)?;
    for idx in 0..expected_count {
        std::fs::write(models.join(format!("weights-{idx}.bin")), "data\n")?;
    }

    let options = CreateGhostCommitOptions::new(repo);
    let (ghost, report) = create_ghost_commit_with_report(&options)?;

    assert!(ghost.parent().is_some());
    let expected = vec![LargeUntrackedDir {
        path: PathBuf::from("models"),
        file_count: expected_count,
    }];
    assert_eq!(report.large_untracked_dirs, expected);
    Ok(())
}
#[test]
/// Ensures a large untracked tree nested under a tracked directory is reported at the nested
/// location rather than being attributed to the tracked parent.
fn create_snapshot_reports_nested_large_untracked_dirs_under_tracked_parent()
-> Result<(), GitToolingError> {
    let temp = tempfile::tempdir()?;
    let repo = temp.path();
    init_test_repo(repo);

    // Track `src/main.rs` so that `src` itself is a tracked directory.
    let src = repo.join("src");
    std::fs::create_dir(&src)?;
    std::fs::write(src.join("main.rs"), "fn main() {}\n")?;
    run_git_in(repo, &["add", "src/main.rs"]);
    let commit_args = [
        "-c",
        "user.name=Tester",
        "-c",
        "user.email=test@example.com",
        "commit",
        "-m",
        "initial",
    ];
    run_git_in(repo, &commit_args);

    // Fill an untracked tree below the tracked directory with threshold + 1 files.
    let expected_count = LARGE_UNTRACKED_WARNING_THRESHOLD + 1;
    let generated = src.join("generated").join("cache");
    std::fs::create_dir_all(&generated)?;
    for idx in 0..expected_count {
        std::fs::write(generated.join(format!("file-{idx}.bin")), "data\n")?;
    }

    let options = CreateGhostCommitOptions::new(repo);
    let (_, report) = create_ghost_commit_with_report(&options)?;

    assert_eq!(report.large_untracked_dirs.len(), 1);
    let entry = &report.large_untracked_dirs[0];
    assert_ne!(entry.path, PathBuf::from("src"));
    assert!(
        entry.path.starts_with(Path::new("src/generated")),
        "unexpected path for large untracked directory: {}",
        entry.path.display()
    );
    assert_eq!(entry.file_count, expected_count);
    Ok(())
}
#[test]
/// Ensures ghost commits succeed in repositories without an existing HEAD.
fn create_snapshot_without_existing_head() -> Result<(), GitToolingError> {

View File

@@ -17,7 +17,11 @@ pub use apply::stage_paths;
pub use branch::merge_base_with_head;
pub use errors::GitToolingError;
pub use ghost_commits::CreateGhostCommitOptions;
pub use ghost_commits::GhostSnapshotReport;
pub use ghost_commits::LargeUntrackedDir;
pub use ghost_commits::capture_ghost_snapshot_report;
pub use ghost_commits::create_ghost_commit;
pub use ghost_commits::create_ghost_commit_with_report;
pub use ghost_commits::restore_ghost_commit;
pub use ghost_commits::restore_to_commit;
pub use platform::create_symlink;