Files
codex/codex-rs/core/src/project_doc.rs

366 lines
13 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Project-level documentation discovery.
//!
//! Project-level documentation can be stored in a file named `AGENTS.md`.
//! Currently, we include only the contents of the first file found as follows:
//!
//! 1. Look for the doc file in the current working directory (as determined
//! by the `Config`).
//! 2. If not found, walk *upwards* until the Git repository root is reached
//! (detected by the presence of a `.git` directory/file), or failing that,
//! the filesystem root.
//! 3. If the Git root is encountered, look for the doc file there. If it
//! exists, the search stops we do **not** walk past the Git root.
use crate::config::Config;
use crate::config::find_user_instructions_path;
use std::path::Path;
use std::path::PathBuf;
use tokio::io::AsyncReadExt;
use tracing::error;
/// Currently, we only match the filename `AGENTS.md` exactly.
const CANDIDATE_FILENAMES: &[&str] = &["AGENTS.md"];
/// When both `Config::instructions` and the project doc are present, they will
/// be concatenated with the following separator.
const PROJECT_DOC_SEPARATOR: &str = "\n\n--- project-doc ---\n\n";
/// Combines `Config::instructions` and `AGENTS.md` (if present) into a single
/// string of instructions.
pub(crate) async fn get_user_instructions(config: &Config) -> Option<String> {
match find_project_doc(config).await {
Ok(Some(project_doc)) => match &config.user_instructions {
Some(original_instructions) => Some(format!(
"{original_instructions}{PROJECT_DOC_SEPARATOR}{project_doc}"
)),
None => Some(project_doc),
},
Ok(None) => config.user_instructions.clone(),
Err(e) => {
error!("error trying to find project doc: {e:#}");
config.user_instructions.clone()
}
}
}
/// Attempt to locate and load the project documentation. Currently, the search
/// starts from `Config::cwd`, but if we may want to consider other directories
/// in the future, e.g., additional writable directories in the `SandboxPolicy`.
///
/// On success returns `Ok(Some(contents))`. If no documentation file is found
/// the function returns `Ok(None)`. Unexpected I/O failures bubble up as
/// `Err` so callers can decide how to handle them.
async fn find_project_doc(config: &Config) -> std::io::Result<Option<String>> {
let max_bytes = config.project_doc_max_bytes;
if max_bytes == 0 {
return Ok(None);
}
if let Some(path) = find_project_doc_path_async(config).await {
return read_file_with_limit(&path, max_bytes).await;
}
Ok(None)
}
/// Lightweight description of where user and project instruction files were
/// sourced from. Paths are absolute and only present when a corresponding file
/// exists and is non-empty.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InstructionsInfo {
pub user_instructions_path: Option<PathBuf>,
pub project_instructions_path: Option<PathBuf>,
}
/// Asynchronously collect the paths of the user instructions (from
/// `CODEX_HOME/AGENTS.md`) and the project instructions (nearest `AGENTS.md`
/// discovered by [`find_project_doc`]'s search algorithm).
///
/// - If `project_doc_max_bytes == 0` we consider project docs disabled and the
/// path will be `None` even if a file exists.
/// - Empty files are treated as "not set".
pub async fn collect_instructions_info(config: &Config) -> InstructionsInfo {
let user_instructions_path = find_user_instructions_path(&config.codex_home);
let project_instructions_path = if config.project_doc_max_bytes == 0 {
None
} else {
find_project_doc_path_async(config).await
};
InstructionsInfo {
user_instructions_path,
project_instructions_path,
}
}
/// Internal helper that mirrors the search performed by [`find_project_doc`] but
/// returns the discovered file path instead of the contents.
async fn find_project_doc_path_async(config: &Config) -> Option<PathBuf> {
// Attempt in cwd first.
if let Some(p) = find_first_candidate_path_async(&config.cwd, CANDIDATE_FILENAMES).await {
return Some(p);
}
// Walk up towards git root, then stop.
let mut dir = match config.cwd.canonicalize() {
Ok(c) => c,
Err(_) => config.cwd.clone(),
};
while let Some(parent) = dir.parent() {
let git_marker = dir.join(".git");
let git_exists = match tokio::fs::metadata(&git_marker).await {
Ok(_) => true,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => false,
Err(_) => false,
};
if git_exists {
return find_first_candidate_path_async(&dir, CANDIDATE_FILENAMES).await;
}
dir = parent.to_path_buf();
}
None
}
async fn find_first_candidate_path_async(dir: &Path, names: &[&str]) -> Option<PathBuf> {
for name in names {
let candidate = dir.join(name);
match tokio::fs::read_to_string(&candidate).await {
Ok(s) if !s.trim().is_empty() => return Some(candidate),
_ => continue,
}
}
None
}
/// Synchronous variant for UI code that isn't async-aware. Mirrors
/// [`collect_instructions_info`] using blocking filesystem APIs.
pub fn collect_instructions_info_sync(config: &Config) -> InstructionsInfo {
let user_instructions_path = find_user_instructions_path(&config.codex_home);
let project_instructions_path = if config.project_doc_max_bytes == 0 {
None
} else {
find_project_doc_path_sync(config)
};
InstructionsInfo {
user_instructions_path,
project_instructions_path,
}
}
fn find_project_doc_path_sync(config: &Config) -> Option<PathBuf> {
// Try cwd first
if let Some(p) = find_first_candidate_path_sync(&config.cwd, CANDIDATE_FILENAMES) {
return Some(p);
}
let mut dir = config
.cwd
.canonicalize()
.unwrap_or_else(|_| config.cwd.clone());
while let Some(parent) = dir.parent() {
let git_marker = dir.join(".git");
let git_exists = match std::fs::metadata(&git_marker) {
Ok(_) => true,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => false,
Err(_) => false,
};
if git_exists {
return find_first_candidate_path_sync(&dir, CANDIDATE_FILENAMES);
}
dir = parent.to_path_buf();
}
None
}
fn find_first_candidate_path_sync(dir: &Path, names: &[&str]) -> Option<PathBuf> {
for name in names {
let candidate = dir.join(name);
match std::fs::read_to_string(&candidate) {
Ok(s) if !s.trim().is_empty() => return Some(candidate),
_ => continue,
}
}
None
}
/// Read a file with a maximum byte limit; empty content returns None.
async fn read_file_with_limit(path: &Path, max_bytes: usize) -> std::io::Result<Option<String>> {
let file = match tokio::fs::File::open(path).await {
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
Err(e) => return Err(e),
Ok(f) => f,
};
let size = file.metadata().await?.len();
let reader = tokio::io::BufReader::new(file);
let mut data = Vec::with_capacity(std::cmp::min(size as usize, max_bytes));
let mut limited = reader.take(max_bytes as u64);
limited.read_to_end(&mut data).await?;
if size as usize > max_bytes {
tracing::warn!(
"Project doc `{}` exceeds {max_bytes} bytes - truncating.",
path.display(),
);
}
let contents = String::from_utf8_lossy(&data).to_string();
if contents.trim().is_empty() {
return Ok(None);
}
Ok(Some(contents))
}
#[cfg(test)]
mod tests {
#![allow(clippy::expect_used, clippy::unwrap_used)]
use super::*;
use crate::config::ConfigOverrides;
use crate::config::ConfigToml;
use std::fs;
use tempfile::TempDir;
/// Helper that returns a `Config` pointing at `root` and using `limit` as
/// the maximum number of bytes to embed from AGENTS.md. The caller can
/// optionally specify a custom `instructions` string when `None` the
/// value is cleared to mimic a scenario where no system instructions have
/// been configured.
fn make_config(root: &TempDir, limit: usize, instructions: Option<&str>) -> Config {
let codex_home = TempDir::new().unwrap();
let mut config = Config::load_from_base_config_with_overrides(
ConfigToml::default(),
ConfigOverrides::default(),
codex_home.path().to_path_buf(),
)
.expect("defaults for test should always succeed");
config.cwd = root.path().to_path_buf();
config.project_doc_max_bytes = limit;
config.user_instructions = instructions.map(ToOwned::to_owned);
config
}
/// AGENTS.md missing should yield `None`.
#[tokio::test]
async fn no_doc_file_returns_none() {
let tmp = tempfile::tempdir().expect("tempdir");
let res = get_user_instructions(&make_config(&tmp, 4096, None)).await;
assert!(
res.is_none(),
"Expected None when AGENTS.md is absent and no system instructions provided"
);
assert!(res.is_none(), "Expected None when AGENTS.md is absent");
}
/// Small file within the byte-limit is returned unmodified.
#[tokio::test]
async fn doc_smaller_than_limit_is_returned() {
let tmp = tempfile::tempdir().expect("tempdir");
fs::write(tmp.path().join("AGENTS.md"), "hello world").unwrap();
let res = get_user_instructions(&make_config(&tmp, 4096, None))
.await
.expect("doc expected");
assert_eq!(
res, "hello world",
"The document should be returned verbatim when it is smaller than the limit and there are no existing instructions"
);
}
/// Oversize file is truncated to `project_doc_max_bytes`.
#[tokio::test]
async fn doc_larger_than_limit_is_truncated() {
const LIMIT: usize = 1024;
let tmp = tempfile::tempdir().expect("tempdir");
let huge = "A".repeat(LIMIT * 2); // 2 KiB
fs::write(tmp.path().join("AGENTS.md"), &huge).unwrap();
let res = get_user_instructions(&make_config(&tmp, LIMIT, None))
.await
.expect("doc expected");
assert_eq!(res.len(), LIMIT, "doc should be truncated to LIMIT bytes");
assert_eq!(res, huge[..LIMIT]);
}
/// When `cwd` is nested inside a repo, the search should locate AGENTS.md
/// placed at the repository root (identified by `.git`).
#[tokio::test]
async fn finds_doc_in_repo_root() {
let repo = tempfile::tempdir().expect("tempdir");
// Simulate a git repository. Note .git can be a file or a directory.
std::fs::write(
repo.path().join(".git"),
"gitdir: /path/to/actual/git/dir\n",
)
.unwrap();
// Put the doc at the repo root.
fs::write(repo.path().join("AGENTS.md"), "root level doc").unwrap();
// Now create a nested working directory: repo/workspace/crate_a
let nested = repo.path().join("workspace/crate_a");
std::fs::create_dir_all(&nested).unwrap();
// Build config pointing at the nested dir.
let mut cfg = make_config(&repo, 4096, None);
cfg.cwd = nested;
let res = get_user_instructions(&cfg).await.expect("doc expected");
assert_eq!(res, "root level doc");
}
/// Explicitly setting the byte-limit to zero disables project docs.
#[tokio::test]
async fn zero_byte_limit_disables_docs() {
let tmp = tempfile::tempdir().expect("tempdir");
fs::write(tmp.path().join("AGENTS.md"), "something").unwrap();
let res = get_user_instructions(&make_config(&tmp, 0, None)).await;
assert!(
res.is_none(),
"With limit 0 the function should return None"
);
}
/// When both system instructions *and* a project doc are present the two
/// should be concatenated with the separator.
#[tokio::test]
async fn merges_existing_instructions_with_project_doc() {
let tmp = tempfile::tempdir().expect("tempdir");
fs::write(tmp.path().join("AGENTS.md"), "proj doc").unwrap();
const INSTRUCTIONS: &str = "base instructions";
let res = get_user_instructions(&make_config(&tmp, 4096, Some(INSTRUCTIONS)))
.await
.expect("should produce a combined instruction string");
let expected = format!("{INSTRUCTIONS}{PROJECT_DOC_SEPARATOR}{}", "proj doc");
assert_eq!(res, expected);
}
/// If there are existing system instructions but the project doc is
/// missing we expect the original instructions to be returned unchanged.
#[tokio::test]
async fn keeps_existing_instructions_when_doc_missing() {
let tmp = tempfile::tempdir().expect("tempdir");
const INSTRUCTIONS: &str = "some instructions";
let res = get_user_instructions(&make_config(&tmp, 4096, Some(INSTRUCTIONS))).await;
assert_eq!(res, Some(INSTRUCTIONS.to_string()));
}
}