mirror of
https://github.com/openai/codex.git
synced 2026-05-23 12:34:25 +00:00
Warn on invalid UTF-8 in AGENTS.md files (#23232)
Fixes #23223. ## Why Malformed AGENTS instructions should not fail silently. The reported issue had invalid UTF-8 in a global `AGENTS.md`; before this change, Codex treated that decode failure like a missing file, so the personal instructions disappeared without a user-visible explanation and the rollout had no `# AGENTS.md instructions` block. Project-level AGENTS files already used lossy decoding, so their instructions still appeared, but invalid bytes were replaced without telling the user. Global and project AGENTS files should behave consistently: keep usable instruction text when possible, and surface a diagnostic when bytes had to be replaced. ## What changed Global `AGENTS.override.md` and `AGENTS.md` loading now reads bytes and decodes with replacement characters on invalid UTF-8, matching project-level AGENTS behavior. Both global and project AGENTS loading now emit a startup warning when invalid UTF-8 is found, and both keep the instruction text with invalid byte sequences replaced. Missing files, non-file candidates, empty files, and the existing `AGENTS.override.md` before `AGENTS.md` precedence keep their current behavior. ## How users see it The warnings flow through the existing startup warning surface. App-server clients receive config-time startup warnings as `configWarning` notifications during initialization, and thread startup emits startup warnings as thread-scoped `warning` notifications. Global AGENTS invalid UTF-8 warnings can appear on both surfaces. Project-level AGENTS invalid UTF-8 warnings are discovered while building thread instructions, so they appear as thread-scoped `warning` notifications. Clients that render warning notifications in the conversation surface show the message as a visible diagnostic instead of silently hiding or altering instructions.
This commit is contained in:
@@ -62,18 +62,31 @@ impl<'a> AgentsMdManager<'a> {
|
||||
pub(crate) async fn load_global_instructions(
|
||||
fs: &dyn ExecutorFileSystem,
|
||||
codex_dir: Option<&AbsolutePathBuf>,
|
||||
startup_warnings: &mut Vec<String>,
|
||||
) -> Option<LoadedAgentsMd> {
|
||||
let base = codex_dir?;
|
||||
for candidate in [LOCAL_AGENTS_MD_FILENAME, DEFAULT_AGENTS_MD_FILENAME] {
|
||||
let path = base.join(candidate);
|
||||
if let Ok(contents) = fs.read_file_text(&path, /*sandbox*/ None).await {
|
||||
let trimmed = contents.trim();
|
||||
if !trimmed.is_empty() {
|
||||
return Some(LoadedAgentsMd {
|
||||
contents: trimmed.to_string(),
|
||||
path,
|
||||
});
|
||||
let data = match fs.read_file(&path, /*sandbox*/ None).await {
|
||||
Ok(data) => data,
|
||||
Err(err) if err.kind() == io::ErrorKind::NotFound => continue,
|
||||
Err(err) if err.kind() == io::ErrorKind::IsADirectory => continue,
|
||||
Err(err) => {
|
||||
startup_warnings.push(format!(
|
||||
"Failed to read global AGENTS.md instructions from `{}`: {err}",
|
||||
path.display()
|
||||
));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
warn_invalid_utf8(&path, &data, "Global", startup_warnings);
|
||||
let contents = String::from_utf8_lossy(&data);
|
||||
let trimmed = contents.trim();
|
||||
if !trimmed.is_empty() {
|
||||
return Some(LoadedAgentsMd {
|
||||
contents: trimmed.to_string(),
|
||||
path,
|
||||
});
|
||||
}
|
||||
}
|
||||
None
|
||||
@@ -84,16 +97,19 @@ impl<'a> AgentsMdManager<'a> {
|
||||
pub(crate) async fn user_instructions(
|
||||
&self,
|
||||
environment: Option<&Environment>,
|
||||
startup_warnings: &mut Vec<String>,
|
||||
) -> Option<String> {
|
||||
let fs = environment?.get_filesystem();
|
||||
self.user_instructions_with_fs(fs.as_ref()).await
|
||||
self.user_instructions_with_fs(fs.as_ref(), startup_warnings)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn user_instructions_with_fs(
|
||||
async fn user_instructions_with_fs(
|
||||
&self,
|
||||
fs: &dyn ExecutorFileSystem,
|
||||
startup_warnings: &mut Vec<String>,
|
||||
) -> Option<String> {
|
||||
let agents_md_docs = self.read_agents_md(fs).await;
|
||||
let agents_md_docs = self.read_agents_md(fs, startup_warnings).await;
|
||||
|
||||
let mut output = String::new();
|
||||
|
||||
@@ -130,11 +146,15 @@ impl<'a> AgentsMdManager<'a> {
|
||||
|
||||
/// Returns all instruction source files included in the current config.
|
||||
pub async fn instruction_sources(&self, fs: &dyn ExecutorFileSystem) -> Vec<AbsolutePathBuf> {
|
||||
let mut paths =
|
||||
Self::load_global_instructions(LOCAL_FS.as_ref(), Some(&self.config.codex_home))
|
||||
.await
|
||||
.map(|loaded| vec![loaded.path])
|
||||
.unwrap_or_default();
|
||||
let mut global_instruction_warnings = Vec::new();
|
||||
let mut paths = Self::load_global_instructions(
|
||||
LOCAL_FS.as_ref(),
|
||||
Some(&self.config.codex_home),
|
||||
&mut global_instruction_warnings,
|
||||
)
|
||||
.await
|
||||
.map(|loaded| vec![loaded.path])
|
||||
.unwrap_or_default();
|
||||
match self.agents_md_paths(fs).await {
|
||||
Ok(agents_md_paths) => paths.extend(agents_md_paths),
|
||||
Err(err) => {
|
||||
@@ -150,7 +170,11 @@ impl<'a> AgentsMdManager<'a> {
|
||||
/// concatenation of all discovered docs. If no documentation file is found
|
||||
/// the function returns `Ok(None)`. Unexpected I/O failures bubble up as
|
||||
/// `Err` so callers can decide how to handle them.
|
||||
async fn read_agents_md(&self, fs: &dyn ExecutorFileSystem) -> io::Result<Option<String>> {
|
||||
async fn read_agents_md(
|
||||
&self,
|
||||
fs: &dyn ExecutorFileSystem,
|
||||
startup_warnings: &mut Vec<String>,
|
||||
) -> io::Result<Option<String>> {
|
||||
let max_total = self.config.project_doc_max_bytes;
|
||||
|
||||
if max_total == 0 {
|
||||
@@ -182,6 +206,8 @@ impl<'a> AgentsMdManager<'a> {
|
||||
Err(err) if err.kind() == io::ErrorKind::NotFound => continue,
|
||||
Err(err) => return Err(err),
|
||||
};
|
||||
warn_invalid_utf8(&p, &data, "Project", startup_warnings);
|
||||
|
||||
let size = data.len() as u64;
|
||||
if size > remaining {
|
||||
data.truncate(remaining as usize);
|
||||
@@ -324,6 +350,20 @@ impl<'a> AgentsMdManager<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn warn_invalid_utf8(
|
||||
path: &AbsolutePathBuf,
|
||||
data: &[u8],
|
||||
source: &str,
|
||||
startup_warnings: &mut Vec<String>,
|
||||
) {
|
||||
if let Err(err) = std::str::from_utf8(data) {
|
||||
startup_warnings.push(format!(
|
||||
"{source} AGENTS.md instructions from `{}` contain invalid UTF-8: {err}. Invalid byte sequences were replaced.",
|
||||
path.display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "agents_md_tests.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -7,12 +7,14 @@ use core_test_support::PathBufExt;
|
||||
use core_test_support::TempDirExt;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
async fn get_user_instructions(config: &Config) -> Option<String> {
|
||||
let mut warnings = Vec::new();
|
||||
AgentsMdManager::new(config)
|
||||
.user_instructions_with_fs(LOCAL_FS.as_ref())
|
||||
.user_instructions_with_fs(LOCAL_FS.as_ref(), &mut warnings)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -22,6 +24,19 @@ async fn agents_md_paths(config: &Config) -> std::io::Result<Vec<AbsolutePathBuf
|
||||
.await
|
||||
}
|
||||
|
||||
fn assert_invalid_utf8_warning(warnings: &[String], source: &str, path: &Path) {
|
||||
let path_display = path.display().to_string();
|
||||
assert_eq!(warnings.len(), 1, "expected one warning, got {warnings:?}");
|
||||
let warning = &warnings[0];
|
||||
assert!(
|
||||
warning.contains(&format!("{source} AGENTS.md instructions"))
|
||||
&& warning.contains(&path_display)
|
||||
&& warning.contains("invalid UTF-8")
|
||||
&& warning.contains("Invalid byte sequences were replaced."),
|
||||
"unexpected invalid UTF-8 warning: {warning:?}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Helper that returns a `Config` pointing at `root` and using `limit` as
|
||||
/// the maximum number of bytes to embed from AGENTS.md. The caller can
|
||||
/// optionally specify a custom `instructions` string – when `None` the
|
||||
@@ -105,8 +120,9 @@ async fn no_environment_returns_none() {
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let config = make_config(&tmp, /*limit*/ 4096, Some("user instructions")).await;
|
||||
|
||||
let mut warnings = Vec::new();
|
||||
let res = AgentsMdManager::new(&config)
|
||||
.user_instructions(/*environment*/ None)
|
||||
.user_instructions(/*environment*/ None, &mut warnings)
|
||||
.await;
|
||||
|
||||
assert_eq!(res, None);
|
||||
@@ -129,6 +145,45 @@ async fn doc_smaller_than_limit_is_returned() {
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn global_doc_invalid_utf8_warns_and_uses_lossy_text() {
|
||||
let codex_home = tempfile::tempdir().expect("tempdir");
|
||||
let codex_home_abs = codex_home.abs();
|
||||
let path = codex_home_abs.join(DEFAULT_AGENTS_MD_FILENAME);
|
||||
fs::write(&path, b"global\xFF doc").unwrap();
|
||||
|
||||
let mut warnings = Vec::new();
|
||||
let loaded = AgentsMdManager::load_global_instructions(
|
||||
LOCAL_FS.as_ref(),
|
||||
Some(&codex_home_abs),
|
||||
&mut warnings,
|
||||
)
|
||||
.await
|
||||
.expect("global doc expected");
|
||||
|
||||
assert_eq!(loaded.contents, "global\u{FFFD} doc");
|
||||
assert_eq!(loaded.path, path);
|
||||
assert_invalid_utf8_warning(&warnings, "Global", path.as_path());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn project_doc_invalid_utf8_warns_and_uses_lossy_text() {
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let path = tmp.path().join("AGENTS.md");
|
||||
fs::write(&path, b"project\xFF doc").unwrap();
|
||||
|
||||
let config = make_config(&tmp, /*limit*/ 4096, /*instructions*/ None).await;
|
||||
let mut warnings = Vec::new();
|
||||
let res = AgentsMdManager::new(&config)
|
||||
.user_instructions_with_fs(LOCAL_FS.as_ref(), &mut warnings)
|
||||
.await
|
||||
.expect("doc expected");
|
||||
|
||||
assert_eq!(res, "project\u{FFFD} doc");
|
||||
let canonical_path = dunce::canonicalize(&path).expect("canonical doc path");
|
||||
assert_invalid_utf8_warning(&warnings, "Project", &canonical_path);
|
||||
}
|
||||
|
||||
/// Oversize file is truncated to `project_doc_max_bytes`.
|
||||
#[tokio::test]
|
||||
async fn doc_larger_than_limit_is_truncated() {
|
||||
|
||||
@@ -2438,14 +2438,17 @@ impl Config {
|
||||
guardian_policy_config_source: _,
|
||||
} = config_layer_stack.requirements().clone();
|
||||
|
||||
let user_instructions =
|
||||
AgentsMdManager::load_global_instructions(LOCAL_FS.as_ref(), Some(&codex_home))
|
||||
.await
|
||||
.map(|loaded| loaded.contents);
|
||||
let mut startup_warnings = config_layer_stack
|
||||
.startup_warnings()
|
||||
.unwrap_or_default()
|
||||
.to_vec();
|
||||
let user_instructions = AgentsMdManager::load_global_instructions(
|
||||
LOCAL_FS.as_ref(),
|
||||
Some(&codex_home),
|
||||
&mut startup_warnings,
|
||||
)
|
||||
.await
|
||||
.map(|loaded| loaded.contents);
|
||||
|
||||
// Destructure ConfigOverrides fully to ensure all overrides are applied.
|
||||
let ConfigOverrides {
|
||||
|
||||
@@ -490,9 +490,14 @@ impl Codex {
|
||||
}
|
||||
|
||||
let primary_environment = environment_selections.primary_environment();
|
||||
let mut user_instruction_warnings = Vec::new();
|
||||
let user_instructions = AgentsMdManager::new(&config)
|
||||
.user_instructions(primary_environment.as_deref())
|
||||
.user_instructions(
|
||||
primary_environment.as_deref(),
|
||||
&mut user_instruction_warnings,
|
||||
)
|
||||
.await;
|
||||
config.startup_warnings.extend(user_instruction_warnings);
|
||||
|
||||
let exec_policy = if crate::guardian::is_guardian_reviewer_source(&session_source) {
|
||||
// Guardian review should rely on the built-in shell safety checks,
|
||||
|
||||
Reference in New Issue
Block a user