Warn on invalid UTF-8 in AGENTS.md files (#23232)

Fixes #23223.

## Why

Malformed AGENTS instructions should not fail silently. The reported
issue had invalid UTF-8 in a global `AGENTS.md`; before this change,
Codex treated that decode failure like a missing file, so the personal
instructions disappeared without a user-visible explanation and the
rollout had no `# AGENTS.md instructions` block.

Project-level AGENTS files already used lossy decoding, so their
instructions still appeared, but invalid bytes were replaced without
telling the user. Global and project AGENTS files should behave
consistently: keep usable instruction text when possible, and surface a
diagnostic when bytes had to be replaced.

## What changed

Global `AGENTS.override.md` and `AGENTS.md` loading now reads bytes and
decodes with replacement characters on invalid UTF-8, matching
project-level AGENTS behavior. Both global and project AGENTS loading
now emit a startup warning when invalid UTF-8 is found, and both keep
the instruction text with invalid byte sequences replaced.

Missing files, non-file candidates, empty files, and the existing
`AGENTS.override.md` before `AGENTS.md` precedence keep their current
behavior.

## How users see it

The warnings flow through the existing startup warning surface.
App-server clients receive config-time startup warnings as
`configWarning` notifications during initialization, and thread startup
emits startup warnings as thread-scoped `warning` notifications.

Global AGENTS invalid UTF-8 warnings can appear on both surfaces.
Project-level AGENTS invalid UTF-8 warnings are discovered while
building thread instructions, so they appear as thread-scoped `warning`
notifications. Clients that render warning notifications in the
conversation surface show the message as a visible diagnostic instead of
silently hiding or altering instructions.
This commit is contained in:
Eric Traut
2026-05-19 21:56:46 -07:00
committed by GitHub
parent 5a4202ad90
commit 9dda71dbae
4 changed files with 126 additions and 23 deletions

View File

@@ -62,18 +62,31 @@ impl<'a> AgentsMdManager<'a> {
pub(crate) async fn load_global_instructions(
fs: &dyn ExecutorFileSystem,
codex_dir: Option<&AbsolutePathBuf>,
startup_warnings: &mut Vec<String>,
) -> Option<LoadedAgentsMd> {
let base = codex_dir?;
for candidate in [LOCAL_AGENTS_MD_FILENAME, DEFAULT_AGENTS_MD_FILENAME] {
let path = base.join(candidate);
if let Ok(contents) = fs.read_file_text(&path, /*sandbox*/ None).await {
let trimmed = contents.trim();
if !trimmed.is_empty() {
return Some(LoadedAgentsMd {
contents: trimmed.to_string(),
path,
});
let data = match fs.read_file(&path, /*sandbox*/ None).await {
Ok(data) => data,
Err(err) if err.kind() == io::ErrorKind::NotFound => continue,
Err(err) if err.kind() == io::ErrorKind::IsADirectory => continue,
Err(err) => {
startup_warnings.push(format!(
"Failed to read global AGENTS.md instructions from `{}`: {err}",
path.display()
));
continue;
}
};
warn_invalid_utf8(&path, &data, "Global", startup_warnings);
let contents = String::from_utf8_lossy(&data);
let trimmed = contents.trim();
if !trimmed.is_empty() {
return Some(LoadedAgentsMd {
contents: trimmed.to_string(),
path,
});
}
}
None
@@ -84,16 +97,19 @@ impl<'a> AgentsMdManager<'a> {
pub(crate) async fn user_instructions(
&self,
environment: Option<&Environment>,
startup_warnings: &mut Vec<String>,
) -> Option<String> {
let fs = environment?.get_filesystem();
self.user_instructions_with_fs(fs.as_ref()).await
self.user_instructions_with_fs(fs.as_ref(), startup_warnings)
.await
}
pub(crate) async fn user_instructions_with_fs(
async fn user_instructions_with_fs(
&self,
fs: &dyn ExecutorFileSystem,
startup_warnings: &mut Vec<String>,
) -> Option<String> {
let agents_md_docs = self.read_agents_md(fs).await;
let agents_md_docs = self.read_agents_md(fs, startup_warnings).await;
let mut output = String::new();
@@ -130,11 +146,15 @@ impl<'a> AgentsMdManager<'a> {
/// Returns all instruction source files included in the current config.
pub async fn instruction_sources(&self, fs: &dyn ExecutorFileSystem) -> Vec<AbsolutePathBuf> {
let mut paths =
Self::load_global_instructions(LOCAL_FS.as_ref(), Some(&self.config.codex_home))
.await
.map(|loaded| vec![loaded.path])
.unwrap_or_default();
let mut global_instruction_warnings = Vec::new();
let mut paths = Self::load_global_instructions(
LOCAL_FS.as_ref(),
Some(&self.config.codex_home),
&mut global_instruction_warnings,
)
.await
.map(|loaded| vec![loaded.path])
.unwrap_or_default();
match self.agents_md_paths(fs).await {
Ok(agents_md_paths) => paths.extend(agents_md_paths),
Err(err) => {
@@ -150,7 +170,11 @@ impl<'a> AgentsMdManager<'a> {
/// concatenation of all discovered docs. If no documentation file is found
/// the function returns `Ok(None)`. Unexpected I/O failures bubble up as
/// `Err` so callers can decide how to handle them.
async fn read_agents_md(&self, fs: &dyn ExecutorFileSystem) -> io::Result<Option<String>> {
async fn read_agents_md(
&self,
fs: &dyn ExecutorFileSystem,
startup_warnings: &mut Vec<String>,
) -> io::Result<Option<String>> {
let max_total = self.config.project_doc_max_bytes;
if max_total == 0 {
@@ -182,6 +206,8 @@ impl<'a> AgentsMdManager<'a> {
Err(err) if err.kind() == io::ErrorKind::NotFound => continue,
Err(err) => return Err(err),
};
warn_invalid_utf8(&p, &data, "Project", startup_warnings);
let size = data.len() as u64;
if size > remaining {
data.truncate(remaining as usize);
@@ -324,6 +350,20 @@ impl<'a> AgentsMdManager<'a> {
}
}
fn warn_invalid_utf8(
path: &AbsolutePathBuf,
data: &[u8],
source: &str,
startup_warnings: &mut Vec<String>,
) {
if let Err(err) = std::str::from_utf8(data) {
startup_warnings.push(format!(
"{source} AGENTS.md instructions from `{}` contain invalid UTF-8: {err}. Invalid byte sequences were replaced.",
path.display()
));
}
}
#[cfg(test)]
#[path = "agents_md_tests.rs"]
mod tests;

View File

@@ -7,12 +7,14 @@ use core_test_support::PathBufExt;
use core_test_support::TempDirExt;
use pretty_assertions::assert_eq;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use tempfile::TempDir;
async fn get_user_instructions(config: &Config) -> Option<String> {
let mut warnings = Vec::new();
AgentsMdManager::new(config)
.user_instructions_with_fs(LOCAL_FS.as_ref())
.user_instructions_with_fs(LOCAL_FS.as_ref(), &mut warnings)
.await
}
@@ -22,6 +24,19 @@ async fn agents_md_paths(config: &Config) -> std::io::Result<Vec<AbsolutePathBuf
.await
}
fn assert_invalid_utf8_warning(warnings: &[String], source: &str, path: &Path) {
let path_display = path.display().to_string();
assert_eq!(warnings.len(), 1, "expected one warning, got {warnings:?}");
let warning = &warnings[0];
assert!(
warning.contains(&format!("{source} AGENTS.md instructions"))
&& warning.contains(&path_display)
&& warning.contains("invalid UTF-8")
&& warning.contains("Invalid byte sequences were replaced."),
"unexpected invalid UTF-8 warning: {warning:?}"
);
}
/// Helper that returns a `Config` pointing at `root` and using `limit` as
/// the maximum number of bytes to embed from AGENTS.md. The caller can
/// optionally specify a custom `instructions` string when `None` the
@@ -105,8 +120,9 @@ async fn no_environment_returns_none() {
let tmp = tempfile::tempdir().expect("tempdir");
let config = make_config(&tmp, /*limit*/ 4096, Some("user instructions")).await;
let mut warnings = Vec::new();
let res = AgentsMdManager::new(&config)
.user_instructions(/*environment*/ None)
.user_instructions(/*environment*/ None, &mut warnings)
.await;
assert_eq!(res, None);
@@ -129,6 +145,45 @@ async fn doc_smaller_than_limit_is_returned() {
);
}
#[tokio::test]
async fn global_doc_invalid_utf8_warns_and_uses_lossy_text() {
let codex_home = tempfile::tempdir().expect("tempdir");
let codex_home_abs = codex_home.abs();
let path = codex_home_abs.join(DEFAULT_AGENTS_MD_FILENAME);
fs::write(&path, b"global\xFF doc").unwrap();
let mut warnings = Vec::new();
let loaded = AgentsMdManager::load_global_instructions(
LOCAL_FS.as_ref(),
Some(&codex_home_abs),
&mut warnings,
)
.await
.expect("global doc expected");
assert_eq!(loaded.contents, "global\u{FFFD} doc");
assert_eq!(loaded.path, path);
assert_invalid_utf8_warning(&warnings, "Global", path.as_path());
}
#[tokio::test]
async fn project_doc_invalid_utf8_warns_and_uses_lossy_text() {
let tmp = tempfile::tempdir().expect("tempdir");
let path = tmp.path().join("AGENTS.md");
fs::write(&path, b"project\xFF doc").unwrap();
let config = make_config(&tmp, /*limit*/ 4096, /*instructions*/ None).await;
let mut warnings = Vec::new();
let res = AgentsMdManager::new(&config)
.user_instructions_with_fs(LOCAL_FS.as_ref(), &mut warnings)
.await
.expect("doc expected");
assert_eq!(res, "project\u{FFFD} doc");
let canonical_path = dunce::canonicalize(&path).expect("canonical doc path");
assert_invalid_utf8_warning(&warnings, "Project", &canonical_path);
}
/// Oversize file is truncated to `project_doc_max_bytes`.
#[tokio::test]
async fn doc_larger_than_limit_is_truncated() {

View File

@@ -2438,14 +2438,17 @@ impl Config {
guardian_policy_config_source: _,
} = config_layer_stack.requirements().clone();
let user_instructions =
AgentsMdManager::load_global_instructions(LOCAL_FS.as_ref(), Some(&codex_home))
.await
.map(|loaded| loaded.contents);
let mut startup_warnings = config_layer_stack
.startup_warnings()
.unwrap_or_default()
.to_vec();
let user_instructions = AgentsMdManager::load_global_instructions(
LOCAL_FS.as_ref(),
Some(&codex_home),
&mut startup_warnings,
)
.await
.map(|loaded| loaded.contents);
// Destructure ConfigOverrides fully to ensure all overrides are applied.
let ConfigOverrides {

View File

@@ -490,9 +490,14 @@ impl Codex {
}
let primary_environment = environment_selections.primary_environment();
let mut user_instruction_warnings = Vec::new();
let user_instructions = AgentsMdManager::new(&config)
.user_instructions(primary_environment.as_deref())
.user_instructions(
primary_environment.as_deref(),
&mut user_instruction_warnings,
)
.await;
config.startup_warnings.extend(user_instruction_warnings);
let exec_policy = if crate::guardian::is_guardian_reviewer_source(&session_source) {
// Guardian review should rely on the built-in shell safety checks,