From 9dda71dbae98976bc6d0c1c769810bcf24bb6092 Mon Sep 17 00:00:00 2001 From: Eric Traut Date: Tue, 19 May 2026 21:56:46 -0700 Subject: [PATCH] Warn on invalid UTF-8 in AGENTS.md files (#23232) Fixes #23223. ## Why Malformed AGENTS instructions should not fail silently. The reported issue had invalid UTF-8 in a global `AGENTS.md`; before this change, Codex treated that decode failure like a missing file, so the personal instructions disappeared without a user-visible explanation and the rollout had no `# AGENTS.md instructions` block. Project-level AGENTS files already used lossy decoding, so their instructions still appeared, but invalid bytes were replaced without telling the user. Global and project AGENTS files should behave consistently: keep usable instruction text when possible, and surface a diagnostic when bytes had to be replaced. ## What changed Global `AGENTS.override.md` and `AGENTS.md` loading now reads bytes and decodes with replacement characters on invalid UTF-8, matching project-level AGENTS behavior. Both global and project AGENTS loading now emit a startup warning when invalid UTF-8 is found, and both keep the instruction text with invalid byte sequences replaced. Missing files, non-file candidates, empty files, and the existing `AGENTS.override.md` before `AGENTS.md` precedence keep their current behavior. ## How users see it The warnings flow through the existing startup warning surface. App-server clients receive config-time startup warnings as `configWarning` notifications during initialization, and thread startup emits startup warnings as thread-scoped `warning` notifications. Global AGENTS invalid UTF-8 warnings can appear on both surfaces. Project-level AGENTS invalid UTF-8 warnings are discovered while building thread instructions, so they appear as thread-scoped `warning` notifications. Clients that render warning notifications in the conversation surface show the message as a visible diagnostic instead of silently hiding or altering instructions. --- codex-rs/core/src/agents_md.rs | 72 +++++++++++++++++++++------- codex-rs/core/src/agents_md_tests.rs | 59 ++++++++++++++++++++++- codex-rs/core/src/config/mod.rs | 11 +++-- codex-rs/core/src/session/mod.rs | 7 ++- 4 files changed, 126 insertions(+), 23 deletions(-) diff --git a/codex-rs/core/src/agents_md.rs b/codex-rs/core/src/agents_md.rs index 2c635e7002..6daaff4a40 100644 --- a/codex-rs/core/src/agents_md.rs +++ b/codex-rs/core/src/agents_md.rs @@ -62,18 +62,31 @@ impl<'a> AgentsMdManager<'a> { pub(crate) async fn load_global_instructions( fs: &dyn ExecutorFileSystem, codex_dir: Option<&AbsolutePathBuf>, + startup_warnings: &mut Vec, ) -> Option { let base = codex_dir?; for candidate in [LOCAL_AGENTS_MD_FILENAME, DEFAULT_AGENTS_MD_FILENAME] { let path = base.join(candidate); - if let Ok(contents) = fs.read_file_text(&path, /*sandbox*/ None).await { - let trimmed = contents.trim(); - if !trimmed.is_empty() { - return Some(LoadedAgentsMd { - contents: trimmed.to_string(), - path, - }); + let data = match fs.read_file(&path, /*sandbox*/ None).await { + Ok(data) => data, + Err(err) if err.kind() == io::ErrorKind::NotFound => continue, + Err(err) if err.kind() == io::ErrorKind::IsADirectory => continue, + Err(err) => { + startup_warnings.push(format!( + "Failed to read global AGENTS.md instructions from `{}`: {err}", + path.display() + )); + continue; } + }; + warn_invalid_utf8(&path, &data, "Global", startup_warnings); + let contents = String::from_utf8_lossy(&data); + let trimmed = contents.trim(); + if !trimmed.is_empty() { + return Some(LoadedAgentsMd { + contents: trimmed.to_string(), + path, + }); } } None @@ -84,16 +97,19 @@ impl<'a> AgentsMdManager<'a> { pub(crate) async fn user_instructions( &self, environment: Option<&Environment>, + startup_warnings: &mut Vec, ) -> Option { let fs = environment?.get_filesystem(); - self.user_instructions_with_fs(fs.as_ref()).await + self.user_instructions_with_fs(fs.as_ref(), startup_warnings) + .await } - pub(crate) async fn user_instructions_with_fs( + async fn user_instructions_with_fs( &self, fs: &dyn ExecutorFileSystem, + startup_warnings: &mut Vec, ) -> Option { - let agents_md_docs = self.read_agents_md(fs).await; + let agents_md_docs = self.read_agents_md(fs, startup_warnings).await; let mut output = String::new(); @@ -130,11 +146,15 @@ impl<'a> AgentsMdManager<'a> { /// Returns all instruction source files included in the current config. pub async fn instruction_sources(&self, fs: &dyn ExecutorFileSystem) -> Vec { - let mut paths = - Self::load_global_instructions(LOCAL_FS.as_ref(), Some(&self.config.codex_home)) - .await - .map(|loaded| vec![loaded.path]) - .unwrap_or_default(); + let mut global_instruction_warnings = Vec::new(); + let mut paths = Self::load_global_instructions( + LOCAL_FS.as_ref(), + Some(&self.config.codex_home), + &mut global_instruction_warnings, + ) + .await + .map(|loaded| vec![loaded.path]) + .unwrap_or_default(); match self.agents_md_paths(fs).await { Ok(agents_md_paths) => paths.extend(agents_md_paths), Err(err) => { @@ -150,7 +170,11 @@ impl<'a> AgentsMdManager<'a> { /// concatenation of all discovered docs. If no documentation file is found /// the function returns `Ok(None)`. Unexpected I/O failures bubble up as /// `Err` so callers can decide how to handle them. - async fn read_agents_md(&self, fs: &dyn ExecutorFileSystem) -> io::Result> { + async fn read_agents_md( + &self, + fs: &dyn ExecutorFileSystem, + startup_warnings: &mut Vec, + ) -> io::Result> { let max_total = self.config.project_doc_max_bytes; if max_total == 0 { @@ -182,6 +206,8 @@ impl<'a> AgentsMdManager<'a> { Err(err) if err.kind() == io::ErrorKind::NotFound => continue, Err(err) => return Err(err), }; + warn_invalid_utf8(&p, &data, "Project", startup_warnings); + let size = data.len() as u64; if size > remaining { data.truncate(remaining as usize); @@ -324,6 +350,20 @@ impl<'a> AgentsMdManager<'a> { } } +fn warn_invalid_utf8( + path: &AbsolutePathBuf, + data: &[u8], + source: &str, + startup_warnings: &mut Vec, +) { + if let Err(err) = std::str::from_utf8(data) { + startup_warnings.push(format!( + "{source} AGENTS.md instructions from `{}` contain invalid UTF-8: {err}. Invalid byte sequences were replaced.", + path.display() + )); + } +} + #[cfg(test)] #[path = "agents_md_tests.rs"] mod tests; diff --git a/codex-rs/core/src/agents_md_tests.rs b/codex-rs/core/src/agents_md_tests.rs index a3a7544823..2d6a5fa30b 100644 --- a/codex-rs/core/src/agents_md_tests.rs +++ b/codex-rs/core/src/agents_md_tests.rs @@ -7,12 +7,14 @@ use core_test_support::PathBufExt; use core_test_support::TempDirExt; use pretty_assertions::assert_eq; use std::fs; +use std::path::Path; use std::path::PathBuf; use tempfile::TempDir; async fn get_user_instructions(config: &Config) -> Option { + let mut warnings = Vec::new(); AgentsMdManager::new(config) - .user_instructions_with_fs(LOCAL_FS.as_ref()) + .user_instructions_with_fs(LOCAL_FS.as_ref(), &mut warnings) .await } @@ -22,6 +24,19 @@ async fn agents_md_paths(config: &Config) -> std::io::Result