//! This module is responsible for parsing & validating a patch into a list of "hunks". //! (It does not attempt to actually check that the patch can be applied to the filesystem.) //! //! The official Lark grammar for the apply-patch format is: //! //! start: begin_patch hunk+ end_patch //! begin_patch: "*** Begin Patch" LF //! end_patch: "*** End Patch" LF? //! //! hunk: add_hunk | delete_hunk | update_hunk //! add_hunk: "*** Add File: " filename LF add_line+ //! delete_hunk: "*** Delete File: " filename LF //! update_hunk: "*** Update File: " filename LF change_move? change? //! filename: /(.+)/ //! add_line: "+" /(.+)/ LF -> line //! //! change_move: "*** Move to: " filename LF //! change: (change_context | change_line)+ eof_line? //! change_context: ("@@" | "@@ " /(.+)/) LF //! change_line: ("+" | "-" | " ") /(.+)/ LF //! eof_line: "*** End of File" LF //! //! The parser below is a little more lenient than the explicit spec and allows for //! leading/trailing whitespace around patch markers. use crate::ApplyPatchArgs; use codex_utils_absolute_path::AbsolutePathBuf; #[cfg(test)] use codex_utils_absolute_path::test_support::PathBufExt; use std::path::Path; use std::path::PathBuf; use thiserror::Error; const BEGIN_PATCH_MARKER: &str = "*** Begin Patch"; const END_PATCH_MARKER: &str = "*** End Patch"; const ADD_FILE_MARKER: &str = "*** Add File: "; const DELETE_FILE_MARKER: &str = "*** Delete File: "; const UPDATE_FILE_MARKER: &str = "*** Update File: "; const MOVE_TO_MARKER: &str = "*** Move to: "; const EOF_MARKER: &str = "*** End of File"; const CHANGE_CONTEXT_MARKER: &str = "@@ "; const EMPTY_CHANGE_CONTEXT_MARKER: &str = "@@"; /// Currently, the only OpenAI model that knowingly requires lenient parsing is /// gpt-4.1. While we could try to require everyone to pass in a strictness /// param when invoking apply_patch, it is a pain to thread it through all of /// the call sites, so we resign ourselves allowing lenient parsing for all /// models. See [`ParseMode::Lenient`] for details on the exceptions we make for /// gpt-4.1. const PARSE_IN_STRICT_MODE: bool = false; #[derive(Debug, PartialEq, Error, Clone)] pub enum ParseError { #[error("invalid patch: {0}")] InvalidPatchError(String), #[error("invalid hunk at line {line_number}, {message}")] InvalidHunkError { message: String, line_number: usize }, } use ParseError::*; #[derive(Debug, PartialEq, Clone)] #[allow(clippy::enum_variant_names)] pub enum Hunk { AddFile { path: PathBuf, contents: String, }, DeleteFile { path: PathBuf, }, UpdateFile { path: PathBuf, move_path: Option, /// Chunks should be in order, i.e. the `change_context` of one chunk /// should occur later in the file than the previous chunk. chunks: Vec, }, } impl Hunk { pub fn resolve_path(&self, cwd: &AbsolutePathBuf) -> AbsolutePathBuf { let path = match self { Hunk::UpdateFile { path, .. } => path, Hunk::AddFile { .. } | Hunk::DeleteFile { .. } => self.path(), }; AbsolutePathBuf::resolve_path_against_base(path, cwd) } /// Returns the path affected by this hunk, using the move destination for rename hunks. pub fn path(&self) -> &Path { match self { Hunk::AddFile { path, .. } => path, Hunk::DeleteFile { path } => path, Hunk::UpdateFile { move_path: Some(path), .. } => path, Hunk::UpdateFile { path, move_path: None, .. } => path, } } } use Hunk::*; #[derive(Debug, PartialEq, Clone)] pub struct UpdateFileChunk { /// A single line of context used to narrow down the position of the chunk /// (this is usually a class, method, or function definition.) pub change_context: Option, /// A contiguous block of lines that should be replaced with `new_lines`. /// `old_lines` must occur strictly after `change_context`. pub old_lines: Vec, pub new_lines: Vec, /// If set to true, `old_lines` must occur at the end of the source file. /// (Tolerance around trailing newlines should be encouraged.) pub is_end_of_file: bool, } pub fn parse_patch(patch: &str) -> Result { let mode = if PARSE_IN_STRICT_MODE { ParseMode::Strict } else { ParseMode::Lenient }; parse_patch_text(patch, mode) } /// Parses streamed patch text that may not have reached `*** End Patch` yet. /// /// This entry point is for progress reporting only; callers must not use its /// output to apply a patch. pub fn parse_patch_streaming(patch: &str) -> Result { parse_patch_text(patch, ParseMode::Streaming) } enum ParseMode { /// Parse the patch text argument as is. Strict, /// GPT-4.1 is known to formulate the `command` array for the `local_shell` /// tool call for `apply_patch` call using something like the following: /// /// ```json /// [ /// "apply_patch", /// "<<'EOF'\n*** Begin Patch\n*** Update File: README.md\n@@...\n*** End Patch\nEOF\n", /// ] /// ``` /// /// This is a problem because `local_shell` is a bit of a misnomer: the /// `command` is not invoked by passing the arguments to a shell like Bash, /// but are invoked using something akin to `execvpe(3)`. /// /// This is significant in this case because where a shell would interpret /// `<<'EOF'...` as a heredoc and pass the contents via stdin (which is /// fine, as `apply_patch` is specified to read from stdin if no argument is /// passed), `execvpe(3)` interprets the heredoc as a literal string. To get /// the `local_shell` tool to run a command the way shell would, the /// `command` array must be something like: /// /// ```json /// [ /// "bash", /// "-lc", /// "apply_patch <<'EOF'\n*** Begin Patch\n*** Update File: README.md\n@@...\n*** End Patch\nEOF\n", /// ] /// ``` /// /// In lenient mode, we check if the argument to `apply_patch` starts with /// `<<'EOF'` and ends with `EOF\n`. If so, we strip off these markers, /// trim() the result, and treat what is left as the patch text. Lenient, /// Parse partial patch text for progress reporting while the model is /// still streaming tool input. This mode requires a begin marker but does /// not require an end marker, and its output must not be used to apply a /// patch. Streaming, } fn parse_patch_text(patch: &str, mode: ParseMode) -> Result { let lines: Vec<&str> = patch.trim().lines().collect(); let (patch_lines, hunk_lines) = match mode { ParseMode::Strict => check_patch_boundaries_strict(&lines)?, ParseMode::Lenient => check_patch_boundaries_lenient(&lines)?, ParseMode::Streaming => check_patch_boundaries_streaming(&lines)?, }; let mut hunks: Vec = Vec::new(); let mut remaining_lines = hunk_lines; let mut line_number = 2; let allow_incomplete = matches!(mode, ParseMode::Streaming); while !remaining_lines.is_empty() { let (hunk, hunk_lines) = parse_one_hunk(remaining_lines, line_number, allow_incomplete)?; hunks.push(hunk); line_number += hunk_lines; remaining_lines = &remaining_lines[hunk_lines..] } let patch = patch_lines.join("\n"); Ok(ApplyPatchArgs { hunks, patch, workdir: None, }) } fn check_patch_boundaries_streaming<'a>( original_lines: &'a [&'a str], ) -> Result<(&'a [&'a str], &'a [&'a str]), ParseError> { match original_lines { [first, ..] if first.trim() == BEGIN_PATCH_MARKER => { let body_lines = if original_lines .last() .is_some_and(|line| line.trim() == END_PATCH_MARKER) { &original_lines[1..original_lines.len() - 1] } else { &original_lines[1..] }; Ok((original_lines, body_lines)) } _ => check_patch_boundaries_strict(original_lines), } } /// Checks the start and end lines of the patch text for `apply_patch`, /// returning an error if they do not match the expected markers. fn check_patch_boundaries_strict<'a>( lines: &'a [&'a str], ) -> Result<(&'a [&'a str], &'a [&'a str]), ParseError> { let (first_line, last_line) = match lines { [] => (None, None), [first] => (Some(first), Some(first)), [first, .., last] => (Some(first), Some(last)), }; check_start_and_end_lines_strict(first_line, last_line)?; Ok((lines, &lines[1..lines.len() - 1])) } /// If we are in lenient mode, we check if the first line starts with `<( original_lines: &'a [&'a str], ) -> Result<(&'a [&'a str], &'a [&'a str]), ParseError> { let original_parse_error = match check_patch_boundaries_strict(original_lines) { Ok(lines) => return Ok(lines), Err(e) => e, }; match original_lines { [first, .., last] => { if (first == &"<= 4 { let inner_lines = &original_lines[1..original_lines.len() - 1]; check_patch_boundaries_strict(inner_lines) } else { Err(original_parse_error) } } _ => Err(original_parse_error), } } fn check_start_and_end_lines_strict( first_line: Option<&&str>, last_line: Option<&&str>, ) -> Result<(), ParseError> { let first_line = first_line.map(|line| line.trim()); let last_line = last_line.map(|line| line.trim()); match (first_line, last_line) { (Some(first), Some(last)) if first == BEGIN_PATCH_MARKER && last == END_PATCH_MARKER => { Ok(()) } (Some(first), _) if first != BEGIN_PATCH_MARKER => Err(InvalidPatchError(String::from( "The first line of the patch must be '*** Begin Patch'", ))), _ => Err(InvalidPatchError(String::from( "The last line of the patch must be '*** End Patch'", ))), } } /// Attempts to parse a single hunk from the start of lines. /// Returns the parsed hunk and the number of lines parsed (or a ParseError). fn parse_one_hunk( lines: &[&str], line_number: usize, allow_incomplete: bool, ) -> Result<(Hunk, usize), ParseError> { // Be tolerant of case mismatches and extra padding around marker strings. let first_line = lines[0].trim(); if let Some(path) = first_line.strip_prefix(ADD_FILE_MARKER) { // Add File let mut contents = String::new(); let mut parsed_lines = 1; for add_line in &lines[1..] { if let Some(line_to_add) = add_line.strip_prefix('+') { contents.push_str(line_to_add); contents.push('\n'); parsed_lines += 1; } else { break; } } return Ok(( AddFile { path: PathBuf::from(path), contents, }, parsed_lines, )); } else if let Some(path) = first_line.strip_prefix(DELETE_FILE_MARKER) { // Delete File return Ok(( DeleteFile { path: PathBuf::from(path), }, 1, )); } else if let Some(path) = first_line.strip_prefix(UPDATE_FILE_MARKER) { // Update File let mut remaining_lines = &lines[1..]; let mut parsed_lines = 1; // Optional: move file line let move_path = remaining_lines .first() .and_then(|x| x.strip_prefix(MOVE_TO_MARKER)); if move_path.is_some() { remaining_lines = &remaining_lines[1..]; parsed_lines += 1; } let mut chunks = Vec::new(); // NOTE: we need to know to stop once we reach the next special marker header. while !remaining_lines.is_empty() { // Skip over any completely blank lines that may separate chunks. if remaining_lines[0].trim().is_empty() { parsed_lines += 1; remaining_lines = &remaining_lines[1..]; continue; } if remaining_lines[0].starts_with('*') { break; } if allow_incomplete && remaining_lines[0] == "@" { break; } let parsed_chunk = parse_update_file_chunk( remaining_lines, line_number + parsed_lines, chunks.is_empty(), ); let (chunk, chunk_lines) = match parsed_chunk { Ok(parsed) => parsed, Err(InvalidHunkError { .. }) if allow_incomplete && !chunks.is_empty() => { break; } Err(err) => return Err(err), }; chunks.push(chunk); parsed_lines += chunk_lines; remaining_lines = &remaining_lines[chunk_lines..] } if chunks.is_empty() { return Err(InvalidHunkError { message: format!("Update file hunk for path '{path}' is empty"), line_number, }); } return Ok(( UpdateFile { path: PathBuf::from(path), move_path: move_path.map(PathBuf::from), chunks, }, parsed_lines, )); } Err(InvalidHunkError { message: format!( "'{first_line}' is not a valid hunk header. Valid hunk headers: '*** Add File: {{path}}', '*** Delete File: {{path}}', '*** Update File: {{path}}'" ), line_number, }) } fn parse_update_file_chunk( lines: &[&str], line_number: usize, allow_missing_context: bool, ) -> Result<(UpdateFileChunk, usize), ParseError> { if lines.is_empty() { return Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number, }); } // If we see an explicit context marker @@ or @@ , consume it; otherwise, optionally // allow treating the chunk as starting directly with diff lines. let (change_context, start_index) = if lines[0] == EMPTY_CHANGE_CONTEXT_MARKER { (None, 1) } else if let Some(context) = lines[0].strip_prefix(CHANGE_CONTEXT_MARKER) { (Some(context.to_string()), 1) } else { if !allow_missing_context { return Err(InvalidHunkError { message: format!( "Expected update hunk to start with a @@ context marker, got: '{}'", lines[0] ), line_number, }); } (None, 0) }; if start_index >= lines.len() { return Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number: line_number + 1, }); } let mut chunk = UpdateFileChunk { change_context, old_lines: Vec::new(), new_lines: Vec::new(), is_end_of_file: false, }; let mut parsed_lines = 0; for line in &lines[start_index..] { match *line { EOF_MARKER => { if parsed_lines == 0 { return Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number: line_number + 1, }); } chunk.is_end_of_file = true; parsed_lines += 1; break; } line_contents => { match line_contents.chars().next() { None => { // Interpret this as an empty line. chunk.old_lines.push(String::new()); chunk.new_lines.push(String::new()); } Some(' ') => { chunk.old_lines.push(line_contents[1..].to_string()); chunk.new_lines.push(line_contents[1..].to_string()); } Some('+') => { chunk.new_lines.push(line_contents[1..].to_string()); } Some('-') => { chunk.old_lines.push(line_contents[1..].to_string()); } _ => { if parsed_lines == 0 { return Err(InvalidHunkError { message: format!( "Unexpected line found in update hunk: '{line_contents}'. Every line should start with ' ' (context line), '+' (added line), or '-' (removed line)" ), line_number: line_number + 1, }); } // Assume this is the start of the next hunk. break; } } parsed_lines += 1; } } } Ok((chunk, parsed_lines + start_index)) } #[test] fn test_parse_patch_streaming() { assert_eq!( parse_patch_streaming("*** Begin Patch\n*** Add File: src/hello.txt\n+hello\n+wor"), Ok(ApplyPatchArgs { hunks: vec![AddFile { path: PathBuf::from("src/hello.txt"), contents: "hello\nwor\n".to_string(), }], patch: "*** Begin Patch\n*** Add File: src/hello.txt\n+hello\n+wor".to_string(), workdir: None, }) ); assert_eq!( parse_patch_streaming( "*** Begin Patch\n*** Update File: src/old.rs\n*** Move to: src/new.rs\n@@\n-old\n+new", ), Ok(ApplyPatchArgs { hunks: vec![UpdateFile { path: PathBuf::from("src/old.rs"), move_path: Some(PathBuf::from("src/new.rs")), chunks: vec![UpdateFileChunk { change_context: None, old_lines: vec!["old".to_string()], new_lines: vec!["new".to_string()], is_end_of_file: false, }], }], patch: "*** Begin Patch\n*** Update File: src/old.rs\n*** Move to: src/new.rs\n@@\n-old\n+new".to_string(), workdir: None, }) ); assert!( parse_patch_text( "*** Begin Patch\n*** Delete File: gone.txt", ParseMode::Streaming ) .is_ok() ); assert!( parse_patch_text( "*** Begin Patch\n*** Delete File: gone.txt", ParseMode::Strict ) .is_err() ); assert_eq!( parse_patch_streaming( "*** Begin Patch\n*** Add File: src/one.txt\n+one\n*** Delete File: src/two.txt\n", ), Ok(ApplyPatchArgs { hunks: vec![ AddFile { path: PathBuf::from("src/one.txt"), contents: "one\n".to_string(), }, DeleteFile { path: PathBuf::from("src/two.txt"), }, ], patch: "*** Begin Patch\n*** Add File: src/one.txt\n+one\n*** Delete File: src/two.txt" .to_string(), workdir: None, }) ); } #[test] fn test_parse_patch_streaming_large_patch_by_character() { let patch = "\ *** Begin Patch *** Add File: docs/release-notes.md +# Release notes + +## CLI +- Surface apply_patch progress while arguments stream. +- Keep final patch application gated on the completed tool call. +- Include file summaries in the progress event payload. *** Update File: src/config.rs @@ impl Config - pub apply_patch_progress: bool, + pub stream_apply_patch_progress: bool, pub include_diagnostics: bool, @@ fn default_progress_interval() - Duration::from_millis(500) + Duration::from_millis(250) *** Delete File: src/legacy_patch_progress.rs *** Update File: crates/cli/src/main.rs *** Move to: crates/cli/src/bin/codex.rs @@ fn run() - let args = Args::parse(); - dispatch(args) + let cli = Cli::parse(); + dispatch(cli) *** Add File: tests/fixtures/apply_patch_progress.json +{ + \"type\": \"apply_patch_progress\", + \"hunks\": [ + { \"operation\": \"add\", \"path\": \"docs/release-notes.md\" }, + { \"operation\": \"update\", \"path\": \"src/config.rs\" } + ] +} *** Update File: README.md @@ Development workflow Build the Rust workspace before opening a pull request. +When touching streamed tool calls, include parser coverage for partial input. +Prefer tests that exercise the exact event payload shape. *** Delete File: docs/old-apply-patch-progress.md *** End Patch"; let mut max_hunk_count = 0; let mut saw_hunk_counts = Vec::new(); for i in 1..=patch.len() { let partial = &patch[..i]; if let Ok(parsed) = parse_patch_streaming(partial) { let hunk_count = parsed.hunks.len(); assert!( hunk_count >= max_hunk_count, "hunk count should never decrease while streaming: {hunk_count} < {max_hunk_count} for {partial:?}", ); if hunk_count > max_hunk_count { saw_hunk_counts.push(hunk_count); max_hunk_count = hunk_count; } } } assert_eq!(saw_hunk_counts, vec![1, 2, 3, 4, 5, 6, 7]); let parsed = parse_patch_streaming(patch).unwrap(); assert_eq!(parsed.hunks.len(), 7); assert_eq!( parsed .hunks .iter() .map(|hunk| match hunk { AddFile { .. } => "add", DeleteFile { .. } => "delete", UpdateFile { move_path: Some(_), .. } => "move-update", UpdateFile { move_path: None, .. } => "update", }) .collect::>(), vec![ "add", "update", "delete", "move-update", "add", "update", "delete" ] ); } #[test] fn test_parse_patch() { assert_eq!( parse_patch_text("bad", ParseMode::Strict), Err(InvalidPatchError( "The first line of the patch must be '*** Begin Patch'".to_string() )) ); assert_eq!( parse_patch_text("*** Begin Patch\nbad", ParseMode::Strict), Err(InvalidPatchError( "The last line of the patch must be '*** End Patch'".to_string() )) ); assert_eq!( parse_patch_text( concat!( "*** Begin Patch", " ", "\n*** Add File: foo\n+hi\n", " ", "*** End Patch" ), ParseMode::Strict ) .unwrap() .hunks, vec![AddFile { path: PathBuf::from("foo"), contents: "hi\n".to_string() }] ); assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** Update File: test.py\n\ *** End Patch", ParseMode::Strict ), Err(InvalidHunkError { message: "Update file hunk for path 'test.py' is empty".to_string(), line_number: 2, }) ); assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** End Patch", ParseMode::Strict ) .unwrap() .hunks, Vec::new() ); assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** Add File: path/add.py\n\ +abc\n\ +def\n\ *** Delete File: path/delete.py\n\ *** Update File: path/update.py\n\ *** Move to: path/update2.py\n\ @@ def f():\n\ - pass\n\ + return 123\n\ *** End Patch", ParseMode::Strict ) .unwrap() .hunks, vec![ AddFile { path: PathBuf::from("path/add.py"), contents: "abc\ndef\n".to_string() }, DeleteFile { path: PathBuf::from("path/delete.py") }, UpdateFile { path: PathBuf::from("path/update.py"), move_path: Some(PathBuf::from("path/update2.py")), chunks: vec![UpdateFileChunk { change_context: Some("def f():".to_string()), old_lines: vec![" pass".to_string()], new_lines: vec![" return 123".to_string()], is_end_of_file: false }] } ] ); // Update hunk followed by another hunk (Add File). assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** Update File: file.py\n\ @@\n\ +line\n\ *** Add File: other.py\n\ +content\n\ *** End Patch", ParseMode::Strict ) .unwrap() .hunks, vec![ UpdateFile { path: PathBuf::from("file.py"), move_path: None, chunks: vec![UpdateFileChunk { change_context: None, old_lines: vec![], new_lines: vec!["line".to_string()], is_end_of_file: false }], }, AddFile { path: PathBuf::from("other.py"), contents: "content\n".to_string() } ] ); // Update hunk without an explicit @@ header for the first chunk should parse. // Use a raw string to preserve the leading space diff marker on the context line. assert_eq!( parse_patch_text( r#"*** Begin Patch *** Update File: file2.py import foo +bar *** End Patch"#, ParseMode::Strict ) .unwrap() .hunks, vec![UpdateFile { path: PathBuf::from("file2.py"), move_path: None, chunks: vec![UpdateFileChunk { change_context: None, old_lines: vec!["import foo".to_string()], new_lines: vec!["import foo".to_string(), "bar".to_string()], is_end_of_file: false, }], }] ); } #[test] fn test_parse_patch_accepts_relative_and_absolute_hunk_paths() { let dir = tempfile::tempdir().unwrap(); let absolute_delete = dir.path().join("absolute-delete.py").abs(); let absolute_update = dir.path().join("absolute-update.py").abs(); let patch_text = format!( r#"*** Begin Patch *** Add File: relative-add.py +content *** Delete File: {} *** Update File: {} @@ -old +new *** End Patch"#, absolute_delete.display(), absolute_update.display() ); assert_eq!( parse_patch_text(&patch_text, ParseMode::Strict) .unwrap() .hunks, vec![ AddFile { path: PathBuf::from("relative-add.py"), contents: "content\n".to_string() }, DeleteFile { path: absolute_delete.to_path_buf() }, UpdateFile { path: absolute_update.to_path_buf(), move_path: None, chunks: vec![UpdateFileChunk { change_context: None, old_lines: vec!["old".to_string()], new_lines: vec!["new".to_string()], is_end_of_file: false }] }, ] ); } #[test] fn test_hunk_resolve_path_accepts_relative_and_absolute_paths() { let cwd_dir = tempfile::tempdir().unwrap(); let cwd = cwd_dir.path().to_path_buf().abs(); let absolute_dir = tempfile::tempdir().unwrap(); let absolute_add = absolute_dir.path().join("absolute-add.py").abs(); let absolute_delete = absolute_dir.path().join("absolute-delete.py").abs(); let absolute_update = absolute_dir.path().join("absolute-update.py").abs(); for (hunk, expected_path) in [ ( AddFile { path: PathBuf::from("relative-add.py"), contents: String::new(), }, cwd.join("relative-add.py"), ), ( DeleteFile { path: PathBuf::from("relative-delete.py"), }, cwd.join("relative-delete.py"), ), ( UpdateFile { path: PathBuf::from("relative-update.py"), move_path: None, chunks: Vec::new(), }, cwd.join("relative-update.py"), ), ( AddFile { path: absolute_add.to_path_buf(), contents: String::new(), }, absolute_add, ), ( DeleteFile { path: absolute_delete.to_path_buf(), }, absolute_delete, ), ( UpdateFile { path: absolute_update.to_path_buf(), move_path: None, chunks: Vec::new(), }, absolute_update, ), ] { assert_eq!(hunk.resolve_path(&cwd), expected_path); } } #[test] fn test_parse_patch_lenient() { let patch_text = r#"*** Begin Patch *** Update File: file2.py import foo +bar *** End Patch"#; let expected_patch = vec![UpdateFile { path: PathBuf::from("file2.py"), move_path: None, chunks: vec![UpdateFileChunk { change_context: None, old_lines: vec!["import foo".to_string()], new_lines: vec!["import foo".to_string(), "bar".to_string()], is_end_of_file: false, }], }]; let expected_error = InvalidPatchError("The first line of the patch must be '*** Begin Patch'".to_string()); let patch_text_in_heredoc = format!("<