//! This module is responsible for parsing & validating a patch into a list of "hunks". //! (It does not attempt to actually check that the patch can be applied to the filesystem.) //! //! The official Lark grammar for the apply-patch format is: //! //! start: begin_patch environment_id? hunk+ end_patch //! begin_patch: "*** Begin Patch" LF //! environment_id: "*** Environment ID: " filename LF //! end_patch: "*** End Patch" LF? //! //! hunk: add_hunk | delete_hunk | update_hunk //! add_hunk: "*** Add File: " filename LF add_line+ //! delete_hunk: "*** Delete File: " filename LF //! update_hunk: "*** Update File: " filename LF change_move? change? //! filename: /(.+)/ //! add_line: "+" /(.+)/ LF -> line //! //! change_move: "*** Move to: " filename LF //! change: (change_context | change_line)+ eof_line? //! change_context: ("@@" | "@@ " /(.+)/) LF //! change_line: ("+" | "-" | " ") /(.+)/ LF //! eof_line: "*** End of File" LF //! //! The parser below is a little more lenient than the explicit spec and allows for //! leading/trailing whitespace around patch markers. use crate::ApplyPatchArgs; use codex_utils_absolute_path::AbsolutePathBuf; #[cfg(test)] use codex_utils_absolute_path::test_support::PathBufExt; use std::path::Path; use std::path::PathBuf; use thiserror::Error; pub(crate) const BEGIN_PATCH_MARKER: &str = "*** Begin Patch"; pub(crate) const ENVIRONMENT_ID_MARKER: &str = "*** Environment ID: "; pub(crate) const END_PATCH_MARKER: &str = "*** End Patch"; pub(crate) const ADD_FILE_MARKER: &str = "*** Add File: "; pub(crate) const DELETE_FILE_MARKER: &str = "*** Delete File: "; pub(crate) const UPDATE_FILE_MARKER: &str = "*** Update File: "; pub(crate) const MOVE_TO_MARKER: &str = "*** Move to: "; pub(crate) const EOF_MARKER: &str = "*** End of File"; pub(crate) const CHANGE_CONTEXT_MARKER: &str = "@@ "; pub(crate) const EMPTY_CHANGE_CONTEXT_MARKER: &str = "@@"; /// Currently, the only OpenAI model that knowingly requires lenient parsing is /// gpt-4.1. 
While we could try to require everyone to pass in a strictness /// param when invoking apply_patch, it is a pain to thread it through all of /// the call sites, so we resign ourselves allowing lenient parsing for all /// models. See [`ParseMode::Lenient`] for details on the exceptions we make for /// gpt-4.1. const PARSE_IN_STRICT_MODE: bool = false; #[derive(Debug, PartialEq, Error, Clone)] pub enum ParseError { #[error("invalid patch: {0}")] InvalidPatchError(String), #[error("invalid hunk at line {line_number}, {message}")] InvalidHunkError { message: String, line_number: usize }, } use ParseError::*; #[derive(Debug, PartialEq, Clone)] #[allow(clippy::enum_variant_names)] pub enum Hunk { AddFile { path: PathBuf, contents: String, }, DeleteFile { path: PathBuf, }, UpdateFile { path: PathBuf, move_path: Option, /// Chunks should be in order, i.e. the `change_context` of one chunk /// should occur later in the file than the previous chunk. chunks: Vec, }, } impl Hunk { pub fn resolve_path(&self, cwd: &AbsolutePathBuf) -> AbsolutePathBuf { let path = match self { Hunk::UpdateFile { path, .. } => path, Hunk::AddFile { .. } | Hunk::DeleteFile { .. } => self.path(), }; AbsolutePathBuf::resolve_path_against_base(path, cwd) } /// Returns the path affected by this hunk, using the move destination for rename hunks. pub fn path(&self) -> &Path { match self { Hunk::AddFile { path, .. } => path, Hunk::DeleteFile { path } => path, Hunk::UpdateFile { move_path: Some(path), .. } => path, Hunk::UpdateFile { path, move_path: None, .. } => path, } } } use Hunk::*; #[derive(Debug, PartialEq, Clone)] pub struct UpdateFileChunk { /// A single line of context used to narrow down the position of the chunk /// (this is usually a class, method, or function definition.) pub change_context: Option, /// A contiguous block of lines that should be replaced with `new_lines`. /// `old_lines` must occur strictly after `change_context`. 
pub old_lines: Vec, pub new_lines: Vec, /// If set to true, `old_lines` must occur at the end of the source file. /// (Tolerance around trailing newlines should be encouraged.) pub is_end_of_file: bool, } pub fn parse_patch(patch: &str) -> Result { let mode = if PARSE_IN_STRICT_MODE { ParseMode::Strict } else { ParseMode::Lenient }; parse_patch_text(patch, mode) } enum ParseMode { /// Parse the patch text argument as is. Strict, /// GPT-4.1 is known to formulate the `command` array for the `local_shell` /// tool call for `apply_patch` call using something like the following: /// /// ```json /// [ /// "apply_patch", /// "<<'EOF'\n*** Begin Patch\n*** Update File: README.md\n@@...\n*** End Patch\nEOF\n", /// ] /// ``` /// /// This is a problem because `local_shell` is a bit of a misnomer: the /// `command` is not invoked by passing the arguments to a shell like Bash, /// but are invoked using something akin to `execvpe(3)`. /// /// This is significant in this case because where a shell would interpret /// `<<'EOF'...` as a heredoc and pass the contents via stdin (which is /// fine, as `apply_patch` is specified to read from stdin if no argument is /// passed), `execvpe(3)` interprets the heredoc as a literal string. To get /// the `local_shell` tool to run a command the way shell would, the /// `command` array must be something like: /// /// ```json /// [ /// "bash", /// "-lc", /// "apply_patch <<'EOF'\n*** Begin Patch\n*** Update File: README.md\n@@...\n*** End Patch\nEOF\n", /// ] /// ``` /// /// In lenient mode, we check if the argument to `apply_patch` starts with /// `<<'EOF'` and ends with `EOF\n`. If so, we strip off these markers, /// trim() the result, and treat what is left as the patch text. 
Lenient, } fn parse_patch_text(patch: &str, mode: ParseMode) -> Result { let lines: Vec<&str> = patch.trim().lines().collect(); let (patch_lines, hunk_lines) = match mode { ParseMode::Strict => check_patch_boundaries_strict(&lines)?, ParseMode::Lenient => check_patch_boundaries_lenient(&lines)?, }; let (environment_id, mut remaining_lines, mut line_number) = parse_environment_id_preamble(hunk_lines)?; let mut hunks: Vec = Vec::new(); while !remaining_lines.is_empty() { let (hunk, hunk_lines) = parse_one_hunk(remaining_lines, line_number)?; hunks.push(hunk); line_number += hunk_lines; remaining_lines = &remaining_lines[hunk_lines..] } let patch = patch_lines.join("\n"); Ok(ApplyPatchArgs { hunks, patch, workdir: None, environment_id, }) } fn parse_environment_id_preamble<'a>( hunk_lines: &'a [&'a str], ) -> Result<(Option, &'a [&'a str], usize), ParseError> { let Some(first_line) = hunk_lines.first() else { return Ok((None, hunk_lines, 2)); }; let Some(environment_id) = first_line.trim_start().strip_prefix(ENVIRONMENT_ID_MARKER) else { return Ok((None, hunk_lines, 2)); }; let environment_id = environment_id.trim(); if environment_id.is_empty() { return Err(InvalidPatchError( "apply_patch environment_id cannot be empty".to_string(), )); } Ok((Some(environment_id.to_string()), &hunk_lines[1..], 3)) } /// Checks the start and end lines of the patch text for `apply_patch`, /// returning an error if they do not match the expected markers. 
fn check_patch_boundaries_strict<'a>( lines: &'a [&'a str], ) -> Result<(&'a [&'a str], &'a [&'a str]), ParseError> { let (first_line, last_line) = match lines { [] => (None, None), [first] => (Some(first), Some(first)), [first, .., last] => (Some(first), Some(last)), }; check_start_and_end_lines_strict(first_line, last_line)?; Ok((lines, &lines[1..lines.len() - 1])) } /// If we are in lenient mode, we check if the first line starts with `<( original_lines: &'a [&'a str], ) -> Result<(&'a [&'a str], &'a [&'a str]), ParseError> { let original_parse_error = match check_patch_boundaries_strict(original_lines) { Ok(lines) => return Ok(lines), Err(e) => e, }; match original_lines { [first, .., last] => { if (first == &"<= 4 { let inner_lines = &original_lines[1..original_lines.len() - 1]; check_patch_boundaries_strict(inner_lines) } else { Err(original_parse_error) } } _ => Err(original_parse_error), } } fn check_start_and_end_lines_strict( first_line: Option<&&str>, last_line: Option<&&str>, ) -> Result<(), ParseError> { let first_line = first_line.map(|line| line.trim()); let last_line = last_line.map(|line| line.trim()); match (first_line, last_line) { (Some(first), Some(last)) if first == BEGIN_PATCH_MARKER && last == END_PATCH_MARKER => { Ok(()) } (Some(first), _) if first != BEGIN_PATCH_MARKER => Err(InvalidPatchError(String::from( "The first line of the patch must be '*** Begin Patch'", ))), _ => Err(InvalidPatchError(String::from( "The last line of the patch must be '*** End Patch'", ))), } } /// Attempts to parse a single hunk from the start of lines. /// Returns the parsed hunk and the number of lines parsed (or a ParseError). fn parse_one_hunk(lines: &[&str], line_number: usize) -> Result<(Hunk, usize), ParseError> { let first_line = lines[0].trim(); if let Some(path) = first_line.strip_prefix(ADD_FILE_MARKER) { let mut contents = String::new(); let mut parsed_lines = 1; for add_line in &lines[1..] 
{ if let Some(line_to_add) = add_line.strip_prefix('+') { contents.push_str(line_to_add); contents.push('\n'); parsed_lines += 1; } else { break; } } return Ok(( AddFile { path: PathBuf::from(path), contents, }, parsed_lines, )); } else if let Some(path) = first_line.strip_prefix(DELETE_FILE_MARKER) { return Ok(( DeleteFile { path: PathBuf::from(path), }, 1, )); } else if let Some(path) = first_line.strip_prefix(UPDATE_FILE_MARKER) { let mut remaining_lines = &lines[1..]; let mut parsed_lines = 1; let move_path = remaining_lines .first() .and_then(|x| x.strip_prefix(MOVE_TO_MARKER)); if move_path.is_some() { remaining_lines = &remaining_lines[1..]; parsed_lines += 1; } let mut chunks = Vec::new(); while !remaining_lines.is_empty() { if remaining_lines[0].trim().is_empty() { parsed_lines += 1; remaining_lines = &remaining_lines[1..]; continue; } if remaining_lines[0].starts_with('*') { break; } let (chunk, chunk_lines) = parse_update_file_chunk( remaining_lines, line_number + parsed_lines, chunks.is_empty(), )?; chunks.push(chunk); parsed_lines += chunk_lines; remaining_lines = &remaining_lines[chunk_lines..] } if chunks.is_empty() { return Err(InvalidHunkError { message: format!( "Update file hunk for path '{}' is empty", Path::new(path).display() ), line_number, }); } return Ok(( UpdateFile { path: PathBuf::from(path), move_path: move_path.map(PathBuf::from), chunks, }, parsed_lines, )); } Err(InvalidHunkError { message: format!( "'{first_line}' is not a valid hunk header. 
Valid hunk headers: '*** Add File: {{path}}', '*** Delete File: {{path}}', '*** Update File: {{path}}'" ), line_number, }) } fn parse_update_file_chunk( lines: &[&str], line_number: usize, allow_missing_context: bool, ) -> Result<(UpdateFileChunk, usize), ParseError> { if lines.is_empty() { return Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number, }); } let (change_context, start_index) = if lines[0] == EMPTY_CHANGE_CONTEXT_MARKER { (None, 1) } else if let Some(context) = lines[0].strip_prefix(CHANGE_CONTEXT_MARKER) { (Some(context.to_string()), 1) } else { if !allow_missing_context { return Err(InvalidHunkError { message: format!( "Expected update hunk to start with a @@ context marker, got: '{}'", lines[0] ), line_number, }); } (None, 0) }; if start_index >= lines.len() { return Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number: line_number + 1, }); } let mut chunk = UpdateFileChunk { change_context, old_lines: Vec::new(), new_lines: Vec::new(), is_end_of_file: false, }; let mut parsed_lines = 0; for line in &lines[start_index..] { match *line { EOF_MARKER => { if parsed_lines == 0 { return Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number: line_number + 1, }); } chunk.is_end_of_file = true; parsed_lines += 1; break; } line_contents => { match line_contents.chars().next() { None => { // Interpret this as an empty line. chunk.old_lines.push(String::new()); chunk.new_lines.push(String::new()); } Some(' ') => { chunk.old_lines.push(line_contents[1..].to_string()); chunk.new_lines.push(line_contents[1..].to_string()); } Some('+') => { chunk.new_lines.push(line_contents[1..].to_string()); } Some('-') => { chunk.old_lines.push(line_contents[1..].to_string()); } _ => { if parsed_lines == 0 { return Err(InvalidHunkError { message: format!( "Unexpected line found in update hunk: '{line_contents}'. 
Every line should start with ' ' (context line), '+' (added line), or '-' (removed line)" ), line_number: line_number + 1, }); } // Assume this is the start of the next hunk. break; } } parsed_lines += 1; } } } Ok((chunk, parsed_lines + start_index)) } #[test] fn test_parse_one_hunk() { assert_eq!( parse_one_hunk(&["bad"], /*line_number*/ 234), Err(InvalidHunkError { message: "'bad' is not a valid hunk header. \ Valid hunk headers: '*** Add File: {path}', '*** Delete File: {path}', '*** Update File: {path}'".to_string(), line_number: 234 }) ); } #[test] fn test_update_file_chunk() { assert_eq!( parse_update_file_chunk( &["bad"], /*line_number*/ 123, /*allow_missing_context*/ false, ), Err(InvalidHunkError { message: "Expected update hunk to start with a @@ context marker, got: 'bad'" .to_string(), line_number: 123 }) ); assert_eq!( parse_update_file_chunk( &["@@"], /*line_number*/ 123, /*allow_missing_context*/ false, ), Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number: 124 }) ); assert_eq!( parse_update_file_chunk( &["@@", "bad"], /*line_number*/ 123, /*allow_missing_context*/ false, ), Err(InvalidHunkError { message: "Unexpected line found in update hunk: 'bad'. 
Every line should start with ' ' (context line), '+' (added line), or '-' (removed line)".to_string(), line_number: 124 }) ); assert_eq!( parse_update_file_chunk( &["@@", "*** End of File"], /*line_number*/ 123, /*allow_missing_context*/ false, ), Err(InvalidHunkError { message: "Update hunk does not contain any lines".to_string(), line_number: 124 }) ); assert_eq!( parse_update_file_chunk( &[ "@@ change_context", "", " context", "-remove", "+add", " context2", "*** End Patch", ], /*line_number*/ 123, /*allow_missing_context*/ false, ), Ok(( UpdateFileChunk { change_context: Some("change_context".to_string()), old_lines: vec![ String::new(), "context".to_string(), "remove".to_string(), "context2".to_string(), ], new_lines: vec![ String::new(), "context".to_string(), "add".to_string(), "context2".to_string(), ], is_end_of_file: false, }, 6, )) ); assert_eq!( parse_update_file_chunk( &["@@", "+line", "*** End of File"], /*line_number*/ 123, /*allow_missing_context*/ false, ), Ok(( UpdateFileChunk { change_context: None, old_lines: Vec::new(), new_lines: vec!["line".to_string()], is_end_of_file: true, }, 3, )) ); } #[test] fn test_parse_patch() { assert_eq!( parse_patch_text("bad", ParseMode::Strict), Err(InvalidPatchError( "The first line of the patch must be '*** Begin Patch'".to_string() )) ); assert_eq!( parse_patch_text("*** Begin Patch\nbad", ParseMode::Strict), Err(InvalidPatchError( "The last line of the patch must be '*** End Patch'".to_string() )) ); assert_eq!( parse_patch_text( concat!( "*** Begin Patch", " ", "\n*** Add File: foo\n+hi\n", " ", "*** End Patch" ), ParseMode::Strict ) .unwrap() .hunks, vec![AddFile { path: PathBuf::from("foo"), contents: "hi\n".to_string() }] ); assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** Update File: test.py\n\ *** End Patch", ParseMode::Strict ), Err(InvalidHunkError { message: "Update file hunk for path 'test.py' is empty".to_string(), line_number: 2, }) ); assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** 
End Patch", ParseMode::Strict ) .unwrap() .hunks, Vec::new() ); assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** Add File: path/add.py\n\ +abc\n\ +def\n\ *** Delete File: path/delete.py\n\ *** Update File: path/update.py\n\ *** Move to: path/update2.py\n\ @@ def f():\n\ - pass\n\ + return 123\n\ *** End Patch", ParseMode::Strict ) .unwrap() .hunks, vec![ AddFile { path: PathBuf::from("path/add.py"), contents: "abc\ndef\n".to_string() }, DeleteFile { path: PathBuf::from("path/delete.py") }, UpdateFile { path: PathBuf::from("path/update.py"), move_path: Some(PathBuf::from("path/update2.py")), chunks: vec![UpdateFileChunk { change_context: Some("def f():".to_string()), old_lines: vec![" pass".to_string()], new_lines: vec![" return 123".to_string()], is_end_of_file: false }] } ] ); // Update hunk followed by another hunk (Add File). assert_eq!( parse_patch_text( "*** Begin Patch\n\ *** Update File: file.py\n\ @@\n\ +line\n\ *** Add File: other.py\n\ +content\n\ *** End Patch", ParseMode::Strict ) .unwrap() .hunks, vec![ UpdateFile { path: PathBuf::from("file.py"), move_path: None, chunks: vec![UpdateFileChunk { change_context: None, old_lines: vec![], new_lines: vec!["line".to_string()], is_end_of_file: false }], }, AddFile { path: PathBuf::from("other.py"), contents: "content\n".to_string() } ] ); // Update hunk without an explicit @@ header for the first chunk should parse. // Use a raw string to preserve the leading space diff marker on the context line. 
    assert_eq!(
        parse_patch_text(
            r#"*** Begin Patch
*** Update File: file2.py
 import foo
+bar
*** End Patch"#,
            ParseMode::Strict
        )
        .unwrap()
        .hunks,
        vec![UpdateFile {
            path: PathBuf::from("file2.py"),
            move_path: None,
            chunks: vec![UpdateFileChunk {
                change_context: None,
                old_lines: vec!["import foo".to_string()],
                new_lines: vec!["import foo".to_string(), "bar".to_string()],
                is_end_of_file: false,
            }],
        }]
    );
}

// Hunks may name files with either relative or absolute paths; both forms
// must round-trip through the parser unchanged.
#[test]
fn test_parse_patch_accepts_relative_and_absolute_hunk_paths() {
    let dir = tempfile::tempdir().unwrap();
    let absolute_delete = dir.path().join("absolute-delete.py").abs();
    let absolute_update = dir.path().join("absolute-update.py").abs();
    let patch_text = format!(
        r#"*** Begin Patch
*** Add File: relative-add.py
+content
*** Delete File: {}
*** Update File: {}
@@
-old
+new
*** End Patch"#,
        absolute_delete.display(),
        absolute_update.display()
    );
    assert_eq!(
        parse_patch_text(&patch_text, ParseMode::Strict)
            .unwrap()
            .hunks,
        vec![
            AddFile {
                path: PathBuf::from("relative-add.py"),
                contents: "content\n".to_string()
            },
            DeleteFile {
                path: absolute_delete.to_path_buf()
            },
            UpdateFile {
                path: absolute_update.to_path_buf(),
                move_path: None,
                chunks: vec![UpdateFileChunk {
                    change_context: None,
                    old_lines: vec!["old".to_string()],
                    new_lines: vec!["new".to_string()],
                    is_end_of_file: false
                }]
            },
        ]
    );
}

// resolve_path: relative hunk paths resolve against cwd; absolute hunk paths
// are returned as-is.
#[test]
fn test_hunk_resolve_path_accepts_relative_and_absolute_paths() {
    let cwd_dir = tempfile::tempdir().unwrap();
    let cwd = cwd_dir.path().to_path_buf().abs();
    let absolute_dir = tempfile::tempdir().unwrap();
    let absolute_add = absolute_dir.path().join("absolute-add.py").abs();
    let absolute_delete = absolute_dir.path().join("absolute-delete.py").abs();
    let absolute_update = absolute_dir.path().join("absolute-update.py").abs();
    for (hunk, expected_path) in [
        (
            AddFile {
                path: PathBuf::from("relative-add.py"),
                contents: String::new(),
            },
            cwd.join("relative-add.py"),
        ),
        (
            DeleteFile {
                path: PathBuf::from("relative-delete.py"),
            },
            cwd.join("relative-delete.py"),
        ),
        (
            UpdateFile {
                path: PathBuf::from("relative-update.py"),
                move_path: None,
                chunks: Vec::new(),
            },
            cwd.join("relative-update.py"),
        ),
        (
            AddFile {
                path: absolute_add.to_path_buf(),
                contents: String::new(),
            },
            absolute_add,
        ),
        (
            DeleteFile {
                path: absolute_delete.to_path_buf(),
            },
            absolute_delete,
        ),
        (
            UpdateFile {
                path: absolute_update.to_path_buf(),
                move_path: None,
                chunks: Vec::new(),
            },
            absolute_update,
        ),
    ] {
        assert_eq!(hunk.resolve_path(&cwd), expected_path);
    }
}

#[test]
fn test_parse_patch_lenient() {
    let patch_text = r#"*** Begin Patch
*** Update File: file2.py
 import foo
+bar
*** End Patch"#;
    let expected_patch = vec![UpdateFile {
        path: PathBuf::from("file2.py"),
        move_path: None,
        chunks: vec![UpdateFileChunk {
            change_context: None,
            old_lines: vec!["import foo".to_string()],
            new_lines: vec!["import foo".to_string(), "bar".to_string()],
            is_end_of_file: false,
        }],
    }];
    let expected_error =
        InvalidPatchError("The first line of the patch must be '*** Begin Patch'".to_string());
    /* NOTE(review): SOURCE is truncated/corrupted mid-expression below (the text
       between this `<` and the end of the visible file has been eaten). Confirm
       the heredoc fixture — presumably `format!("<<'EOF'\n{patch_text}\nEOF\n")` —
       and its strict-vs-lenient assertions against the repository copy. */
    let patch_text_in_heredoc = format!("<