mirror of
https://github.com/openai/codex.git
synced 2026-04-24 22:54:54 +00:00
eval-case: implement eval capture bundles
This commit is contained in:
15
codex-rs/Cargo.lock
generated
15
codex-rs/Cargo.lock
generated
@@ -1000,6 +1000,7 @@ dependencies = [
|
||||
"codex-backend-client",
|
||||
"codex-common",
|
||||
"codex-core",
|
||||
"codex-eval-case",
|
||||
"codex-feedback",
|
||||
"codex-file-search",
|
||||
"codex-login",
|
||||
@@ -1350,6 +1351,19 @@ dependencies = [
|
||||
"wiremock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "codex-eval-case"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"pretty_assertions",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"time",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "codex-exec"
|
||||
version = "0.0.0"
|
||||
@@ -1724,6 +1738,7 @@ dependencies = [
|
||||
"codex-backend-client",
|
||||
"codex-common",
|
||||
"codex-core",
|
||||
"codex-eval-case",
|
||||
"codex-feedback",
|
||||
"codex-file-search",
|
||||
"codex-login",
|
||||
|
||||
@@ -19,6 +19,7 @@ members = [
|
||||
"exec-server",
|
||||
"execpolicy",
|
||||
"execpolicy-legacy",
|
||||
"eval-case",
|
||||
"keyring-store",
|
||||
"file-search",
|
||||
"linux-sandbox",
|
||||
@@ -75,6 +76,7 @@ codex-common = { path = "common" }
|
||||
codex-core = { path = "core" }
|
||||
codex-exec = { path = "exec" }
|
||||
codex-execpolicy = { path = "execpolicy" }
|
||||
codex-eval-case = { path = "eval-case" }
|
||||
codex-feedback = { path = "feedback" }
|
||||
codex-file-search = { path = "file-search" }
|
||||
codex-git = { path = "utils/git" }
|
||||
|
||||
@@ -174,6 +174,11 @@ client_request_definitions! {
|
||||
response: v2::FeedbackUploadResponse,
|
||||
},
|
||||
|
||||
EvalCaseCreate => "evalCase/create" {
|
||||
params: v2::EvalCaseCreateParams,
|
||||
response: v2::EvalCaseCreateResponse,
|
||||
},
|
||||
|
||||
/// Execute a command (argv vector) under the server's sandbox.
|
||||
OneOffCommandExec => "command/exec" {
|
||||
params: v2::CommandExecParams,
|
||||
|
||||
@@ -930,6 +930,50 @@ pub struct FeedbackUploadResponse {
|
||||
pub thread_id: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
#[ts(export_to = "v2/")]
|
||||
pub enum EvalCaseStartMarkerKind {
|
||||
RolloutLineTimestamp,
|
||||
RolloutLineIndex,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
#[serde(untagged)]
|
||||
#[ts(export_to = "v2/")]
|
||||
pub enum EvalCaseStartMarkerValue {
|
||||
Timestamp(String),
|
||||
LineIndex(u64),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
#[ts(export_to = "v2/")]
|
||||
pub struct EvalCaseStartMarker {
|
||||
pub kind: EvalCaseStartMarkerKind,
|
||||
pub value: EvalCaseStartMarkerValue,
|
||||
pub display: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
#[ts(export_to = "v2/")]
|
||||
pub struct EvalCaseCreateParams {
|
||||
pub thread_id: String,
|
||||
pub start: EvalCaseStartMarker,
|
||||
pub what_went_wrong: String,
|
||||
pub what_good_looks_like: String,
|
||||
pub include_logs: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
#[ts(export_to = "v2/")]
|
||||
pub struct EvalCaseCreateResponse {
|
||||
pub case_id: String,
|
||||
pub path: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
#[ts(export_to = "v2/")]
|
||||
|
||||
@@ -21,6 +21,7 @@ codex-arg0 = { workspace = true }
|
||||
codex-common = { workspace = true, features = ["cli"] }
|
||||
codex-core = { workspace = true }
|
||||
codex-backend-client = { workspace = true }
|
||||
codex-eval-case = { workspace = true }
|
||||
codex-file-search = { workspace = true }
|
||||
codex-login = { workspace = true }
|
||||
codex-protocol = { workspace = true }
|
||||
|
||||
@@ -25,6 +25,8 @@ use codex_app_server_protocol::ClientRequest;
|
||||
use codex_app_server_protocol::CommandExecParams;
|
||||
use codex_app_server_protocol::ConversationGitInfo;
|
||||
use codex_app_server_protocol::ConversationSummary;
|
||||
use codex_app_server_protocol::EvalCaseCreateParams;
|
||||
use codex_app_server_protocol::EvalCaseCreateResponse;
|
||||
use codex_app_server_protocol::ExecOneOffCommandResponse;
|
||||
use codex_app_server_protocol::FeedbackUploadParams;
|
||||
use codex_app_server_protocol::FeedbackUploadResponse;
|
||||
@@ -512,6 +514,9 @@ impl CodexMessageProcessor {
|
||||
ClientRequest::FeedbackUpload { request_id, params } => {
|
||||
self.upload_feedback(request_id, params).await;
|
||||
}
|
||||
ClientRequest::EvalCaseCreate { request_id, params } => {
|
||||
self.eval_case_create(request_id, params).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3301,6 +3306,170 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
async fn eval_case_create(&self, request_id: RequestId, params: EvalCaseCreateParams) {
|
||||
let EvalCaseCreateParams {
|
||||
thread_id,
|
||||
start,
|
||||
what_went_wrong,
|
||||
what_good_looks_like,
|
||||
include_logs,
|
||||
} = params;
|
||||
|
||||
if !include_logs {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INVALID_REQUEST_ERROR_CODE,
|
||||
message: "eval case bundles always include codex-logs.log; set include_logs=true"
|
||||
.to_string(),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
}
|
||||
|
||||
let conversation_id = match ConversationId::from_string(&thread_id) {
|
||||
Ok(id) => id,
|
||||
Err(err) => {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INVALID_REQUEST_ERROR_CODE,
|
||||
message: format!("invalid thread id: {err}"),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let Some(rollout_path) = self.resolve_rollout_path(conversation_id).await else {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INVALID_REQUEST_ERROR_CODE,
|
||||
message: "could not resolve rollout path for thread".to_string(),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
};
|
||||
|
||||
let rollout_text = match std::fs::read_to_string(&rollout_path) {
|
||||
Ok(text) => text,
|
||||
Err(err) => {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INTERNAL_ERROR_CODE,
|
||||
message: format!(
|
||||
"failed to read rollout file {}: {err}",
|
||||
rollout_path.display()
|
||||
),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let start_marker = match (&start.kind, &start.value) {
|
||||
(
|
||||
codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineTimestamp,
|
||||
codex_app_server_protocol::EvalCaseStartMarkerValue::Timestamp(value),
|
||||
) => codex_eval_case::StartMarker {
|
||||
kind: codex_eval_case::StartMarkerKind::RolloutLineTimestamp,
|
||||
value: codex_eval_case::StartMarkerValue::Timestamp(value.clone()),
|
||||
display: start.display.clone(),
|
||||
},
|
||||
(
|
||||
codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineIndex,
|
||||
codex_app_server_protocol::EvalCaseStartMarkerValue::LineIndex(value),
|
||||
) => codex_eval_case::StartMarker {
|
||||
kind: codex_eval_case::StartMarkerKind::RolloutLineIndex,
|
||||
value: codex_eval_case::StartMarkerValue::LineIndex(*value),
|
||||
display: start.display.clone(),
|
||||
},
|
||||
_ => {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INVALID_REQUEST_ERROR_CODE,
|
||||
message: "start marker kind/value mismatch".to_string(),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let last_line_index = rollout_text.lines().count().saturating_sub(1);
|
||||
let requested_start_is_now = start.display == "Start now"
|
||||
|| matches!((&start.kind, &start.value),
|
||||
(codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineIndex,
|
||||
codex_app_server_protocol::EvalCaseStartMarkerValue::LineIndex(index))
|
||||
if usize::try_from(*index).ok() == Some(last_line_index)
|
||||
);
|
||||
|
||||
let repo_snapshot = repo_snapshot_from_rollout(&rollout_text, &start_marker);
|
||||
if repo_snapshot.is_none() && !requested_start_is_now {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INVALID_REQUEST_ERROR_CODE,
|
||||
message: "no repo snapshot available for requested start marker".to_string(),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
}
|
||||
|
||||
let logs_bytes = include_logs.then(|| {
|
||||
self.feedback
|
||||
.snapshot(Some(conversation_id))
|
||||
.as_bytes()
|
||||
.to_vec()
|
||||
});
|
||||
|
||||
let args = codex_eval_case::CreateEvalCaseArgs {
|
||||
codex_home: self.config.codex_home.clone(),
|
||||
conversation_id: thread_id.clone(),
|
||||
rollout_path,
|
||||
start: start_marker,
|
||||
repo_cwd: self.config.cwd.clone(),
|
||||
repo_snapshot,
|
||||
notes: codex_eval_case::Notes {
|
||||
what_went_wrong,
|
||||
what_good_looks_like,
|
||||
},
|
||||
include_logs,
|
||||
logs_bytes,
|
||||
};
|
||||
|
||||
let result =
|
||||
tokio::task::spawn_blocking(move || codex_eval_case::create_eval_case_bundle(&args))
|
||||
.await;
|
||||
|
||||
let result = match result {
|
||||
Ok(outcome) => outcome,
|
||||
Err(join_err) => {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INTERNAL_ERROR_CODE,
|
||||
message: format!("failed to create eval case bundle: {join_err}"),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(outcome) => {
|
||||
let response = EvalCaseCreateResponse {
|
||||
case_id: outcome.case_id,
|
||||
path: outcome.path.display().to_string(),
|
||||
};
|
||||
self.outgoing.send_response(request_id, response).await;
|
||||
}
|
||||
Err(err) => {
|
||||
let error = JSONRPCErrorError {
|
||||
code: INTERNAL_ERROR_CODE,
|
||||
message: format!("failed to create eval case bundle: {err}"),
|
||||
data: None,
|
||||
};
|
||||
self.outgoing.send_error(request_id, error).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn resolve_rollout_path(&self, conversation_id: ConversationId) -> Option<PathBuf> {
|
||||
match self
|
||||
.conversation_manager
|
||||
@@ -3313,6 +3482,56 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
fn repo_snapshot_from_rollout(
|
||||
rollout_text: &str,
|
||||
start_marker: &codex_eval_case::StartMarker,
|
||||
) -> Option<codex_eval_case::RepoSnapshot> {
|
||||
match &start_marker.value {
|
||||
codex_eval_case::StartMarkerValue::LineIndex(index) => {
|
||||
let index = usize::try_from(*index).ok()?;
|
||||
repo_snapshot_from_rollout_line_index(rollout_text, index)
|
||||
}
|
||||
codex_eval_case::StartMarkerValue::Timestamp(timestamp) => {
|
||||
repo_snapshot_from_rollout_timestamp(rollout_text, timestamp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn repo_snapshot_from_rollout_timestamp(
|
||||
rollout_text: &str,
|
||||
timestamp: &str,
|
||||
) -> Option<codex_eval_case::RepoSnapshot> {
|
||||
let start_index = rollout_text.lines().enumerate().find_map(|(idx, line)| {
|
||||
let rollout_line =
|
||||
serde_json::from_str::<codex_protocol::protocol::RolloutLine>(line).ok()?;
|
||||
(rollout_line.timestamp == timestamp).then_some(idx)
|
||||
})?;
|
||||
repo_snapshot_from_rollout_line_index(rollout_text, start_index)
|
||||
}
|
||||
|
||||
fn repo_snapshot_from_rollout_line_index(
|
||||
rollout_text: &str,
|
||||
start_index: usize,
|
||||
) -> Option<codex_eval_case::RepoSnapshot> {
|
||||
rollout_text
|
||||
.lines()
|
||||
.enumerate()
|
||||
.skip(start_index.saturating_add(1))
|
||||
.find_map(|(_idx, line)| {
|
||||
let rollout_line =
|
||||
serde_json::from_str::<codex_protocol::protocol::RolloutLine>(line).ok()?;
|
||||
match rollout_line.item {
|
||||
codex_protocol::protocol::RolloutItem::ResponseItem(
|
||||
codex_protocol::models::ResponseItem::GhostSnapshot { ghost_commit },
|
||||
) => Some(codex_eval_case::RepoSnapshot {
|
||||
base_sha: ghost_commit.parent()?.to_string(),
|
||||
commit_sha: ghost_commit.id().to_string(),
|
||||
}),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn skills_to_info(
|
||||
skills: &[codex_core::skills::SkillMetadata],
|
||||
) -> Vec<codex_app_server_protocol::SkillMetadata> {
|
||||
@@ -3619,4 +3838,58 @@ mod tests {
|
||||
assert_eq!(summary, expected);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn repo_snapshot_from_rollout_line_index_finds_next_ghost_snapshot() -> Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
let user_line = json!({
|
||||
"timestamp": "2025-09-05T16:53:11.850Z",
|
||||
"type": "response_item",
|
||||
"payload": {
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "input_text",
|
||||
"text": "hi",
|
||||
}],
|
||||
},
|
||||
});
|
||||
|
||||
let ghost_line = json!({
|
||||
"timestamp": "2025-09-05T16:53:12.000Z",
|
||||
"type": "response_item",
|
||||
"payload": {
|
||||
"type": "ghost_snapshot",
|
||||
"ghost_commit": {
|
||||
"id": "ghost-sha",
|
||||
"parent": "base-sha",
|
||||
"preexisting_untracked_files": [],
|
||||
"preexisting_untracked_dirs": [],
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
let rollout_text = format!(
|
||||
"{}\n{}\n",
|
||||
serde_json::to_string(&user_line)?,
|
||||
serde_json::to_string(&ghost_line)?
|
||||
);
|
||||
|
||||
let start = codex_eval_case::StartMarker {
|
||||
kind: codex_eval_case::StartMarkerKind::RolloutLineIndex,
|
||||
value: codex_eval_case::StartMarkerValue::LineIndex(0),
|
||||
display: "From: hi".to_string(),
|
||||
};
|
||||
|
||||
let snapshot = repo_snapshot_from_rollout(&rollout_text, &start).expect("snapshot");
|
||||
assert_eq!(
|
||||
snapshot,
|
||||
codex_eval_case::RepoSnapshot {
|
||||
base_sha: "base-sha".to_string(),
|
||||
commit_sha: "ghost-sha".to_string(),
|
||||
}
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,34 +209,19 @@ pub fn apply_hunks(
|
||||
stdout: &mut impl std::io::Write,
|
||||
stderr: &mut impl std::io::Write,
|
||||
) -> Result<(), ApplyPatchError> {
|
||||
let _existing_paths: Vec<&Path> = hunks
|
||||
.iter()
|
||||
.filter_map(|hunk| match hunk {
|
||||
Hunk::AddFile { .. } => {
|
||||
// The file is being added, so it doesn't exist yet.
|
||||
None
|
||||
}
|
||||
Hunk::DeleteFile { path } => Some(path.as_path()),
|
||||
Hunk::UpdateFile {
|
||||
path, move_path, ..
|
||||
} => match move_path {
|
||||
Some(move_path) => {
|
||||
if std::fs::metadata(move_path)
|
||||
.map(|m| m.is_file())
|
||||
.unwrap_or(false)
|
||||
{
|
||||
Some(move_path.as_path())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
None => Some(path.as_path()),
|
||||
},
|
||||
})
|
||||
.collect::<Vec<&Path>>();
|
||||
apply_hunks_in_dir(hunks, Path::new("."), stdout, stderr)
|
||||
}
|
||||
|
||||
/// Applies hunks relative to `cwd` (without mutating the process working directory) and writes the
|
||||
/// same stdout/stderr output as `apply_hunks`.
|
||||
pub fn apply_hunks_in_dir(
|
||||
hunks: &[Hunk],
|
||||
cwd: &Path,
|
||||
stdout: &mut impl std::io::Write,
|
||||
stderr: &mut impl std::io::Write,
|
||||
) -> Result<(), ApplyPatchError> {
|
||||
// Delegate to a helper that applies each hunk to the filesystem.
|
||||
match apply_hunks_to_files(hunks) {
|
||||
match apply_hunks_to_files_in_dir(hunks, cwd) {
|
||||
Ok(affected) => {
|
||||
print_summary(&affected, stdout).map_err(ApplyPatchError::from)?;
|
||||
Ok(())
|
||||
@@ -267,7 +252,7 @@ pub struct AffectedPaths {
|
||||
|
||||
/// Apply the hunks to the filesystem, returning which files were added, modified, or deleted.
|
||||
/// Returns an error if the patch could not be applied.
|
||||
fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result<AffectedPaths> {
|
||||
fn apply_hunks_to_files_in_dir(hunks: &[Hunk], cwd: &Path) -> anyhow::Result<AffectedPaths> {
|
||||
if hunks.is_empty() {
|
||||
anyhow::bail!("No files were modified.");
|
||||
}
|
||||
@@ -278,19 +263,29 @@ fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result<AffectedPaths> {
|
||||
for hunk in hunks {
|
||||
match hunk {
|
||||
Hunk::AddFile { path, contents } => {
|
||||
if let Some(parent) = path.parent()
|
||||
let target_path = if path.is_absolute() {
|
||||
path.clone()
|
||||
} else {
|
||||
cwd.join(path)
|
||||
};
|
||||
if let Some(parent) = target_path.parent()
|
||||
&& !parent.as_os_str().is_empty()
|
||||
{
|
||||
std::fs::create_dir_all(parent).with_context(|| {
|
||||
format!("Failed to create parent directories for {}", path.display())
|
||||
})?;
|
||||
}
|
||||
std::fs::write(path, contents)
|
||||
std::fs::write(&target_path, contents)
|
||||
.with_context(|| format!("Failed to write file {}", path.display()))?;
|
||||
added.push(path.clone());
|
||||
}
|
||||
Hunk::DeleteFile { path } => {
|
||||
std::fs::remove_file(path)
|
||||
let target_path = if path.is_absolute() {
|
||||
path.clone()
|
||||
} else {
|
||||
cwd.join(path)
|
||||
};
|
||||
std::fs::remove_file(&target_path)
|
||||
.with_context(|| format!("Failed to delete file {}", path.display()))?;
|
||||
deleted.push(path.clone());
|
||||
}
|
||||
@@ -300,22 +295,38 @@ fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result<AffectedPaths> {
|
||||
chunks,
|
||||
} => {
|
||||
let AppliedPatch { new_contents, .. } =
|
||||
derive_new_contents_from_chunks(path, chunks)?;
|
||||
derive_new_contents_from_chunks_in_dir(path, chunks, cwd)?;
|
||||
if let Some(dest) = move_path {
|
||||
if let Some(parent) = dest.parent()
|
||||
let dest_path = if dest.is_absolute() {
|
||||
dest.clone()
|
||||
} else {
|
||||
cwd.join(dest)
|
||||
};
|
||||
if let Some(parent) = dest_path.parent()
|
||||
&& !parent.as_os_str().is_empty()
|
||||
{
|
||||
std::fs::create_dir_all(parent).with_context(|| {
|
||||
format!("Failed to create parent directories for {}", dest.display())
|
||||
})?;
|
||||
}
|
||||
std::fs::write(dest, new_contents)
|
||||
std::fs::write(&dest_path, new_contents)
|
||||
.with_context(|| format!("Failed to write file {}", dest.display()))?;
|
||||
std::fs::remove_file(path)
|
||||
|
||||
let src_path = if path.is_absolute() {
|
||||
path.clone()
|
||||
} else {
|
||||
cwd.join(path)
|
||||
};
|
||||
std::fs::remove_file(&src_path)
|
||||
.with_context(|| format!("Failed to remove original {}", path.display()))?;
|
||||
modified.push(dest.clone());
|
||||
} else {
|
||||
std::fs::write(path, new_contents)
|
||||
let target_path = if path.is_absolute() {
|
||||
path.clone()
|
||||
} else {
|
||||
cwd.join(path)
|
||||
};
|
||||
std::fs::write(&target_path, new_contents)
|
||||
.with_context(|| format!("Failed to write file {}", path.display()))?;
|
||||
modified.push(path.clone());
|
||||
}
|
||||
@@ -340,7 +351,21 @@ fn derive_new_contents_from_chunks(
|
||||
path: &Path,
|
||||
chunks: &[UpdateFileChunk],
|
||||
) -> std::result::Result<AppliedPatch, ApplyPatchError> {
|
||||
let original_contents = match std::fs::read_to_string(path) {
|
||||
derive_new_contents_from_chunks_in_dir(path, chunks, Path::new("."))
|
||||
}
|
||||
|
||||
fn derive_new_contents_from_chunks_in_dir(
|
||||
path: &Path,
|
||||
chunks: &[UpdateFileChunk],
|
||||
cwd: &Path,
|
||||
) -> std::result::Result<AppliedPatch, ApplyPatchError> {
|
||||
let target_path = if path.is_absolute() {
|
||||
path.to_path_buf()
|
||||
} else {
|
||||
cwd.join(path)
|
||||
};
|
||||
|
||||
let original_contents = match std::fs::read_to_string(&target_path) {
|
||||
Ok(contents) => contents,
|
||||
Err(err) => {
|
||||
return Err(ApplyPatchError::IoError(IoError {
|
||||
|
||||
@@ -343,6 +343,7 @@ pub(crate) struct Session {
|
||||
/// The set of enabled features should be invariant for the lifetime of the
|
||||
/// session.
|
||||
features: Features,
|
||||
repo_snapshotting_enabled: AtomicBool,
|
||||
pub(crate) active_turn: Mutex<Option<ActiveTurn>>,
|
||||
pub(crate) services: SessionServices,
|
||||
next_internal_sub_id: AtomicU64,
|
||||
@@ -673,6 +674,7 @@ impl Session {
|
||||
tx_event: tx_event.clone(),
|
||||
state: Mutex::new(state),
|
||||
features: config.features.clone(),
|
||||
repo_snapshotting_enabled: AtomicBool::new(false),
|
||||
active_turn: Mutex::new(None),
|
||||
services,
|
||||
next_internal_sub_id: AtomicU64::new(0),
|
||||
@@ -1439,7 +1441,9 @@ impl Session {
|
||||
turn_context: Arc<TurnContext>,
|
||||
cancellation_token: CancellationToken,
|
||||
) {
|
||||
if !self.enabled(Feature::GhostCommit) {
|
||||
if !self.enabled(Feature::GhostCommit)
|
||||
&& !self.repo_snapshotting_enabled.load(Ordering::Relaxed)
|
||||
{
|
||||
return;
|
||||
}
|
||||
let token = match turn_context.tool_call_gate.subscribe().await {
|
||||
@@ -1609,6 +1613,9 @@ async fn submission_loop(sess: Arc<Session>, config: Arc<Config>, rx_sub: Receiv
|
||||
)
|
||||
.await;
|
||||
}
|
||||
Op::SetRepoSnapshotting { enabled } => {
|
||||
handlers::set_repo_snapshotting(&sess, enabled).await;
|
||||
}
|
||||
Op::UserInput { .. } | Op::UserTurn { .. } => {
|
||||
handlers::user_input_or_turn(&sess, sub.id.clone(), sub.op, &mut previous_context)
|
||||
.await;
|
||||
@@ -1707,6 +1714,7 @@ mod handlers {
|
||||
use mcp_types::RequestId;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::Ordering;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
@@ -1731,6 +1739,11 @@ mod handlers {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn set_repo_snapshotting(sess: &Session, enabled: bool) {
|
||||
sess.repo_snapshotting_enabled
|
||||
.store(enabled, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub async fn user_input_or_turn(
|
||||
sess: &Arc<Session>,
|
||||
sub_id: String,
|
||||
@@ -3161,6 +3174,7 @@ mod tests {
|
||||
tx_event,
|
||||
state: Mutex::new(state),
|
||||
features: config.features.clone(),
|
||||
repo_snapshotting_enabled: AtomicBool::new(false),
|
||||
active_turn: Mutex::new(None),
|
||||
services,
|
||||
next_internal_sub_id: AtomicU64::new(0),
|
||||
@@ -3248,6 +3262,7 @@ mod tests {
|
||||
tx_event,
|
||||
state: Mutex::new(state),
|
||||
features: config.features.clone(),
|
||||
repo_snapshotting_enabled: AtomicBool::new(false),
|
||||
active_turn: Mutex::new(None),
|
||||
services,
|
||||
next_internal_sub_id: AtomicU64::new(0),
|
||||
|
||||
@@ -23,6 +23,7 @@ use codex_protocol::protocol::ReviewDecision;
|
||||
use futures::future::BoxFuture;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ApplyPatchRequest {
|
||||
@@ -143,6 +144,48 @@ impl ToolRuntime<ApplyPatchRequest, ExecToolCallOutput> for ApplyPatchRuntime {
|
||||
attempt: &SandboxAttempt<'_>,
|
||||
ctx: &ToolCtx<'_>,
|
||||
) -> Result<ExecToolCallOutput, ToolError> {
|
||||
// When there is no sandbox in play (DangerFullAccess / bypassed sandbox), apply the patch
|
||||
// in-process. This avoids relying on the current executable implementing the arg0/argv1
|
||||
// dispatch behavior (which is not true for unit/integration test binaries).
|
||||
if attempt.sandbox == crate::exec::SandboxType::None {
|
||||
let started = Instant::now();
|
||||
let mut stdout = Vec::new();
|
||||
let mut stderr = Vec::new();
|
||||
|
||||
// Apply the patch relative to `req.cwd` without mutating the process CWD.
|
||||
let exit_code = match codex_apply_patch::parse_patch(&req.patch) {
|
||||
Ok(args) => {
|
||||
match codex_apply_patch::apply_hunks_in_dir(
|
||||
&args.hunks,
|
||||
&req.cwd,
|
||||
&mut stdout,
|
||||
&mut stderr,
|
||||
) {
|
||||
Ok(()) => 0,
|
||||
Err(_) => 1,
|
||||
}
|
||||
}
|
||||
// Reuse codex-apply-patch's error formatting for parse errors.
|
||||
Err(_) => {
|
||||
match codex_apply_patch::apply_patch(&req.patch, &mut stdout, &mut stderr) {
|
||||
Ok(()) => 0,
|
||||
Err(_) => 1,
|
||||
}
|
||||
}
|
||||
};
|
||||
let stdout = String::from_utf8_lossy(&stdout).to_string();
|
||||
let stderr = String::from_utf8_lossy(&stderr).to_string();
|
||||
let aggregated_output = format!("{stdout}{stderr}");
|
||||
return Ok(ExecToolCallOutput {
|
||||
exit_code,
|
||||
stdout: crate::exec::StreamOutput::new(stdout),
|
||||
stderr: crate::exec::StreamOutput::new(stderr),
|
||||
aggregated_output: crate::exec::StreamOutput::new(aggregated_output),
|
||||
duration: started.elapsed(),
|
||||
timed_out: false,
|
||||
});
|
||||
}
|
||||
|
||||
let spec = Self::build_command_spec(req)?;
|
||||
let env = attempt
|
||||
.env_for(spec)
|
||||
|
||||
@@ -8,6 +8,7 @@ use codex_core::protocol::Op;
|
||||
use codex_core::protocol::SandboxPolicy;
|
||||
use codex_protocol::config_types::ReasoningSummary;
|
||||
use codex_protocol::user_input::UserInput;
|
||||
use core_test_support::responses::ResponsesRequest;
|
||||
use core_test_support::responses::ev_assistant_message;
|
||||
use core_test_support::responses::ev_completed;
|
||||
use core_test_support::responses::ev_function_call;
|
||||
@@ -277,7 +278,7 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> {
|
||||
ev_completed("resp-2"),
|
||||
]),
|
||||
];
|
||||
mount_sse_sequence(harness.server(), responses).await;
|
||||
let requests = mount_sse_sequence(harness.server(), responses).await;
|
||||
|
||||
let model = test.session_configured.model.clone();
|
||||
codex
|
||||
@@ -297,7 +298,14 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> {
|
||||
|
||||
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
|
||||
|
||||
assert_eq!(fs::read_to_string(&target).await?, "hello from snapshot\n");
|
||||
let tool_output = extract_call_output_text(&requests.requests(), call_id);
|
||||
let target_contents = fs::read_to_string(&target).await.unwrap_or_else(|err| {
|
||||
panic!(
|
||||
"expected patch to create {}: {err}; tool output={tool_output:?}",
|
||||
target.display()
|
||||
)
|
||||
});
|
||||
assert_eq!(target_contents, "hello from snapshot\n");
|
||||
|
||||
let mut entries = fs::read_dir(codex_home.join("shell_snapshots")).await?;
|
||||
let snapshot_path = entries
|
||||
@@ -311,6 +319,35 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn extract_call_output_text(requests: &[ResponsesRequest], call_id: &str) -> String {
|
||||
for req in requests {
|
||||
let input = req.input();
|
||||
let item = input.iter().find(|item| {
|
||||
item.get("type").and_then(serde_json::Value::as_str) == Some("function_call_output")
|
||||
&& item.get("call_id").and_then(serde_json::Value::as_str) == Some(call_id)
|
||||
});
|
||||
let Some(item) = item else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let output = item
|
||||
.get("output")
|
||||
.cloned()
|
||||
.unwrap_or(serde_json::Value::Null);
|
||||
match output {
|
||||
serde_json::Value::String(text) => return text,
|
||||
serde_json::Value::Object(obj) => {
|
||||
if let Some(text) = obj.get("content").and_then(serde_json::Value::as_str) {
|
||||
return text.to_string();
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
"<missing tool output>".to_string()
|
||||
}
|
||||
|
||||
#[cfg_attr(target_os = "windows", ignore)]
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn shell_snapshot_deleted_after_shutdown_with_skills() -> Result<()> {
|
||||
|
||||
44
codex-rs/docs/eval_capture.md
Normal file
44
codex-rs/docs/eval_capture.md
Normal file
@@ -0,0 +1,44 @@
|
||||
# Eval Capture Bundles
|
||||
|
||||
Codex can capture "eval case" bundles from the `/feedback` flow (bad result -> capture eval sample).
|
||||
These bundles are meant to turn real failures into reproducible, local-first artifacts.
|
||||
|
||||
## Where Bundles Are Written
|
||||
|
||||
Bundles are stored under:
|
||||
|
||||
`$CODEX_HOME/eval-case/<case-id>/`
|
||||
|
||||
## Bundle Contents
|
||||
|
||||
Each bundle contains:
|
||||
|
||||
- `manifest.json` - metadata about the capture (schema version, start marker, notes, repo base).
|
||||
- `rollout.jsonl` - the full session rollout (multi-turn trajectory).
|
||||
- `repo.patch` - a git patch representing the repository state at the chosen start marker.
|
||||
- `codex-logs.log` - tracing logs to help maintainers debug the session.
|
||||
|
||||
## Start Marker And Repo State
|
||||
|
||||
Bundles include the entire rollout, but also record a start marker to indicate where an eval
|
||||
harness (or a human) should begin replaying/interpreting the trajectory.
|
||||
|
||||
The repository patch must match that chosen start marker:
|
||||
|
||||
- If the session has repo snapshots available, `repo.patch` is derived from the ghost snapshot
|
||||
commit associated with the selected user turn (diff from the snapshot's base commit to the
|
||||
snapshot commit).
|
||||
- If no snapshot is available for a given start marker, the TUI disables that option (and may
|
||||
fall back to the basic feedback flow instead).
|
||||
|
||||
For reproducibility outside your machine, the base commit recorded in `manifest.json` should be
|
||||
reachable by maintainers (for example, pushed and available on the default branch).
|
||||
|
||||
## App-Server API (For Integrations)
|
||||
|
||||
Non-TUI clients can create bundles via the app-server JSON-RPC method:
|
||||
|
||||
- `evalCase/create`
|
||||
|
||||
The handler copies the rollout into the bundle and derives `repo.patch` based on the selected start
|
||||
marker when repo snapshots are available.
|
||||
20
codex-rs/eval-case/Cargo.toml
Normal file
20
codex-rs/eval-case/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "codex-eval-case"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true, features = ["formatting"] }
|
||||
uuid = { workspace = true, features = ["v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
|
||||
516
codex-rs/eval-case/src/lib.rs
Normal file
516
codex-rs/eval-case/src/lib.rs
Normal file
@@ -0,0 +1,516 @@
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
use anyhow::Context as _;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use time::OffsetDateTime;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// How a rollout start marker is addressed (serialized in `snake_case`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum StartMarkerKind {
    /// The marker points at a rollout line by its timestamp.
    RolloutLineTimestamp,
    /// The marker points at a rollout line by its line index.
    RolloutLineIndex,
}
|
||||
|
||||
/// Payload of a start marker; serialized untagged, so it appears in JSON as
/// either a bare string (timestamp) or a bare number (line index).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(untagged)]
pub enum StartMarkerValue {
    /// Timestamp string identifying a rollout line.
    Timestamp(String),
    /// Line index identifying a rollout line.
    LineIndex(u64),
}
|
||||
|
||||
/// Where in the rollout an eval case starts.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StartMarker {
    /// Whether `value` is a timestamp or a line index.
    pub kind: StartMarkerKind,
    /// The marker payload; its variant should match `kind`.
    pub value: StartMarkerValue,
    /// Human-readable label for this marker (e.g. "Start now" in tests).
    pub display: String,
}
|
||||
|
||||
/// The git commit that `repo.patch` applies on top of.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct GitBase {
    /// Base commit SHA, or "unknown" when it could not be resolved.
    pub sha: String,
    /// Free-text reminder stored in the manifest about keeping the base reachable.
    pub note: String,
}
|
||||
|
||||
/// Manifest entry describing the captured rollout file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RolloutInfo {
    /// Rollout filename inside the bundle (always "rollout.jsonl").
    pub filename: String,
    /// Where within the rollout this eval case starts.
    pub start: StartMarker,
}
|
||||
|
||||
/// Manifest entry describing the repository the case was captured from.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RepoInfo {
    /// Working directory the case was captured in, rendered as a string.
    pub cwd: String,
    /// Base commit the bundled patch applies against.
    pub git_base: GitBase,
    /// Patch filename inside the bundle (always "repo.patch").
    pub patch_filename: String,
}
|
||||
|
||||
/// Free-text notes the user attaches to an eval case.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Notes {
    /// Description of the observed failure.
    pub what_went_wrong: String,
    /// Description of the desired/expected behavior.
    pub what_good_looks_like: String,
}
|
||||
|
||||
/// Manifest entry recording which optional artifacts were captured.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Artifacts {
    /// Whether a `codex-logs.log` file was written into the bundle.
    pub include_logs: bool,
}
|
||||
|
||||
/// `manifest.json` schema for an eval-case bundle (schema version "v0").
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EvalCaseManifestV0 {
    /// Manifest schema version; written as "v0".
    pub version: String,
    /// Bundle identifier; also the bundle directory name.
    pub case_id: String,
    /// RFC 3339 creation timestamp.
    pub created_at: String,
    /// Conversation the rollout was captured from.
    pub conversation_id: String,
    /// Origin of the capture; written as "cli".
    pub source: String,
    /// Rollout file metadata.
    pub rollout: RolloutInfo,
    /// Repository metadata and patch reference.
    pub repo: RepoInfo,
    /// User-provided notes.
    pub notes: Notes,
    /// Which optional artifacts were included.
    pub artifacts: Artifacts,
}
|
||||
|
||||
/// Inputs for [`create_eval_case_bundle`].
#[derive(Debug, Clone)]
pub struct CreateEvalCaseArgs {
    /// Codex home directory; bundles land under `<codex_home>/eval-case/<case_id>`.
    pub codex_home: PathBuf,
    /// Conversation id recorded in the manifest.
    pub conversation_id: String,
    /// Source rollout file; copied into the bundle as `rollout.jsonl`.
    pub rollout_path: PathBuf,
    /// Where within the rollout this case starts.
    pub start: StartMarker,
    /// Repository working directory used for git commands and the manifest.
    pub repo_cwd: PathBuf,
    /// When present, derive `repo.patch` from the provided commit snapshot instead of the current
    /// working tree.
    pub repo_snapshot: Option<RepoSnapshot>,
    /// User-provided notes stored in the manifest.
    pub notes: Notes,
    /// Whether to write `codex-logs.log` into the bundle.
    pub include_logs: bool,
    /// Log bytes to write when `include_logs` is set; an empty file is written if `None`.
    pub logs_bytes: Option<Vec<u8>>,
}
|
||||
|
||||
/// A pair of commits describing a repo snapshot: the patch is the diff from
/// `base_sha` to `commit_sha`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepoSnapshot {
    /// Base commit the patch applies against.
    pub base_sha: String,
    /// Snapshot commit containing the captured state.
    pub commit_sha: String,
}
|
||||
|
||||
/// Result of creating an eval-case bundle.
#[derive(Debug, Clone)]
pub struct CreateEvalCaseResult {
    /// Generated bundle id (also the directory name).
    pub case_id: String,
    /// Absolute path of the created bundle directory.
    pub path: PathBuf,
}
|
||||
|
||||
/// Reduce an arbitrary repo name to a short, filesystem-friendly slug:
/// ASCII alphanumerics are lowercased and kept, every run of other characters
/// collapses to a single '-', and leading/trailing dashes are dropped.
/// Falls back to "repo" when nothing survives.
fn sanitize_repo_slug(input: &str) -> String {
    let mut slug = String::with_capacity(input.len());
    for ch in input.chars() {
        let lowered = ch.to_ascii_lowercase();
        if lowered.is_ascii_alphanumeric() {
            slug.push(lowered);
        } else if !slug.ends_with('-') {
            // Collapse consecutive separators into one dash.
            slug.push('-');
        }
    }
    let trimmed = slug.trim_matches('-');
    if trimmed.is_empty() {
        "repo".to_string()
    } else {
        trimmed.to_string()
    }
}
|
||||
|
||||
fn repo_slug(repo_cwd: &Path) -> String {
|
||||
// Prefer the git repo root name, but fall back to the cwd basename.
|
||||
let top_level = git_stdout(repo_cwd, &["rev-parse", "--show-toplevel"])
|
||||
.ok()
|
||||
.map(PathBuf::from);
|
||||
let basename = top_level
|
||||
.as_ref()
|
||||
.and_then(|p| p.file_name())
|
||||
.or_else(|| repo_cwd.file_name());
|
||||
let basename = basename
|
||||
.map(|name| name.to_string_lossy().to_string())
|
||||
.unwrap_or_else(|| "repo".to_string());
|
||||
sanitize_repo_slug(&basename)
|
||||
}
|
||||
|
||||
/// Create an eval-case bundle directory under `<codex_home>/eval-case/<case_id>`.
///
/// The bundle contains:
/// - `rollout.jsonl` — a copy of `args.rollout_path`;
/// - `repo.patch` — a git patch, from the snapshot commits when
///   `args.repo_snapshot` is set, otherwise against the current `HEAD`;
/// - `codex-logs.log` — only when `args.include_logs` is set;
/// - `manifest.json` — a pretty-printed [`EvalCaseManifestV0`].
///
/// # Errors
/// Returns an error when timestamp formatting, filesystem operations, patch
/// generation for an explicit snapshot, or manifest serialization fails.
pub fn create_eval_case_bundle(args: &CreateEvalCaseArgs) -> anyhow::Result<CreateEvalCaseResult> {
    let created_at = OffsetDateTime::now_utc();
    let created_at_rfc3339 = created_at.format(&Rfc3339).context("format created_at")?;

    // Timestamp for the directory name; dashes instead of colons so it is
    // filesystem-safe on all platforms.
    let ts_for_id = format!(
        "{:04}-{:02}-{:02}T{:02}-{:02}-{:02}",
        created_at.year(),
        u8::from(created_at.month()),
        created_at.day(),
        created_at.hour(),
        created_at.minute(),
        created_at.second()
    );

    // Short, human-scannable id: datetime + repo + 6-digit suffix.
    // Collision risk is low and acceptable for local bundles.
    let repo = repo_slug(&args.repo_cwd);
    let id6 = (Uuid::new_v4().as_u128() % 1_000_000) as u32;
    let case_id = format!("{ts_for_id}-{repo}-{id6:06}");

    let bundle_dir = args.codex_home.join("eval-case").join(&case_id);
    std::fs::create_dir_all(&bundle_dir)
        .with_context(|| format!("create eval bundle dir {}", bundle_dir.display()))?;

    // Copy the rollout verbatim into the bundle.
    let rollout_dst = bundle_dir.join("rollout.jsonl");
    std::fs::copy(&args.rollout_path, &rollout_dst).with_context(|| {
        format!(
            "copy rollout {} -> {}",
            args.rollout_path.display(),
            rollout_dst.display()
        )
    })?;

    // Derive the patch either from a recorded snapshot pair of commits or,
    // absent a snapshot, from the live working tree relative to HEAD.
    let (base_sha, patch) = match args.repo_snapshot.as_ref() {
        Some(snapshot) => {
            git_patch_between_commits(&args.repo_cwd, &snapshot.base_sha, &snapshot.commit_sha)
                .with_context(|| {
                    format!(
                        "generate patch for repo snapshot {}..{}",
                        snapshot.base_sha, snapshot.commit_sha
                    )
                })?
        }
        None => git_patch_against_head(&args.repo_cwd)?,
    };
    let patch_path = bundle_dir.join("repo.patch");
    std::fs::write(&patch_path, patch)
        .with_context(|| format!("write patch {}", patch_path.display()))?;

    if args.include_logs {
        // Write the log file even when no bytes were provided, so the
        // manifest's `include_logs` flag always matches an existing file.
        let logs_path = bundle_dir.join("codex-logs.log");
        let bytes = args
            .logs_bytes
            .clone()
            .unwrap_or_else(|| Vec::with_capacity(0));
        std::fs::write(&logs_path, bytes)
            .with_context(|| format!("write logs {}", logs_path.display()))?;
    }

    let manifest = EvalCaseManifestV0 {
        version: "v0".to_string(),
        case_id: case_id.clone(),
        created_at: created_at_rfc3339,
        conversation_id: args.conversation_id.clone(),
        source: "cli".to_string(),
        rollout: RolloutInfo {
            filename: "rollout.jsonl".to_string(),
            start: args.start.clone(),
        },
        repo: RepoInfo {
            cwd: args.repo_cwd.display().to_string(),
            git_base: GitBase {
                sha: base_sha,
                note: "For reproducibility, the base commit should be reachable (e.g. pushed / on main)."
                    .to_string(),
            },
            patch_filename: "repo.patch".to_string(),
        },
        notes: args.notes.clone(),
        artifacts: Artifacts {
            include_logs: args.include_logs,
        },
    };
    let manifest_path = bundle_dir.join("manifest.json");
    let manifest_json = serde_json::to_string_pretty(&manifest).context("serialize manifest")?;
    // Trailing newline so the file is friendly to line-oriented tooling.
    std::fs::write(&manifest_path, format!("{manifest_json}\n"))
        .with_context(|| format!("write manifest {}", manifest_path.display()))?;

    Ok(CreateEvalCaseResult {
        case_id,
        path: bundle_dir,
    })
}
|
||||
|
||||
fn git_patch_against_head(repo_cwd: &Path) -> anyhow::Result<(String, Vec<u8>)> {
|
||||
let base_sha =
|
||||
git_stdout(repo_cwd, &["rev-parse", "HEAD"]).unwrap_or_else(|_| "unknown".to_string());
|
||||
|
||||
let mut patch = Vec::new();
|
||||
if let Ok(mut bytes) = git_diff(
|
||||
repo_cwd,
|
||||
&["diff", "--no-textconv", "--no-ext-diff", "--binary", "HEAD"],
|
||||
) {
|
||||
patch.append(&mut bytes);
|
||||
}
|
||||
|
||||
let untracked =
|
||||
git_stdout(repo_cwd, &["ls-files", "--others", "--exclude-standard"]).unwrap_or_default();
|
||||
for file in untracked.lines().map(str::trim).filter(|s| !s.is_empty()) {
|
||||
let null_device = if cfg!(windows) { "NUL" } else { "/dev/null" };
|
||||
let args = [
|
||||
"diff",
|
||||
"--no-textconv",
|
||||
"--no-ext-diff",
|
||||
"--binary",
|
||||
"--no-index",
|
||||
"--",
|
||||
null_device,
|
||||
file,
|
||||
];
|
||||
if let Ok(mut bytes) = git_diff(repo_cwd, &args) {
|
||||
patch.append(&mut bytes);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((base_sha, patch))
|
||||
}
|
||||
|
||||
fn git_stdout(repo_cwd: &Path, args: &[&str]) -> anyhow::Result<String> {
|
||||
let output = Command::new("git")
|
||||
.args(args)
|
||||
.current_dir(repo_cwd)
|
||||
.output()
|
||||
.with_context(|| format!("run git {}", args.join(" ")))?;
|
||||
if !output.status.success() {
|
||||
anyhow::bail!("git {} failed with {}", args.join(" "), output.status);
|
||||
}
|
||||
let out = String::from_utf8(output.stdout).context("decode git stdout")?;
|
||||
Ok(out.trim().to_string())
|
||||
}
|
||||
|
||||
fn git_diff(repo_cwd: &Path, args: &[&str]) -> anyhow::Result<Vec<u8>> {
|
||||
let output = Command::new("git")
|
||||
.args(args)
|
||||
.current_dir(repo_cwd)
|
||||
.output()
|
||||
.with_context(|| format!("run git {}", args.join(" ")))?;
|
||||
|
||||
let exit_ok = output.status.code().is_some_and(|c| c == 0 || c == 1);
|
||||
if !exit_ok {
|
||||
anyhow::bail!("git {} failed with {}", args.join(" "), output.status);
|
||||
}
|
||||
Ok(output.stdout)
|
||||
}
|
||||
|
||||
fn git_patch_between_commits(
|
||||
repo_cwd: &Path,
|
||||
base_sha: &str,
|
||||
commit_sha: &str,
|
||||
) -> anyhow::Result<(String, Vec<u8>)> {
|
||||
let patch = git_diff(
|
||||
repo_cwd,
|
||||
&[
|
||||
"diff",
|
||||
"--no-textconv",
|
||||
"--no-ext-diff",
|
||||
"--binary",
|
||||
base_sha,
|
||||
commit_sha,
|
||||
],
|
||||
)?;
|
||||
Ok((base_sha.to_string(), patch))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use tempfile::TempDir;

    // NOTE: these tests shell out to a real `git` binary and therefore
    // require git on PATH.

    #[test]
    fn creates_bundle_with_manifest_and_rollout() {
        let codex_home = TempDir::new().unwrap();
        let repo_dir = TempDir::new().unwrap();
        let repo_root = repo_dir.path().join("my-repo");
        std::fs::create_dir_all(&repo_root).unwrap();
        std::fs::write(repo_root.join("README.md"), "hi\n").unwrap();
        // Set up a one-commit repo so HEAD resolves.
        let init_status = Command::new("git")
            .args(["init", "-q"])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(init_status.success());
        let add_status = Command::new("git")
            .args(["add", "."])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(add_status.success());
        // Identity is passed via -c so the test does not depend on global git config.
        let commit_status = Command::new("git")
            .args([
                "-c",
                "user.name=codex",
                "-c",
                "user.email=codex@example.com",
                "commit",
                "-m",
                "init",
                "-q",
            ])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(commit_status.success());

        // Dirty the working tree so the patch is non-trivial.
        std::fs::write(repo_root.join("README.md"), "changed\n").unwrap();

        let rollout_path = repo_root.join("rollout.jsonl");
        std::fs::write(&rollout_path, "line-1\nline-2\n").unwrap();

        let args = CreateEvalCaseArgs {
            codex_home: codex_home.path().to_path_buf(),
            conversation_id: "conv-1".to_string(),
            rollout_path,
            start: StartMarker {
                kind: StartMarkerKind::RolloutLineIndex,
                value: StartMarkerValue::LineIndex(1),
                display: "Start now".to_string(),
            },
            repo_cwd: repo_root.clone(),
            repo_snapshot: None,
            notes: Notes {
                what_went_wrong: "bad".to_string(),
                what_good_looks_like: "good".to_string(),
            },
            include_logs: true,
            logs_bytes: Some(b"logs".to_vec()),
        };

        let out = create_eval_case_bundle(&args).unwrap();
        // Bundle directory layout and id shape.
        assert!(!out.case_id.is_empty());
        assert!(out.path.exists());
        assert_eq!(out.path.file_name().unwrap(), out.case_id.as_str());
        assert!(out.path.starts_with(codex_home.path().join("eval-case")));
        assert!(out.case_id.contains("my-repo"));

        // Manifest round-trips through serde and records the inputs.
        let manifest_text = std::fs::read_to_string(out.path.join("manifest.json")).unwrap();
        let manifest: EvalCaseManifestV0 = serde_json::from_str(&manifest_text).unwrap();
        assert_eq!(manifest.version, "v0");
        assert_eq!(manifest.conversation_id, "conv-1");
        assert_eq!(manifest.notes, args.notes);
        assert!(manifest.repo.git_base.sha != "unknown");
        assert_eq!(manifest.artifacts.include_logs, true);

        // All expected artifacts exist, and logs are copied verbatim.
        assert!(out.path.join("repo.patch").exists());
        assert!(out.path.join("rollout.jsonl").exists());
        assert_eq!(
            std::fs::read(out.path.join("codex-logs.log")).unwrap(),
            b"logs".to_vec()
        );
    }

    #[test]
    fn creates_bundle_from_repo_snapshot_commit() {
        let codex_home = TempDir::new().unwrap();
        let repo_dir = TempDir::new().unwrap();
        let repo_root = repo_dir.path().join("my-repo");
        std::fs::create_dir_all(&repo_root).unwrap();

        // Base commit.
        std::fs::write(repo_root.join("README.md"), "base\n").unwrap();
        let init_status = Command::new("git")
            .args(["init", "-q"])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(init_status.success());
        let add_status = Command::new("git")
            .args(["add", "."])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(add_status.success());
        let commit_status = Command::new("git")
            .args([
                "-c",
                "user.name=codex",
                "-c",
                "user.email=codex@example.com",
                "commit",
                "-m",
                "base",
                "-q",
            ])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(commit_status.success());

        let base_sha = git_stdout(&repo_root, &["rev-parse", "HEAD"]).unwrap();

        // Create a snapshot commit.
        std::fs::write(repo_root.join("README.md"), "snapshot\n").unwrap();
        std::fs::write(repo_root.join("snap.txt"), "snap\n").unwrap();
        let add_status = Command::new("git")
            .args(["add", "."])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(add_status.success());
        let commit_status = Command::new("git")
            .args([
                "-c",
                "user.name=codex",
                "-c",
                "user.email=codex@example.com",
                "commit",
                "-m",
                "snapshot",
                "-q",
            ])
            .current_dir(&repo_root)
            .status()
            .unwrap();
        assert!(commit_status.success());
        let snapshot_sha = git_stdout(&repo_root, &["rev-parse", "HEAD"]).unwrap();

        // Dirty the working tree after the snapshot commit; this should not affect repo.patch.
        std::fs::write(repo_root.join("README.md"), "worktree\n").unwrap();

        let rollout_path = repo_root.join("rollout.jsonl");
        std::fs::write(&rollout_path, "line-1\nline-2\n").unwrap();

        let args = CreateEvalCaseArgs {
            codex_home: codex_home.path().to_path_buf(),
            conversation_id: "conv-2".to_string(),
            rollout_path,
            start: StartMarker {
                kind: StartMarkerKind::RolloutLineIndex,
                value: StartMarkerValue::LineIndex(0),
                display: "From: test".to_string(),
            },
            repo_cwd: repo_root.clone(),
            repo_snapshot: Some(RepoSnapshot {
                base_sha: base_sha.clone(),
                commit_sha: snapshot_sha,
            }),
            notes: Notes {
                what_went_wrong: "bad".to_string(),
                what_good_looks_like: "good".to_string(),
            },
            include_logs: false,
            logs_bytes: None,
        };

        let out = create_eval_case_bundle(&args).unwrap();
        assert_eq!(out.path.file_name().unwrap(), out.case_id.as_str());
        assert!(out.path.starts_with(codex_home.path().join("eval-case")));
        assert!(out.case_id.contains("my-repo"));
        // Manifest records the snapshot's base commit, not HEAD.
        let manifest_text = std::fs::read_to_string(out.path.join("manifest.json")).unwrap();
        let manifest: EvalCaseManifestV0 = serde_json::from_str(&manifest_text).unwrap();
        assert_eq!(manifest.repo.git_base.sha, base_sha);

        // The patch reflects the commit pair only, not later worktree edits.
        let patch_text =
            String::from_utf8(std::fs::read(out.path.join("repo.patch")).unwrap()).unwrap();
        assert!(
            patch_text.contains("snapshot"),
            "patch should include snapshot commit changes"
        );
        assert!(
            !patch_text.contains("worktree"),
            "patch should not include post-snapshot working tree changes"
        );
        assert!(patch_text.contains("snap.txt"));
    }
}
|
||||
@@ -155,8 +155,28 @@ pub struct CodexLogSnapshot {
|
||||
pub thread_id: String,
|
||||
}
|
||||
|
||||
/// An extra file attached to a feedback upload.
pub struct FeedbackAttachment {
    /// Filename presented for the attachment.
    pub filename: String,
    /// Optional MIME content type of the attachment.
    pub content_type: Option<String>,
    /// Raw attachment payload.
    pub bytes: Vec<u8>,
}
|
||||
|
||||
impl FeedbackAttachment {
    /// Construct an attachment from its filename, optional content type, and payload.
    pub fn new(filename: String, content_type: Option<String>, bytes: Vec<u8>) -> Self {
        Self {
            filename,
            content_type,
            bytes,
        }
    }
}
|
||||
|
||||
impl CodexLogSnapshot {
|
||||
pub(crate) fn as_bytes(&self) -> &[u8] {
|
||||
pub fn from_bytes(thread_id: String, bytes: Vec<u8>) -> Self {
|
||||
Self { bytes, thread_id }
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.bytes
|
||||
}
|
||||
|
||||
@@ -176,6 +196,25 @@ impl CodexLogSnapshot {
|
||||
include_logs: bool,
|
||||
rollout_path: Option<&std::path::Path>,
|
||||
session_source: Option<SessionSource>,
|
||||
) -> Result<()> {
|
||||
self.upload_feedback_with_attachments(
|
||||
classification,
|
||||
reason,
|
||||
include_logs,
|
||||
rollout_path,
|
||||
session_source,
|
||||
Vec::new(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn upload_feedback_with_attachments(
|
||||
&self,
|
||||
classification: &str,
|
||||
reason: Option<&str>,
|
||||
include_logs: bool,
|
||||
rollout_path: Option<&std::path::Path>,
|
||||
session_source: Option<SessionSource>,
|
||||
extra_attachments: Vec<FeedbackAttachment>,
|
||||
) -> Result<()> {
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
@@ -265,6 +304,15 @@ impl CodexLogSnapshot {
|
||||
}));
|
||||
}
|
||||
|
||||
for attachment in extra_attachments {
|
||||
envelope.add_item(EnvelopeItem::Attachment(Attachment {
|
||||
buffer: attachment.bytes,
|
||||
filename: attachment.filename,
|
||||
content_type: attachment.content_type,
|
||||
ty: None,
|
||||
}));
|
||||
}
|
||||
|
||||
client.send_envelope(envelope);
|
||||
client.flush(Some(Duration::from_secs(UPLOAD_TIMEOUT_SECS)));
|
||||
Ok(())
|
||||
|
||||
@@ -141,6 +141,11 @@ pub enum Op {
|
||||
summary: Option<ReasoningSummaryConfig>,
|
||||
},
|
||||
|
||||
/// Enable or disable repository snapshots (ghost commits) for the remainder of the session.
|
||||
///
|
||||
/// When enabled, Codex will capture per-turn git snapshots and record them in the rollout.
|
||||
SetRepoSnapshotting { enabled: bool },
|
||||
|
||||
/// Approve a command execution
|
||||
ExecApproval {
|
||||
/// The id of the submission we are approving
|
||||
|
||||
Reference in New Issue
Block a user