eval-case: implement eval capture bundles

This commit is contained in:
Noah MacCallum
2026-01-01 03:20:12 -08:00
parent 602e5fca05
commit 049c395736
15 changed files with 1133 additions and 40 deletions

15
codex-rs/Cargo.lock generated
View File

@@ -1000,6 +1000,7 @@ dependencies = [
"codex-backend-client",
"codex-common",
"codex-core",
"codex-eval-case",
"codex-feedback",
"codex-file-search",
"codex-login",
@@ -1350,6 +1351,19 @@ dependencies = [
"wiremock",
]
[[package]]
name = "codex-eval-case"
version = "0.0.0"
dependencies = [
"anyhow",
"pretty_assertions",
"serde",
"serde_json",
"tempfile",
"time",
"uuid",
]
[[package]]
name = "codex-exec"
version = "0.0.0"
@@ -1724,6 +1738,7 @@ dependencies = [
"codex-backend-client",
"codex-common",
"codex-core",
"codex-eval-case",
"codex-feedback",
"codex-file-search",
"codex-login",

View File

@@ -19,6 +19,7 @@ members = [
"exec-server",
"execpolicy",
"execpolicy-legacy",
"eval-case",
"keyring-store",
"file-search",
"linux-sandbox",
@@ -75,6 +76,7 @@ codex-common = { path = "common" }
codex-core = { path = "core" }
codex-exec = { path = "exec" }
codex-execpolicy = { path = "execpolicy" }
codex-eval-case = { path = "eval-case" }
codex-feedback = { path = "feedback" }
codex-file-search = { path = "file-search" }
codex-git = { path = "utils/git" }

View File

@@ -174,6 +174,11 @@ client_request_definitions! {
response: v2::FeedbackUploadResponse,
},
EvalCaseCreate => "evalCase/create" {
params: v2::EvalCaseCreateParams,
response: v2::EvalCaseCreateResponse,
},
/// Execute a command (argv vector) under the server's sandbox.
OneOffCommandExec => "command/exec" {
params: v2::CommandExecParams,

View File

@@ -930,6 +930,50 @@ pub struct FeedbackUploadResponse {
pub thread_id: String,
}
/// How an eval-case start marker addresses a line in the rollout file.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "snake_case")]
#[ts(export_to = "v2/")]
pub enum EvalCaseStartMarkerKind {
RolloutLineTimestamp,
RolloutLineIndex,
}
/// Start-marker payload; `untagged` so it serializes as a bare string
/// (timestamp) or a bare integer (line index) rather than a tagged object.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(untagged)]
#[ts(export_to = "v2/")]
pub enum EvalCaseStartMarkerValue {
Timestamp(String),
LineIndex(u64),
}
/// Marks where replay/interpretation of a rollout should begin.
/// `kind` and `value` must agree (timestamp kind with a timestamp value,
/// index kind with an index value); the server rejects mismatched pairs.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
pub struct EvalCaseStartMarker {
pub kind: EvalCaseStartMarkerKind,
pub value: EvalCaseStartMarkerValue,
/// Human-readable label for the marker.
pub display: String,
}
/// Parameters for the `evalCase/create` JSON-RPC request.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
pub struct EvalCaseCreateParams {
pub thread_id: String,
pub start: EvalCaseStartMarker,
pub what_went_wrong: String,
pub what_good_looks_like: String,
/// Must currently be `true`: bundles always include `codex-logs.log`, and
/// the server rejects requests that set this to `false`.
pub include_logs: bool,
}
/// Result of `evalCase/create`: the generated case id and the bundle path.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
pub struct EvalCaseCreateResponse {
pub case_id: String,
pub path: String,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]

View File

@@ -21,6 +21,7 @@ codex-arg0 = { workspace = true }
codex-common = { workspace = true, features = ["cli"] }
codex-core = { workspace = true }
codex-backend-client = { workspace = true }
codex-eval-case = { workspace = true }
codex-file-search = { workspace = true }
codex-login = { workspace = true }
codex-protocol = { workspace = true }

View File

@@ -25,6 +25,8 @@ use codex_app_server_protocol::ClientRequest;
use codex_app_server_protocol::CommandExecParams;
use codex_app_server_protocol::ConversationGitInfo;
use codex_app_server_protocol::ConversationSummary;
use codex_app_server_protocol::EvalCaseCreateParams;
use codex_app_server_protocol::EvalCaseCreateResponse;
use codex_app_server_protocol::ExecOneOffCommandResponse;
use codex_app_server_protocol::FeedbackUploadParams;
use codex_app_server_protocol::FeedbackUploadResponse;
@@ -512,6 +514,9 @@ impl CodexMessageProcessor {
ClientRequest::FeedbackUpload { request_id, params } => {
self.upload_feedback(request_id, params).await;
}
ClientRequest::EvalCaseCreate { request_id, params } => {
self.eval_case_create(request_id, params).await;
}
}
}
@@ -3301,6 +3306,170 @@ impl CodexMessageProcessor {
}
}
/// Handle `evalCase/create`: validate the request, locate the thread's rollout
/// file, translate the wire start marker into the eval-case crate's marker,
/// derive a repo snapshot for that marker, and write the bundle on a blocking
/// task. Every failure path replies with a JSON-RPC error and returns early.
async fn eval_case_create(&self, request_id: RequestId, params: EvalCaseCreateParams) {
let EvalCaseCreateParams {
thread_id,
start,
what_went_wrong,
what_good_looks_like,
include_logs,
} = params;
// Bundles always ship logs today; reject callers that ask to omit them so
// the flag is honored explicitly instead of being silently ignored.
if !include_logs {
let error = JSONRPCErrorError {
code: INVALID_REQUEST_ERROR_CODE,
message: "eval case bundles always include codex-logs.log; set include_logs=true"
.to_string(),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
}
// The wire format carries the conversation id as a string; parse it first.
let conversation_id = match ConversationId::from_string(&thread_id) {
Ok(id) => id,
Err(err) => {
let error = JSONRPCErrorError {
code: INVALID_REQUEST_ERROR_CODE,
message: format!("invalid thread id: {err}"),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
}
};
let Some(rollout_path) = self.resolve_rollout_path(conversation_id).await else {
let error = JSONRPCErrorError {
code: INVALID_REQUEST_ERROR_CODE,
message: "could not resolve rollout path for thread".to_string(),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
};
// Read the whole rollout up front: it is needed both for snapshot lookup
// here and (via its path) for copying into the bundle later.
let rollout_text = match std::fs::read_to_string(&rollout_path) {
Ok(text) => text,
Err(err) => {
let error = JSONRPCErrorError {
code: INTERNAL_ERROR_CODE,
message: format!(
"failed to read rollout file {}: {err}",
rollout_path.display()
),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
}
};
// Map the protocol marker to the eval-case crate marker, rejecting
// kind/value combinations that do not line up.
let start_marker = match (&start.kind, &start.value) {
(
codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineTimestamp,
codex_app_server_protocol::EvalCaseStartMarkerValue::Timestamp(value),
) => codex_eval_case::StartMarker {
kind: codex_eval_case::StartMarkerKind::RolloutLineTimestamp,
value: codex_eval_case::StartMarkerValue::Timestamp(value.clone()),
display: start.display.clone(),
},
(
codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineIndex,
codex_app_server_protocol::EvalCaseStartMarkerValue::LineIndex(value),
) => codex_eval_case::StartMarker {
kind: codex_eval_case::StartMarkerKind::RolloutLineIndex,
value: codex_eval_case::StartMarkerValue::LineIndex(*value),
display: start.display.clone(),
},
_ => {
let error = JSONRPCErrorError {
code: INVALID_REQUEST_ERROR_CODE,
message: "start marker kind/value mismatch".to_string(),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
}
};
// "Start now" means the marker points at the current end of the rollout,
// where no snapshot is expected; such requests may proceed snapshot-less.
// NOTE(review): matching the literal display string "Start now" is
// stringly-typed — confirm all clients use this exact label.
let last_line_index = rollout_text.lines().count().saturating_sub(1);
let requested_start_is_now = start.display == "Start now"
|| matches!((&start.kind, &start.value),
(codex_app_server_protocol::EvalCaseStartMarkerKind::RolloutLineIndex,
codex_app_server_protocol::EvalCaseStartMarkerValue::LineIndex(index))
if usize::try_from(*index).ok() == Some(last_line_index)
);
let repo_snapshot = repo_snapshot_from_rollout(&rollout_text, &start_marker);
if repo_snapshot.is_none() && !requested_start_is_now {
let error = JSONRPCErrorError {
code: INVALID_REQUEST_ERROR_CODE,
message: "no repo snapshot available for requested start marker".to_string(),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
}
// include_logs is guaranteed true here (checked above), so this always
// captures a log snapshot for the conversation.
let logs_bytes = include_logs.then(|| {
self.feedback
.snapshot(Some(conversation_id))
.as_bytes()
.to_vec()
});
let args = codex_eval_case::CreateEvalCaseArgs {
codex_home: self.config.codex_home.clone(),
conversation_id: thread_id.clone(),
rollout_path,
start: start_marker,
repo_cwd: self.config.cwd.clone(),
repo_snapshot,
notes: codex_eval_case::Notes {
what_went_wrong,
what_good_looks_like,
},
include_logs,
logs_bytes,
};
// Bundle creation does blocking file and git I/O; keep it off the async
// runtime's worker threads.
let result =
tokio::task::spawn_blocking(move || codex_eval_case::create_eval_case_bundle(&args))
.await;
// First layer: the join error (task panicked / cancelled).
let result = match result {
Ok(outcome) => outcome,
Err(join_err) => {
let error = JSONRPCErrorError {
code: INTERNAL_ERROR_CODE,
message: format!("failed to create eval case bundle: {join_err}"),
data: None,
};
self.outgoing.send_error(request_id, error).await;
return;
}
};
// Second layer: the bundle-creation result itself.
match result {
Ok(outcome) => {
let response = EvalCaseCreateResponse {
case_id: outcome.case_id,
path: outcome.path.display().to_string(),
};
self.outgoing.send_response(request_id, response).await;
}
Err(err) => {
let error = JSONRPCErrorError {
code: INTERNAL_ERROR_CODE,
message: format!("failed to create eval case bundle: {err}"),
data: None,
};
self.outgoing.send_error(request_id, error).await;
}
}
}
async fn resolve_rollout_path(&self, conversation_id: ConversationId) -> Option<PathBuf> {
match self
.conversation_manager
@@ -3313,6 +3482,56 @@ impl CodexMessageProcessor {
}
}
fn repo_snapshot_from_rollout(
rollout_text: &str,
start_marker: &codex_eval_case::StartMarker,
) -> Option<codex_eval_case::RepoSnapshot> {
match &start_marker.value {
codex_eval_case::StartMarkerValue::LineIndex(index) => {
let index = usize::try_from(*index).ok()?;
repo_snapshot_from_rollout_line_index(rollout_text, index)
}
codex_eval_case::StartMarkerValue::Timestamp(timestamp) => {
repo_snapshot_from_rollout_timestamp(rollout_text, timestamp)
}
}
}
/// Find the rollout line whose `timestamp` matches, then look for the first
/// ghost snapshot after it. Returns `None` when no line carries the timestamp.
fn repo_snapshot_from_rollout_timestamp(
rollout_text: &str,
timestamp: &str,
) -> Option<codex_eval_case::RepoSnapshot> {
for (line_no, raw) in rollout_text.lines().enumerate() {
// Unparseable lines are skipped rather than treated as errors.
let Ok(parsed) = serde_json::from_str::<codex_protocol::protocol::RolloutLine>(raw) else {
continue;
};
if parsed.timestamp == timestamp {
return repo_snapshot_from_rollout_line_index(rollout_text, line_no);
}
}
None
}
/// Scan the rollout strictly *after* `start_index` and return the first ghost
/// snapshot as a `RepoSnapshot`. Snapshots without a parent commit are skipped
/// (no base SHA can be derived from them), as are unparseable lines.
fn repo_snapshot_from_rollout_line_index(
rollout_text: &str,
start_index: usize,
) -> Option<codex_eval_case::RepoSnapshot> {
let first_candidate = start_index.saturating_add(1);
for raw in rollout_text.lines().skip(first_candidate) {
let Ok(parsed) = serde_json::from_str::<codex_protocol::protocol::RolloutLine>(raw) else {
continue;
};
let codex_protocol::protocol::RolloutItem::ResponseItem(
codex_protocol::models::ResponseItem::GhostSnapshot { ghost_commit },
) = parsed.item
else {
continue;
};
// A snapshot with no parent cannot provide a base SHA; keep scanning.
let Some(parent) = ghost_commit.parent() else {
continue;
};
return Some(codex_eval_case::RepoSnapshot {
base_sha: parent.to_string(),
commit_sha: ghost_commit.id().to_string(),
});
}
None
}
fn skills_to_info(
skills: &[codex_core::skills::SkillMetadata],
) -> Vec<codex_app_server_protocol::SkillMetadata> {
@@ -3619,4 +3838,58 @@ mod tests {
assert_eq!(summary, expected);
Ok(())
}
// Verifies that starting at line 0 (the user message) finds the ghost snapshot
// on the following line and extracts base/commit SHAs from it.
#[test]
fn repo_snapshot_from_rollout_line_index_finds_next_ghost_snapshot() -> Result<()> {
use serde_json::json;
// Line 0: an ordinary user message — the selected start marker.
let user_line = json!({
"timestamp": "2025-09-05T16:53:11.850Z",
"type": "response_item",
"payload": {
"type": "message",
"role": "user",
"content": [{
"type": "input_text",
"text": "hi",
}],
},
});
// Line 1: the ghost snapshot the lookup should resolve to.
let ghost_line = json!({
"timestamp": "2025-09-05T16:53:12.000Z",
"type": "response_item",
"payload": {
"type": "ghost_snapshot",
"ghost_commit": {
"id": "ghost-sha",
"parent": "base-sha",
"preexisting_untracked_files": [],
"preexisting_untracked_dirs": [],
},
},
});
let rollout_text = format!(
"{}\n{}\n",
serde_json::to_string(&user_line)?,
serde_json::to_string(&ghost_line)?
);
let start = codex_eval_case::StartMarker {
kind: codex_eval_case::StartMarkerKind::RolloutLineIndex,
value: codex_eval_case::StartMarkerValue::LineIndex(0),
display: "From: hi".to_string(),
};
let snapshot = repo_snapshot_from_rollout(&rollout_text, &start).expect("snapshot");
assert_eq!(
snapshot,
codex_eval_case::RepoSnapshot {
base_sha: "base-sha".to_string(),
commit_sha: "ghost-sha".to_string(),
}
);
Ok(())
}
}

View File

@@ -209,34 +209,19 @@ pub fn apply_hunks(
stdout: &mut impl std::io::Write,
stderr: &mut impl std::io::Write,
) -> Result<(), ApplyPatchError> {
let _existing_paths: Vec<&Path> = hunks
.iter()
.filter_map(|hunk| match hunk {
Hunk::AddFile { .. } => {
// The file is being added, so it doesn't exist yet.
None
}
Hunk::DeleteFile { path } => Some(path.as_path()),
Hunk::UpdateFile {
path, move_path, ..
} => match move_path {
Some(move_path) => {
if std::fs::metadata(move_path)
.map(|m| m.is_file())
.unwrap_or(false)
{
Some(move_path.as_path())
} else {
None
}
}
None => Some(path.as_path()),
},
})
.collect::<Vec<&Path>>();
apply_hunks_in_dir(hunks, Path::new("."), stdout, stderr)
}
/// Applies hunks relative to `cwd` (without mutating the process working directory) and writes the
/// same stdout/stderr output as `apply_hunks`.
pub fn apply_hunks_in_dir(
hunks: &[Hunk],
cwd: &Path,
stdout: &mut impl std::io::Write,
stderr: &mut impl std::io::Write,
) -> Result<(), ApplyPatchError> {
// Delegate to a helper that applies each hunk to the filesystem.
match apply_hunks_to_files(hunks) {
match apply_hunks_to_files_in_dir(hunks, cwd) {
Ok(affected) => {
print_summary(&affected, stdout).map_err(ApplyPatchError::from)?;
Ok(())
@@ -267,7 +252,7 @@ pub struct AffectedPaths {
/// Apply the hunks to the filesystem, returning which files were added, modified, or deleted.
/// Returns an error if the patch could not be applied.
fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result<AffectedPaths> {
fn apply_hunks_to_files_in_dir(hunks: &[Hunk], cwd: &Path) -> anyhow::Result<AffectedPaths> {
if hunks.is_empty() {
anyhow::bail!("No files were modified.");
}
@@ -278,19 +263,29 @@ fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result<AffectedPaths> {
for hunk in hunks {
match hunk {
Hunk::AddFile { path, contents } => {
if let Some(parent) = path.parent()
let target_path = if path.is_absolute() {
path.clone()
} else {
cwd.join(path)
};
if let Some(parent) = target_path.parent()
&& !parent.as_os_str().is_empty()
{
std::fs::create_dir_all(parent).with_context(|| {
format!("Failed to create parent directories for {}", path.display())
})?;
}
std::fs::write(path, contents)
std::fs::write(&target_path, contents)
.with_context(|| format!("Failed to write file {}", path.display()))?;
added.push(path.clone());
}
Hunk::DeleteFile { path } => {
std::fs::remove_file(path)
let target_path = if path.is_absolute() {
path.clone()
} else {
cwd.join(path)
};
std::fs::remove_file(&target_path)
.with_context(|| format!("Failed to delete file {}", path.display()))?;
deleted.push(path.clone());
}
@@ -300,22 +295,38 @@ fn apply_hunks_to_files(hunks: &[Hunk]) -> anyhow::Result<AffectedPaths> {
chunks,
} => {
let AppliedPatch { new_contents, .. } =
derive_new_contents_from_chunks(path, chunks)?;
derive_new_contents_from_chunks_in_dir(path, chunks, cwd)?;
if let Some(dest) = move_path {
if let Some(parent) = dest.parent()
let dest_path = if dest.is_absolute() {
dest.clone()
} else {
cwd.join(dest)
};
if let Some(parent) = dest_path.parent()
&& !parent.as_os_str().is_empty()
{
std::fs::create_dir_all(parent).with_context(|| {
format!("Failed to create parent directories for {}", dest.display())
})?;
}
std::fs::write(dest, new_contents)
std::fs::write(&dest_path, new_contents)
.with_context(|| format!("Failed to write file {}", dest.display()))?;
std::fs::remove_file(path)
let src_path = if path.is_absolute() {
path.clone()
} else {
cwd.join(path)
};
std::fs::remove_file(&src_path)
.with_context(|| format!("Failed to remove original {}", path.display()))?;
modified.push(dest.clone());
} else {
std::fs::write(path, new_contents)
let target_path = if path.is_absolute() {
path.clone()
} else {
cwd.join(path)
};
std::fs::write(&target_path, new_contents)
.with_context(|| format!("Failed to write file {}", path.display()))?;
modified.push(path.clone());
}
@@ -340,7 +351,21 @@ fn derive_new_contents_from_chunks(
path: &Path,
chunks: &[UpdateFileChunk],
) -> std::result::Result<AppliedPatch, ApplyPatchError> {
let original_contents = match std::fs::read_to_string(path) {
derive_new_contents_from_chunks_in_dir(path, chunks, Path::new("."))
}
fn derive_new_contents_from_chunks_in_dir(
path: &Path,
chunks: &[UpdateFileChunk],
cwd: &Path,
) -> std::result::Result<AppliedPatch, ApplyPatchError> {
let target_path = if path.is_absolute() {
path.to_path_buf()
} else {
cwd.join(path)
};
let original_contents = match std::fs::read_to_string(&target_path) {
Ok(contents) => contents,
Err(err) => {
return Err(ApplyPatchError::IoError(IoError {

View File

@@ -343,6 +343,7 @@ pub(crate) struct Session {
/// The set of enabled features should be invariant for the lifetime of the
/// session.
features: Features,
repo_snapshotting_enabled: AtomicBool,
pub(crate) active_turn: Mutex<Option<ActiveTurn>>,
pub(crate) services: SessionServices,
next_internal_sub_id: AtomicU64,
@@ -673,6 +674,7 @@ impl Session {
tx_event: tx_event.clone(),
state: Mutex::new(state),
features: config.features.clone(),
repo_snapshotting_enabled: AtomicBool::new(false),
active_turn: Mutex::new(None),
services,
next_internal_sub_id: AtomicU64::new(0),
@@ -1439,7 +1441,9 @@ impl Session {
turn_context: Arc<TurnContext>,
cancellation_token: CancellationToken,
) {
if !self.enabled(Feature::GhostCommit) {
if !self.enabled(Feature::GhostCommit)
&& !self.repo_snapshotting_enabled.load(Ordering::Relaxed)
{
return;
}
let token = match turn_context.tool_call_gate.subscribe().await {
@@ -1609,6 +1613,9 @@ async fn submission_loop(sess: Arc<Session>, config: Arc<Config>, rx_sub: Receiv
)
.await;
}
Op::SetRepoSnapshotting { enabled } => {
handlers::set_repo_snapshotting(&sess, enabled).await;
}
Op::UserInput { .. } | Op::UserTurn { .. } => {
handlers::user_input_or_turn(&sess, sub.id.clone(), sub.op, &mut previous_context)
.await;
@@ -1707,6 +1714,7 @@ mod handlers {
use mcp_types::RequestId;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use tracing::info;
use tracing::warn;
@@ -1731,6 +1739,11 @@ mod handlers {
}
}
/// Toggle per-session repo snapshotting at runtime, independently of the
/// `GhostCommit` feature flag (the session snapshots when either is set).
/// Relaxed ordering suffices: this standalone flag guards no other memory.
pub async fn set_repo_snapshotting(sess: &Session, enabled: bool) {
sess.repo_snapshotting_enabled
.store(enabled, Ordering::Relaxed);
}
pub async fn user_input_or_turn(
sess: &Arc<Session>,
sub_id: String,
@@ -3161,6 +3174,7 @@ mod tests {
tx_event,
state: Mutex::new(state),
features: config.features.clone(),
repo_snapshotting_enabled: AtomicBool::new(false),
active_turn: Mutex::new(None),
services,
next_internal_sub_id: AtomicU64::new(0),
@@ -3248,6 +3262,7 @@ mod tests {
tx_event,
state: Mutex::new(state),
features: config.features.clone(),
repo_snapshotting_enabled: AtomicBool::new(false),
active_turn: Mutex::new(None),
services,
next_internal_sub_id: AtomicU64::new(0),

View File

@@ -23,6 +23,7 @@ use codex_protocol::protocol::ReviewDecision;
use futures::future::BoxFuture;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
#[derive(Clone, Debug)]
pub struct ApplyPatchRequest {
@@ -143,6 +144,48 @@ impl ToolRuntime<ApplyPatchRequest, ExecToolCallOutput> for ApplyPatchRuntime {
attempt: &SandboxAttempt<'_>,
ctx: &ToolCtx<'_>,
) -> Result<ExecToolCallOutput, ToolError> {
// When there is no sandbox in play (DangerFullAccess / bypassed sandbox), apply the patch
// in-process. This avoids relying on the current executable implementing the arg0/argv1
// dispatch behavior (which is not true for unit/integration test binaries).
if attempt.sandbox == crate::exec::SandboxType::None {
let started = Instant::now();
let mut stdout = Vec::new();
let mut stderr = Vec::new();
// Apply the patch relative to `req.cwd` without mutating the process CWD.
let exit_code = match codex_apply_patch::parse_patch(&req.patch) {
Ok(args) => {
match codex_apply_patch::apply_hunks_in_dir(
&args.hunks,
&req.cwd,
&mut stdout,
&mut stderr,
) {
Ok(()) => 0,
Err(_) => 1,
}
}
// Reuse codex-apply-patch's error formatting for parse errors.
Err(_) => {
match codex_apply_patch::apply_patch(&req.patch, &mut stdout, &mut stderr) {
Ok(()) => 0,
Err(_) => 1,
}
}
};
let stdout = String::from_utf8_lossy(&stdout).to_string();
let stderr = String::from_utf8_lossy(&stderr).to_string();
let aggregated_output = format!("{stdout}{stderr}");
return Ok(ExecToolCallOutput {
exit_code,
stdout: crate::exec::StreamOutput::new(stdout),
stderr: crate::exec::StreamOutput::new(stderr),
aggregated_output: crate::exec::StreamOutput::new(aggregated_output),
duration: started.elapsed(),
timed_out: false,
});
}
let spec = Self::build_command_spec(req)?;
let env = attempt
.env_for(spec)

View File

@@ -8,6 +8,7 @@ use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
use codex_protocol::user_input::UserInput;
use core_test_support::responses::ResponsesRequest;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call;
@@ -277,7 +278,7 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> {
ev_completed("resp-2"),
]),
];
mount_sse_sequence(harness.server(), responses).await;
let requests = mount_sse_sequence(harness.server(), responses).await;
let model = test.session_configured.model.clone();
codex
@@ -297,7 +298,14 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> {
wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
assert_eq!(fs::read_to_string(&target).await?, "hello from snapshot\n");
let tool_output = extract_call_output_text(&requests.requests(), call_id);
let target_contents = fs::read_to_string(&target).await.unwrap_or_else(|err| {
panic!(
"expected patch to create {}: {err}; tool output={tool_output:?}",
target.display()
)
});
assert_eq!(target_contents, "hello from snapshot\n");
let mut entries = fs::read_dir(codex_home.join("shell_snapshots")).await?;
let snapshot_path = entries
@@ -311,6 +319,35 @@ async fn shell_command_snapshot_still_intercepts_apply_patch() -> Result<()> {
Ok(())
}
/// Pull the textual tool output for `call_id` out of the captured requests.
/// Accepts both a plain string `output` and an object form with a string
/// `content` field; falls back to a placeholder when nothing matches.
fn extract_call_output_text(requests: &[ResponsesRequest], call_id: &str) -> String {
for request in requests {
let input = request.input();
let matching = input.iter().find(|candidate| {
candidate.get("type").and_then(serde_json::Value::as_str)
== Some("function_call_output")
&& candidate.get("call_id").and_then(serde_json::Value::as_str) == Some(call_id)
});
let Some(entry) = matching else {
continue;
};
match entry.get("output") {
Some(serde_json::Value::String(text)) => return text.clone(),
Some(serde_json::Value::Object(fields)) => {
if let Some(text) = fields.get("content").and_then(serde_json::Value::as_str) {
return text.to_string();
}
}
_ => {}
}
}
"<missing tool output>".to_string()
}
#[cfg_attr(target_os = "windows", ignore)]
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn shell_snapshot_deleted_after_shutdown_with_skills() -> Result<()> {

View File

@@ -0,0 +1,44 @@
# Eval Capture Bundles
Codex can capture "eval case" bundles from the `/feedback` flow: when a session produces a bad result, it can be captured as a reproducible eval sample.
These bundles are meant to turn real failures into reproducible, local-first artifacts.
## Where Bundles Are Written
Bundles are stored under:
`$CODEX_HOME/eval-case/<case-id>/`
## Bundle Contents
Each bundle contains:
- `manifest.json` - metadata about the capture (schema version, start marker, notes, repo base).
- `rollout.jsonl` - the full session rollout (multi-turn trajectory).
- `repo.patch` - a git patch representing the repository state at the chosen start marker.
- `codex-logs.log` - tracing logs to help maintainers debug the session.
## Start Marker And Repo State
Bundles include the entire rollout, but also record a start marker to indicate where an eval
harness (or a human) should begin replaying/interpreting the trajectory.
The repository patch must match that chosen start marker:
- If the session has repo snapshots available, `repo.patch` is derived from the ghost snapshot
commit associated with the selected user turn (diff from the snapshot's base commit to the
snapshot commit).
- If no snapshot is available for a given start marker, the TUI disables that option (and may
fall back to the basic feedback flow instead).
For reproducibility outside your machine, the base commit recorded in `manifest.json` should be
reachable by maintainers (for example, pushed and available on the default branch).
## App-Server API (For Integrations)
Non-TUI clients can create bundles via the app-server JSON-RPC method:
- `evalCase/create`
The handler copies the rollout into the bundle and derives `repo.patch` based on the selected start
marker when repo snapshots are available.

View File

@@ -0,0 +1,20 @@
[package]
name = "codex-eval-case"
version.workspace = true
edition.workspace = true
license.workspace = true
[lints]
workspace = true
[dependencies]
anyhow = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
time = { workspace = true, features = ["formatting"] }
uuid = { workspace = true, features = ["v4"] }
[dev-dependencies]
pretty_assertions = { workspace = true }
tempfile = { workspace = true }

View File

@@ -0,0 +1,516 @@
use std::path::Path;
use std::path::PathBuf;
use std::process::Command;
use anyhow::Context as _;
use serde::Deserialize;
use serde::Serialize;
use time::OffsetDateTime;
use time::format_description::well_known::Rfc3339;
use uuid::Uuid;
/// How a start marker addresses a line of the rollout file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum StartMarkerKind {
RolloutLineTimestamp,
RolloutLineIndex,
}
/// Start-marker payload; `untagged` so it serializes as a bare string
/// (timestamp) or a bare integer (line index).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(untagged)]
pub enum StartMarkerValue {
Timestamp(String),
LineIndex(u64),
}
/// Marks where replay/interpretation of the captured rollout should begin.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StartMarker {
pub kind: StartMarkerKind,
pub value: StartMarkerValue,
/// Human-readable label for the marker.
pub display: String,
}
/// The git commit the bundle's `repo.patch` applies on top of.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct GitBase {
pub sha: String,
/// Free-form guidance recorded in the manifest (e.g. reachability advice).
pub note: String,
}
/// Manifest record describing the bundled rollout file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RolloutInfo {
pub filename: String,
pub start: StartMarker,
}
/// Manifest record describing the repository the capture came from.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RepoInfo {
pub cwd: String,
pub git_base: GitBase,
pub patch_filename: String,
}
/// Free-text notes supplied by the user when capturing the case.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Notes {
pub what_went_wrong: String,
pub what_good_looks_like: String,
}
/// Which optional artifacts the bundle contains.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Artifacts {
pub include_logs: bool,
}
/// Schema-versioned manifest written to `manifest.json` inside each bundle.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EvalCaseManifestV0 {
/// Manifest schema version (currently "v0").
pub version: String,
pub case_id: String,
/// RFC 3339 creation timestamp.
pub created_at: String,
pub conversation_id: String,
/// Where the capture originated (e.g. "cli").
pub source: String,
pub rollout: RolloutInfo,
pub repo: RepoInfo,
pub notes: Notes,
pub artifacts: Artifacts,
}
/// Inputs to `create_eval_case_bundle`.
#[derive(Debug, Clone)]
pub struct CreateEvalCaseArgs {
pub codex_home: PathBuf,
pub conversation_id: String,
pub rollout_path: PathBuf,
pub start: StartMarker,
pub repo_cwd: PathBuf,
/// When present, derive `repo.patch` from the provided commit snapshot instead of the current
/// working tree.
pub repo_snapshot: Option<RepoSnapshot>,
pub notes: Notes,
pub include_logs: bool,
/// Pre-captured log contents; written as `codex-logs.log` when logs are included.
pub logs_bytes: Option<Vec<u8>>,
}
/// A pair of commits recorded by a repo snapshot: the base and the snapshot commit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepoSnapshot {
pub base_sha: String,
pub commit_sha: String,
}
/// Outcome of a successful bundle creation: the generated id and on-disk path.
#[derive(Debug, Clone)]
pub struct CreateEvalCaseResult {
pub case_id: String,
pub path: PathBuf,
}
/// Reduce an arbitrary name to a short, filesystem-friendly slug: ASCII
/// alphanumerics are lowercased and kept; every run of other characters
/// collapses to a single '-', with no leading or trailing dash. An input
/// with no usable characters yields "repo".
fn sanitize_repo_slug(input: &str) -> String {
let mut slug = String::with_capacity(input.len());
// Defer separator emission until the next alphanumeric arrives, so runs
// collapse and edge dashes never appear.
let mut pending_separator = false;
for ch in input.chars().map(|c| c.to_ascii_lowercase()) {
if ch.is_ascii_alphanumeric() {
if pending_separator && !slug.is_empty() {
slug.push('-');
}
slug.push(ch);
pending_separator = false;
} else {
pending_separator = true;
}
}
if slug.is_empty() {
"repo".to_string()
} else {
slug
}
}
/// Derive the slug used in bundle ids from the repository name.
///
/// Prefers the basename of `git rev-parse --show-toplevel`; falls back to the
/// basename of `repo_cwd`, then to the literal "repo" when neither yields a
/// name. The result is sanitized for filesystem use.
fn repo_slug(repo_cwd: &Path) -> String {
// Prefer the git repo root name, but fall back to the cwd basename.
let top_level = git_stdout(repo_cwd, &["rev-parse", "--show-toplevel"])
.ok()
.map(PathBuf::from);
let basename = top_level
.as_ref()
.and_then(|p| p.file_name())
.or_else(|| repo_cwd.file_name());
let basename = basename
.map(|name| name.to_string_lossy().to_string())
.unwrap_or_else(|| "repo".to_string());
sanitize_repo_slug(&basename)
}
/// Create an eval-case bundle under `$CODEX_HOME/eval-case/<case-id>/`.
///
/// The bundle contains a copy of the rollout (`rollout.jsonl`), a repo patch
/// (`repo.patch`) derived either from the recorded snapshot commits or from
/// the live working tree vs. `HEAD`, the captured logs (`codex-logs.log`,
/// when requested), and a `manifest.json` describing the capture.
///
/// # Errors
/// Fails when the bundle directory cannot be created, the rollout cannot be
/// copied, git patch generation fails, or any artifact cannot be written.
pub fn create_eval_case_bundle(args: &CreateEvalCaseArgs) -> anyhow::Result<CreateEvalCaseResult> {
let created_at = OffsetDateTime::now_utc();
let created_at_rfc3339 = created_at.format(&Rfc3339).context("format created_at")?;
// Hand-rolled timestamp with '-' instead of ':' — colons are not portable
// in directory names.
let ts_for_id = format!(
"{:04}-{:02}-{:02}T{:02}-{:02}-{:02}",
created_at.year(),
u8::from(created_at.month()),
created_at.day(),
created_at.hour(),
created_at.minute(),
created_at.second()
);
// Short, human-scannable id: datetime + repo + 6-digit suffix.
// Collision risk is low and acceptable for local bundles.
let repo = repo_slug(&args.repo_cwd);
let id6 = (Uuid::new_v4().as_u128() % 1_000_000) as u32;
let case_id = format!("{ts_for_id}-{repo}-{id6:06}");
let bundle_dir = args.codex_home.join("eval-case").join(&case_id);
std::fs::create_dir_all(&bundle_dir)
.with_context(|| format!("create eval bundle dir {}", bundle_dir.display()))?;
let rollout_dst = bundle_dir.join("rollout.jsonl");
std::fs::copy(&args.rollout_path, &rollout_dst).with_context(|| {
format!(
"copy rollout {} -> {}",
args.rollout_path.display(),
rollout_dst.display()
)
})?;
// Prefer the snapshot-based patch (reproducible against recorded commits);
// otherwise diff the live working tree against HEAD.
let (base_sha, patch) = match args.repo_snapshot.as_ref() {
Some(snapshot) => {
git_patch_between_commits(&args.repo_cwd, &snapshot.base_sha, &snapshot.commit_sha)
.with_context(|| {
format!(
"generate patch for repo snapshot {}..{}",
snapshot.base_sha, snapshot.commit_sha
)
})?
}
None => git_patch_against_head(&args.repo_cwd)?,
};
let patch_path = bundle_dir.join("repo.patch");
std::fs::write(&patch_path, patch)
.with_context(|| format!("write patch {}", patch_path.display()))?;
if args.include_logs {
let logs_path = bundle_dir.join("codex-logs.log");
// Borrow the captured bytes instead of cloning the whole buffer; an
// absent capture still writes an empty file so the layout is stable.
let bytes: &[u8] = args.logs_bytes.as_deref().unwrap_or(&[]);
std::fs::write(&logs_path, bytes)
.with_context(|| format!("write logs {}", logs_path.display()))?;
}
let manifest = EvalCaseManifestV0 {
version: "v0".to_string(),
case_id: case_id.clone(),
created_at: created_at_rfc3339,
conversation_id: args.conversation_id.clone(),
source: "cli".to_string(),
rollout: RolloutInfo {
filename: "rollout.jsonl".to_string(),
start: args.start.clone(),
},
repo: RepoInfo {
cwd: args.repo_cwd.display().to_string(),
git_base: GitBase {
sha: base_sha,
note: "For reproducibility, the base commit should be reachable (e.g. pushed / on main)."
.to_string(),
},
patch_filename: "repo.patch".to_string(),
},
notes: args.notes.clone(),
artifacts: Artifacts {
include_logs: args.include_logs,
},
};
let manifest_path = bundle_dir.join("manifest.json");
let manifest_json = serde_json::to_string_pretty(&manifest).context("serialize manifest")?;
std::fs::write(&manifest_path, format!("{manifest_json}\n"))
.with_context(|| format!("write manifest {}", manifest_path.display()))?;
Ok(CreateEvalCaseResult {
case_id,
path: bundle_dir,
})
}
/// Build a patch of the working tree relative to `HEAD`.
///
/// Returns `(base_sha, patch_bytes)`. `base_sha` falls back to the literal
/// "unknown" when `HEAD` cannot be resolved. The patch concatenates the
/// tracked-file diff against `HEAD` with one `--no-index` diff per untracked
/// (non-ignored) file, so brand-new files appear as additions. Individual
/// diff failures are skipped best-effort rather than aborting the capture.
fn git_patch_against_head(repo_cwd: &Path) -> anyhow::Result<(String, Vec<u8>)> {
let base_sha =
git_stdout(repo_cwd, &["rev-parse", "HEAD"]).unwrap_or_else(|_| "unknown".to_string());
let mut patch = Vec::new();
if let Ok(mut bytes) = git_diff(
repo_cwd,
&["diff", "--no-textconv", "--no-ext-diff", "--binary", "HEAD"],
) {
patch.append(&mut bytes);
}
let untracked =
git_stdout(repo_cwd, &["ls-files", "--others", "--exclude-standard"]).unwrap_or_default();
for file in untracked.lines().map(str::trim).filter(|s| !s.is_empty()) {
// Diff each untracked file against the platform null device so the
// patch records it as a newly added file.
let null_device = if cfg!(windows) { "NUL" } else { "/dev/null" };
let args = [
"diff",
"--no-textconv",
"--no-ext-diff",
"--binary",
"--no-index",
"--",
null_device,
file,
];
if let Ok(mut bytes) = git_diff(repo_cwd, &args) {
patch.append(&mut bytes);
}
}
Ok((base_sha, patch))
}
/// Run `git` with `args` in `repo_cwd` and return its trimmed stdout.
/// Errors when the process cannot be spawned, exits non-zero, or emits
/// non-UTF-8 output.
fn git_stdout(repo_cwd: &Path, args: &[&str]) -> anyhow::Result<String> {
let rendered = args.join(" ");
let output = Command::new("git")
.current_dir(repo_cwd)
.args(args)
.output()
.with_context(|| format!("run git {rendered}"))?;
if output.status.success() {
let text = String::from_utf8(output.stdout).context("decode git stdout")?;
Ok(text.trim().to_string())
} else {
anyhow::bail!("git {rendered} failed with {}", output.status)
}
}
/// Run a `git diff`-style command and return raw stdout bytes.
/// Exit code 1 is treated as success: `git diff` uses it to signal
/// "differences found", not failure.
fn git_diff(repo_cwd: &Path, args: &[&str]) -> anyhow::Result<Vec<u8>> {
let rendered = args.join(" ");
let output = Command::new("git")
.current_dir(repo_cwd)
.args(args)
.output()
.with_context(|| format!("run git {rendered}"))?;
match output.status.code() {
Some(0) | Some(1) => Ok(output.stdout),
_ => anyhow::bail!("git {rendered} failed with {}", output.status),
}
}
/// Produces a binary patch for the changes between `base_sha` and
/// `commit_sha`, returning `(base_sha, patch_bytes)`.
fn git_patch_between_commits(
    repo_cwd: &Path,
    base_sha: &str,
    commit_sha: &str,
) -> anyhow::Result<(String, Vec<u8>)> {
    let args = [
        "diff",
        "--no-textconv",
        "--no-ext-diff",
        "--binary",
        base_sha,
        commit_sha,
    ];
    git_diff(repo_cwd, &args).map(|patch| (base_sha.to_string(), patch))
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use tempfile::TempDir;

    /// Runs `git <args>` in `repo` and panics if it does not exit successfully.
    fn run_git(repo: &Path, args: &[&str]) {
        let status = Command::new("git")
            .args(args)
            .current_dir(repo)
            .status()
            .unwrap();
        assert!(status.success(), "git {} failed", args.join(" "));
    }

    /// Stages everything in the worktree and commits it with a fixed identity
    /// so the tests do not depend on the machine's global git config.
    fn commit_all(repo: &Path, message: &str) {
        run_git(repo, &["add", "."]);
        run_git(
            repo,
            &[
                "-c",
                "user.name=codex",
                "-c",
                "user.email=codex@example.com",
                "commit",
                "-m",
                message,
                "-q",
            ],
        );
    }

    #[test]
    fn creates_bundle_with_manifest_and_rollout() {
        let codex_home = TempDir::new().unwrap();
        let repo_dir = TempDir::new().unwrap();
        let repo_root = repo_dir.path().join("my-repo");
        std::fs::create_dir_all(&repo_root).unwrap();
        std::fs::write(repo_root.join("README.md"), "hi\n").unwrap();
        run_git(&repo_root, &["init", "-q"]);
        commit_all(&repo_root, "init");
        // Dirty the worktree so repo.patch has real content.
        std::fs::write(repo_root.join("README.md"), "changed\n").unwrap();
        let rollout_path = repo_root.join("rollout.jsonl");
        std::fs::write(&rollout_path, "line-1\nline-2\n").unwrap();
        let args = CreateEvalCaseArgs {
            codex_home: codex_home.path().to_path_buf(),
            conversation_id: "conv-1".to_string(),
            rollout_path,
            start: StartMarker {
                kind: StartMarkerKind::RolloutLineIndex,
                value: StartMarkerValue::LineIndex(1),
                display: "Start now".to_string(),
            },
            repo_cwd: repo_root.clone(),
            repo_snapshot: None,
            notes: Notes {
                what_went_wrong: "bad".to_string(),
                what_good_looks_like: "good".to_string(),
            },
            include_logs: true,
            logs_bytes: Some(b"logs".to_vec()),
        };
        let out = create_eval_case_bundle(&args).unwrap();
        assert!(!out.case_id.is_empty());
        assert!(out.path.exists());
        assert_eq!(out.path.file_name().unwrap(), out.case_id.as_str());
        assert!(out.path.starts_with(codex_home.path().join("eval-case")));
        assert!(out.case_id.contains("my-repo"));
        let manifest_text = std::fs::read_to_string(out.path.join("manifest.json")).unwrap();
        let manifest: EvalCaseManifestV0 = serde_json::from_str(&manifest_text).unwrap();
        assert_eq!(manifest.version, "v0");
        assert_eq!(manifest.conversation_id, "conv-1");
        assert_eq!(manifest.notes, args.notes);
        // A real repo must resolve HEAD; "unknown" is the failure fallback.
        assert_ne!(manifest.repo.git_base.sha, "unknown");
        assert!(manifest.artifacts.include_logs);
        assert!(out.path.join("repo.patch").exists());
        assert!(out.path.join("rollout.jsonl").exists());
        assert_eq!(
            std::fs::read(out.path.join("codex-logs.log")).unwrap(),
            b"logs".to_vec()
        );
    }

    #[test]
    fn creates_bundle_from_repo_snapshot_commit() {
        let codex_home = TempDir::new().unwrap();
        let repo_dir = TempDir::new().unwrap();
        let repo_root = repo_dir.path().join("my-repo");
        std::fs::create_dir_all(&repo_root).unwrap();
        std::fs::write(repo_root.join("README.md"), "base\n").unwrap();
        run_git(&repo_root, &["init", "-q"]);
        commit_all(&repo_root, "base");
        let base_sha = git_stdout(&repo_root, &["rev-parse", "HEAD"]).unwrap();
        // Create a snapshot commit on top of the base commit.
        std::fs::write(repo_root.join("README.md"), "snapshot\n").unwrap();
        std::fs::write(repo_root.join("snap.txt"), "snap\n").unwrap();
        commit_all(&repo_root, "snapshot");
        let snapshot_sha = git_stdout(&repo_root, &["rev-parse", "HEAD"]).unwrap();
        // Dirty the working tree after the snapshot commit; this should not affect repo.patch.
        std::fs::write(repo_root.join("README.md"), "worktree\n").unwrap();
        let rollout_path = repo_root.join("rollout.jsonl");
        std::fs::write(&rollout_path, "line-1\nline-2\n").unwrap();
        let args = CreateEvalCaseArgs {
            codex_home: codex_home.path().to_path_buf(),
            conversation_id: "conv-2".to_string(),
            rollout_path,
            start: StartMarker {
                kind: StartMarkerKind::RolloutLineIndex,
                value: StartMarkerValue::LineIndex(0),
                display: "From: test".to_string(),
            },
            repo_cwd: repo_root.clone(),
            repo_snapshot: Some(RepoSnapshot {
                base_sha: base_sha.clone(),
                commit_sha: snapshot_sha,
            }),
            notes: Notes {
                what_went_wrong: "bad".to_string(),
                what_good_looks_like: "good".to_string(),
            },
            include_logs: false,
            logs_bytes: None,
        };
        let out = create_eval_case_bundle(&args).unwrap();
        assert_eq!(out.path.file_name().unwrap(), out.case_id.as_str());
        assert!(out.path.starts_with(codex_home.path().join("eval-case")));
        assert!(out.case_id.contains("my-repo"));
        let manifest_text = std::fs::read_to_string(out.path.join("manifest.json")).unwrap();
        let manifest: EvalCaseManifestV0 = serde_json::from_str(&manifest_text).unwrap();
        assert_eq!(manifest.repo.git_base.sha, base_sha);
        let patch_text =
            String::from_utf8(std::fs::read(out.path.join("repo.patch")).unwrap()).unwrap();
        assert!(
            patch_text.contains("snapshot"),
            "patch should include snapshot commit changes"
        );
        assert!(
            !patch_text.contains("worktree"),
            "patch should not include post-snapshot working tree changes"
        );
        assert!(patch_text.contains("snap.txt"));
    }
}

View File

@@ -155,8 +155,28 @@ pub struct CodexLogSnapshot {
pub thread_id: String,
}
pub struct FeedbackAttachment {
pub filename: String,
pub content_type: Option<String>,
pub bytes: Vec<u8>,
}
impl FeedbackAttachment {
pub fn new(filename: String, content_type: Option<String>, bytes: Vec<u8>) -> Self {
Self {
filename,
content_type,
bytes,
}
}
}
impl CodexLogSnapshot {
pub(crate) fn as_bytes(&self) -> &[u8] {
pub fn from_bytes(thread_id: String, bytes: Vec<u8>) -> Self {
Self { bytes, thread_id }
}
pub fn as_bytes(&self) -> &[u8] {
&self.bytes
}
@@ -176,6 +196,25 @@ impl CodexLogSnapshot {
include_logs: bool,
rollout_path: Option<&std::path::Path>,
session_source: Option<SessionSource>,
) -> Result<()> {
self.upload_feedback_with_attachments(
classification,
reason,
include_logs,
rollout_path,
session_source,
Vec::new(),
)
}
pub fn upload_feedback_with_attachments(
&self,
classification: &str,
reason: Option<&str>,
include_logs: bool,
rollout_path: Option<&std::path::Path>,
session_source: Option<SessionSource>,
extra_attachments: Vec<FeedbackAttachment>,
) -> Result<()> {
use std::collections::BTreeMap;
use std::fs;
@@ -265,6 +304,15 @@ impl CodexLogSnapshot {
}));
}
for attachment in extra_attachments {
envelope.add_item(EnvelopeItem::Attachment(Attachment {
buffer: attachment.bytes,
filename: attachment.filename,
content_type: attachment.content_type,
ty: None,
}));
}
client.send_envelope(envelope);
client.flush(Some(Duration::from_secs(UPLOAD_TIMEOUT_SECS)));
Ok(())

View File

@@ -141,6 +141,11 @@ pub enum Op {
summary: Option<ReasoningSummaryConfig>,
},
/// Enable or disable repository snapshots (ghost commits) for the remainder of the session.
///
/// When enabled, Codex will capture per-turn git snapshots and record them in the rollout.
SetRepoSnapshotting { enabled: bool },
/// Approve a command execution
ExecApproval {
/// The id of the submission we are approving