Files
codex/codex-rs/rollout-trace/src/compaction.rs
cassirer-openai 6d09b6752d [rollout_trace] Trace tool and code-mode boundaries (#18878)
## Summary

Extends rollout tracing across tool dispatch and code-mode runtime
boundaries. This records canonical tool-call lifecycle events and links
code-mode execution/wait operations back to the model-visible calls that
caused them.

## Stack

This is PR 3/5 in the rollout trace stack.

- [#18876](https://github.com/openai/codex/pull/18876): Add rollout
trace crate
- [#18877](https://github.com/openai/codex/pull/18877): Record core
session rollout traces
- [#18878](https://github.com/openai/codex/pull/18878): Trace tool and
code-mode boundaries
- [#18879](https://github.com/openai/codex/pull/18879): Trace sessions
and multi-agent edges
- [#18880](https://github.com/openai/codex/pull/18880): Add debug trace
reduction command

## Review Notes

This PR is about attribution. Reviewers should focus on whether direct
tool calls, code-mode-originated tool calls, waits, outputs, and
cancellation boundaries are recorded with enough source information for
deterministic reduction without coupling the reducer to live runtime
internals.

The stack remains valid after this layer: tool and code-mode traces
reduce through the existing crate model, while the broader session and
multi-agent relationships are added in the next PR.
2026-04-23 12:22:11 -07:00

280 lines
9.4 KiB
Rust

//! Hot-path helpers for recording upstream remote compaction attempts.
//!
//! Remote compaction is a model-facing request with a different semantic role
//! from normal sampling. Keeping the no-op capable trace handle in this crate
//! lets `codex-core` record exact endpoint payloads without owning trace schema
//! details.
use std::fmt::Display;
use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use codex_protocol::models::ResponseItem;
use serde::Serialize;
use serde_json::Value as JsonValue;
use tracing::warn;
use crate::inference::trace_response_item_json;
use crate::model::AgentThreadId;
use crate::model::CodexTurnId;
use crate::model::CompactionId;
use crate::model::CompactionRequestId;
use crate::payload::RawPayloadKind;
use crate::raw_event::RawTraceEventContext;
use crate::raw_event::RawTraceEventPayload;
use crate::writer::TraceWriter;
/// Process-wide counter that `next_compaction_request_id` increments to mint request IDs.
///
/// Starts at 1, so the first minted ID carries ordinal 1.
static NEXT_COMPACTION_REQUEST: AtomicU64 = AtomicU64::new(1);
/// Turn-local remote compaction tracing context.
///
/// A compaction can retry its upstream request before installing one checkpoint. The context
/// owns the stable checkpoint ID; each request attempt gets a separate request ID.
///
/// Build one with [`CompactionTraceContext::disabled`] or [`CompactionTraceContext::enabled`];
/// a disabled context accepts the same calls and records nothing.
#[derive(Clone, Debug)]
pub struct CompactionTraceContext {
// Either the no-op state or the writer plus identifiers needed to emit trace events.
state: CompactionTraceContextState,
}
/// Private state behind [`CompactionTraceContext`].
///
/// Keeping the enum private lets callers hold one handle type whether or not tracing is on.
#[derive(Clone, Debug)]
enum CompactionTraceContextState {
// Tracing is off: every method on the context returns without recording.
Disabled,
// Tracing is on: carries the writer and the identifiers attached to emitted events.
Enabled(EnabledCompactionTraceContext),
}
/// Data an enabled compaction context needs to write payloads and append events.
///
/// The whole struct is cloned into each attempt started from the context.
#[derive(Clone, Debug)]
struct EnabledCompactionTraceContext {
// Shared writer used for both JSON payloads and trace events.
writer: Arc<TraceWriter>,
// Thread the compaction belongs to; attached to the context of every appended event.
thread_id: AgentThreadId,
// Turn the compaction belongs to; attached to the context of every appended event.
codex_turn_id: CodexTurnId,
// Stable ID shared by all request attempts and by the installed checkpoint event.
compaction_id: CompactionId,
// Model name copied into each `CompactionRequestStarted` event.
model: String,
// Provider name copied into each `CompactionRequestStarted` event.
provider_name: String,
}
/// One upstream request attempt made while computing a compaction checkpoint.
///
/// Obtained from [`CompactionTraceContext::start_attempt`]. An attempt started from a
/// disabled context is itself disabled and records nothing.
#[derive(Clone, Debug)]
pub struct CompactionTraceAttempt {
// Either the no-op state or the parent context data plus this attempt's request ID.
state: CompactionTraceAttemptState,
}
/// Private state behind [`CompactionTraceAttempt`].
#[derive(Clone, Debug)]
enum CompactionTraceAttemptState {
// Tracing is off: every `record_*` call on the attempt returns without recording.
Disabled,
// Tracing is on: carries the context data and the request ID for this attempt.
Enabled(EnabledCompactionTraceAttempt),
}
/// Data carried by an enabled attempt: the parent context plus this attempt's own request ID.
#[derive(Clone, Debug)]
struct EnabledCompactionTraceAttempt {
// Clone of the enabled context the attempt was started from (writer and stable IDs).
context: EnabledCompactionTraceContext,
// ID minted for this attempt; included in its started, completed and failed events.
compaction_request_id: CompactionRequestId,
}
/// Response payload written when a compaction request attempt completes.
#[derive(Serialize)]
struct TracedCompactionCompleted {
// Output items converted with `trace_response_item_json`, in the order they were returned.
output_items: Vec<JsonValue>,
}
/// History replacement checkpoint persisted when compaction installs new live history.
///
/// The checkpoint keeps compaction separate from ordinary sampling snapshots:
/// `input_history` is the live thread history selected for compaction, while
/// `replacement_history` is what future prompts may carry after the checkpoint.
///
/// Both fields borrow from the caller, so building the payload copies no history items.
#[derive(Serialize)]
pub struct CompactionCheckpointTracePayload<'a> {
/// Live thread history that was selected as the input to compaction.
pub input_history: &'a [ResponseItem],
/// History that replaces the input once the checkpoint is installed.
pub replacement_history: &'a [ResponseItem],
}
impl CompactionTraceContext {
    /// Returns a context whose trace calls are accepted and discarded.
    pub fn disabled() -> Self {
        let state = CompactionTraceContextState::Disabled;
        Self { state }
    }

    /// Returns an enabled context covering the upstream attempts that compute one checkpoint.
    ///
    /// `compaction_id` is reused by every attempt and by the installed-checkpoint event;
    /// `model` and `provider_name` are copied into each attempt's started event.
    pub fn enabled(
        writer: Arc<TraceWriter>,
        thread_id: AgentThreadId,
        codex_turn_id: CodexTurnId,
        compaction_id: CompactionId,
        model: String,
        provider_name: String,
    ) -> Self {
        let enabled = EnabledCompactionTraceContext {
            writer,
            thread_id,
            codex_turn_id,
            compaction_id,
            model,
            provider_name,
        };
        Self {
            state: CompactionTraceContextState::Enabled(enabled),
        }
    }

    /// Begins a new upstream attempt and records the exact compact endpoint request.
    ///
    /// A disabled context hands back a disabled attempt without minting a request ID.
    pub fn start_attempt(&self, request: &impl Serialize) -> CompactionTraceAttempt {
        match &self.state {
            CompactionTraceContextState::Disabled => CompactionTraceAttempt::disabled(),
            CompactionTraceContextState::Enabled(enabled) => {
                let attempt_state = EnabledCompactionTraceAttempt {
                    context: enabled.clone(),
                    compaction_request_id: next_compaction_request_id(),
                };
                let attempt = CompactionTraceAttempt {
                    state: CompactionTraceAttemptState::Enabled(attempt_state),
                };
                // The started event is emitted before the caller ever sees the attempt.
                attempt.record_started(request);
                attempt
            }
        }
    }

    /// Records the moment compacted history becomes the live thread history.
    ///
    /// The checkpoint is part of the same compaction lifecycle as the compact endpoint
    /// attempts, so the event carries the context's stable compaction ID. Failures to write
    /// the payload or append the event are logged as warnings and otherwise ignored.
    pub fn record_installed(&self, checkpoint: &CompactionCheckpointTracePayload<'_>) {
        let enabled = match &self.state {
            CompactionTraceContextState::Enabled(enabled) => enabled,
            CompactionTraceContextState::Disabled => return,
        };

        let written = enabled
            .writer
            .write_json_payload(RawPayloadKind::CompactionCheckpoint, checkpoint);
        let checkpoint_payload = match written {
            Ok(payload_ref) => payload_ref,
            Err(err) => {
                // Without a payload reference there is nothing for the event to point at.
                warn!("failed to write rollout trace payload: {err:#}");
                return;
            }
        };

        let event_context = RawTraceEventContext {
            thread_id: Some(enabled.thread_id.clone()),
            codex_turn_id: Some(enabled.codex_turn_id.clone()),
        };
        let installed = RawTraceEventPayload::CompactionInstalled {
            compaction_id: enabled.compaction_id.clone(),
            checkpoint_payload,
        };
        if let Err(err) = enabled.writer.append_with_context(event_context, installed) {
            warn!("failed to append rollout trace event: {err:#}");
        }
    }
}
impl CompactionTraceAttempt {
/// Builds an attempt that records nothing.
fn disabled() -> Self {
Self {
state: CompactionTraceAttemptState::Disabled,
}
}
fn record_started(&self, request: &impl Serialize) {
let CompactionTraceAttemptState::Enabled(attempt) = &self.state else {
return;
};
let Some(request_payload) = write_json_payload_best_effort(
&attempt.context.writer,
RawPayloadKind::CompactionRequest,
request,
) else {
return;
};
append_with_context_best_effort(
&attempt.context,
RawTraceEventPayload::CompactionRequestStarted {
compaction_id: attempt.context.compaction_id.clone(),
compaction_request_id: attempt.compaction_request_id.clone(),
thread_id: attempt.context.thread_id.clone(),
codex_turn_id: attempt.context.codex_turn_id.clone(),
model: attempt.context.model.clone(),
provider_name: attempt.context.provider_name.clone(),
request_payload,
},
);
}
/// Records the non-streaming compact endpoint response payload.
///
/// Compaction responses use the same response-item preservation rules as
/// inference streams: traces are evidence, while normal ResponseItem
/// serialization is shaped for future request construction.
pub fn record_completed(&self, output_items: &[ResponseItem]) {
let CompactionTraceAttemptState::Enabled(attempt) = &self.state else {
return;
};
let response_payload = TracedCompactionCompleted {
output_items: output_items.iter().map(trace_response_item_json).collect(),
};
let Some(response_payload) = write_json_payload_best_effort(
&attempt.context.writer,
RawPayloadKind::CompactionResponse,
&response_payload,
) else {
return;
};
append_with_context_best_effort(
&attempt.context,
RawTraceEventPayload::CompactionRequestCompleted {
compaction_id: attempt.context.compaction_id.clone(),
compaction_request_id: attempt.compaction_request_id.clone(),
response_payload,
},
);
}
/// Records the compact endpoint result without forcing callers to branch on trace events.
pub fn record_result<E: Display>(&self, result: Result<&[ResponseItem], E>) {
match result {
Ok(output_items) => self.record_completed(output_items),
Err(err) => self.record_failed(err),
}
}
/// Records pre-response failures from the compact endpoint.
pub fn record_failed(&self, error: impl Display) {
let CompactionTraceAttemptState::Enabled(attempt) = &self.state else {
return;
};
append_with_context_best_effort(
&attempt.context,
RawTraceEventPayload::CompactionRequestFailed {
compaction_id: attempt.context.compaction_id.clone(),
compaction_request_id: attempt.compaction_request_id.clone(),
error: error.to_string(),
},
);
}
}
/// Mints the next compaction request ID from the process-wide counter.
///
/// IDs have the form `compaction_request:<ordinal>`, where the ordinal is the counter value
/// observed before this call incremented it.
fn next_compaction_request_id() -> CompactionRequestId {
    let sequence = NEXT_COMPACTION_REQUEST.fetch_add(1, Ordering::Relaxed);
    format!("compaction_request:{sequence}")
}
fn write_json_payload_best_effort(
writer: &TraceWriter,
kind: RawPayloadKind,
payload: &impl Serialize,
) -> Option<crate::RawPayloadRef> {
writer.write_json_payload(kind, payload).ok()
}
fn append_with_context_best_effort(
context: &EnabledCompactionTraceContext,
payload: RawTraceEventPayload,
) {
let event_context = RawTraceEventContext {
thread_id: Some(context.thread_id.clone()),
codex_turn_id: Some(context.codex_turn_id.clone()),
};
let _ = context.writer.append_with_context(event_context, payload);
}