feat(app-server): propagate traces across tasks and core ops (#14387)

## Summary

This PR keeps app-server RPC request trace context alive for the full
lifetime of the work that request kicks off (e.g. for `thread/start`,
this is `app-server rpc handler -> tokio background task -> core op
submissions`). Previously we lost trace lineage once the request handler
returned or handed work off to background tasks.

This approach is especially relevant for `thread/start` and other RPC
handlers that run in a non-blocking way. In the near future we'll most
likely want to make all app-server handlers run in a non-blocking way by
default, and only queue operations that must operate in order (e.g.
thread RPCs per thread?), so we want to make sure tracing in app-server
just generally works.

Depends on https://github.com/openai/codex/pull/14300

**Before**
<img width="155" height="207" alt="image"
src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af"
/>


**After**
<img width="299" height="337" alt="image"
src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea"
/>

## What changed

- Keep request-scoped trace context around until we send the final
response or error, or the connection closes.
- Thread that trace context through detached `thread/start` work so
background startup stays attached to the originating request.
- Pass request trace context through to downstream core operations,
including:
  - thread creation
  - resume/fork flows
  - turn submission
  - review
  - interrupt
  - realtime conversation operations
- Add tracing tests that verify:
  - remote W3C trace context is preserved for `thread/start`
  - remote W3C trace context is preserved for `turn/start`
  - downstream core spans stay under the originating request span
  - request-scoped tracing state is cleaned up correctly
- Clean up shutdown behavior so detached background tasks and spawned
threads are drained before process exit.
This commit is contained in:
Owen Lin
2026-03-11 20:18:31 -07:00
committed by GitHub
parent bf5e997b31
commit 5bc82c5b93
24 changed files with 1524 additions and 308 deletions

View File

@@ -104,6 +104,7 @@ use codex_protocol::protocol::TurnAbortReason;
use codex_protocol::protocol::TurnContextItem;
use codex_protocol::protocol::TurnContextNetworkItem;
use codex_protocol::protocol::TurnStartedEvent;
use codex_protocol::protocol::W3cTraceContext;
use codex_protocol::request_permissions::PermissionGrantScope;
use codex_protocol::request_permissions::RequestPermissionsArgs;
use codex_protocol::request_permissions::RequestPermissionsEvent;
@@ -118,6 +119,7 @@ use codex_utils_stream_parser::ProposedPlanSegment;
use codex_utils_stream_parser::extract_proposed_plan_text;
use codex_utils_stream_parser::strip_citations;
use futures::future::BoxFuture;
use futures::future::Shared;
use futures::prelude::*;
use futures::stream::FuturesOrdered;
use rmcp::model::ListResourceTemplatesResult;
@@ -330,8 +332,13 @@ pub struct Codex {
// Last known status of the agent.
pub(crate) agent_status: watch::Receiver<AgentStatus>,
pub(crate) session: Arc<Session>,
// Shared future for the background submission loop completion so multiple
// callers can wait for shutdown.
pub(crate) session_loop_termination: SessionLoopTermination,
}
/// Cloneable handle on the completion of a session's background submission
/// loop: a boxed future with output `()` wrapped in [`Shared`] so that several
/// callers can each await the same termination.
pub(crate) type SessionLoopTermination = Shared<BoxFuture<'static, ()>>;
/// Wrapper returned by [`Codex::spawn`] containing the spawned [`Codex`],
/// the submission id for the initial `ConfigureSession` request and the
/// unique session id.
@@ -342,6 +349,24 @@ pub struct CodexSpawnOk {
pub conversation_id: ThreadId,
}
/// Owned bundle of everything [`Codex::spawn`] needs to start a session.
///
/// Passing a single struct replaces the long positional parameter list that
/// `spawn` previously took.
pub(crate) struct CodexSpawnArgs {
// Session configuration; the spawn path takes it by value (bound as `mut`)
// and eventually hands it to the submission loop.
pub(crate) config: Config,
// Shared service handles, passed through to the spawned session.
pub(crate) auth_manager: Arc<AuthManager>,
pub(crate) models_manager: Arc<ModelsManager>,
pub(crate) skills_manager: Arc<SkillsManager>,
pub(crate) plugins_manager: Arc<PluginsManager>,
pub(crate) mcp_manager: Arc<McpManager>,
pub(crate) file_watcher: Arc<FileWatcher>,
// NOTE(review): presumably the history to seed the new session with
// (fresh / resumed / forked) — confirm against `InitialHistory`.
pub(crate) conversation_history: InitialHistory,
pub(crate) session_source: SessionSource,
pub(crate) agent_control: AgentControl,
pub(crate) dynamic_tools: Vec<DynamicToolSpec>,
pub(crate) persist_extended_history: bool,
pub(crate) metrics_service_name: Option<String>,
pub(crate) inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
// Optional W3C trace carrier from the originating request. `spawn`
// validates it, uses it as the parent of the `thread_spawn` span, and
// drops it with a warning when it cannot be decoded.
pub(crate) parent_trace: Option<W3cTraceContext>,
}
// NOTE(review): presumably the submission id attached to the initial
// `ConfigureSession` request mentioned on `CodexSpawnOk` — confirm at the use site.
pub(crate) const INITIAL_SUBMIT_ID: &str = "";
// Capacity of the bounded submission channel created when a session is spawned.
pub(crate) const SUBMISSION_CHANNEL_CAPACITY: usize = 512;
const CYBER_VERIFY_URL: &str = "https://chatgpt.com/cyber";
@@ -349,23 +374,48 @@ const CYBER_SAFETY_URL: &str = "https://developers.openai.com/codex/concepts/cyb
impl Codex {
/// Spawn a new [`Codex`] and initialize the session.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn spawn(
mut config: Config,
auth_manager: Arc<AuthManager>,
models_manager: Arc<ModelsManager>,
skills_manager: Arc<SkillsManager>,
plugins_manager: Arc<PluginsManager>,
mcp_manager: Arc<McpManager>,
file_watcher: Arc<FileWatcher>,
conversation_history: InitialHistory,
session_source: SessionSource,
agent_control: AgentControl,
dynamic_tools: Vec<DynamicToolSpec>,
persist_extended_history: bool,
metrics_service_name: Option<String>,
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
) -> CodexResult<CodexSpawnOk> {
pub(crate) async fn spawn(args: CodexSpawnArgs) -> CodexResult<CodexSpawnOk> {
    // Keep the caller-supplied carrier only when it decodes into a usable
    // trace context; otherwise log once and continue without a parent.
    let parent_trace = args.parent_trace.filter(|carrier| {
        let usable = codex_otel::context_from_w3c_trace_context(carrier).is_some();
        if !usable {
            warn!("ignoring invalid thread spawn trace carrier");
        }
        usable
    });

    // All of the spawn work runs inside this span so it stays attached to the
    // request that triggered it.
    let thread_spawn_span = info_span!("thread_spawn", otel.name = "thread_spawn");
    if let Some(carrier) = &parent_trace {
        // The carrier was validated above, so the result is intentionally ignored.
        let _ = set_parent_from_w3c_trace_context(&thread_spawn_span, carrier);
    }

    // Forward the validated carrier together with the untouched remaining fields.
    let validated_args = CodexSpawnArgs {
        parent_trace,
        ..args
    };
    Self::spawn_internal(validated_args)
        .instrument(thread_spawn_span)
        .await
}
async fn spawn_internal(args: CodexSpawnArgs) -> CodexResult<CodexSpawnOk> {
let CodexSpawnArgs {
mut config,
auth_manager,
models_manager,
skills_manager,
plugins_manager,
mcp_manager,
file_watcher,
conversation_history,
session_source,
agent_control,
dynamic_tools,
persist_extended_history,
metrics_service_name,
inherited_shell_snapshot,
parent_trace: _,
} = args;
let (tx_sub, rx_sub) = async_channel::bounded(SUBMISSION_CHANNEL_CAPACITY);
let (tx_event, rx_event) = async_channel::unbounded();
@@ -557,15 +607,18 @@ impl Codex {
let thread_id = session.conversation_id;
// This task will run until Op::Shutdown is received.
let session_loop_span = info_span!("session_loop", thread_id = %thread_id);
tokio::spawn(
submission_loop(Arc::clone(&session), config, rx_sub).instrument(session_loop_span),
);
let session_for_loop = Arc::clone(&session);
let session_loop_handle = tokio::spawn(async move {
submission_loop(session_for_loop, config, rx_sub)
.instrument(info_span!("session_loop", thread_id = %thread_id))
.await;
});
let codex = Codex {
tx_sub,
rx_event,
agent_status: agent_status_rx,
session,
session_loop_termination: session_loop_termination_from_handle(session_loop_handle),
};
#[allow(deprecated)]
@@ -578,11 +631,19 @@ impl Codex {
/// Submit the `op` wrapped in a `Submission` with a unique ID.
pub async fn submit(&self, op: Op) -> CodexResult<String> {
self.submit_with_trace(op, None).await
}
pub async fn submit_with_trace(
&self,
op: Op,
trace: Option<W3cTraceContext>,
) -> CodexResult<String> {
let id = Uuid::now_v7().to_string();
let sub = Submission {
id: id.clone(),
op,
trace: None,
trace,
};
self.submit_with_id(sub).await?;
Ok(id)
@@ -601,6 +662,17 @@ impl Codex {
Ok(())
}
/// Submit `Op::Shutdown` and then wait for the session loop to terminate.
///
/// # Errors
///
/// Returns the submission error, except for `CodexErr::InternalAgentDied`,
/// which is tolerated: the method still waits for the loop's termination
/// future and then reports success.
pub async fn shutdown_and_wait(&self) -> CodexResult<()> {
    // Grab the shared termination future before submitting the shutdown op.
    let loop_finished = self.session_loop_termination.clone();

    let submit_outcome = self.submit(Op::Shutdown).await;
    if let Err(err) = submit_outcome {
        let agent_already_dead = matches!(err, CodexErr::InternalAgentDied);
        if !agent_already_dead {
            return Err(err);
        }
    }

    loop_finished.await;
    Ok(())
}
pub async fn next_event(&self) -> CodexResult<Event> {
let event = self
.rx_event
@@ -648,6 +720,21 @@ impl Codex {
}
}
/// Test-only helper: a [`SessionLoopTermination`] that is already complete,
/// so awaiting it returns immediately.
#[cfg(test)]
pub(crate) fn completed_session_loop_termination() -> SessionLoopTermination {
    let already_done = async {};
    already_done.boxed().shared()
}
/// Turn the session loop's [`JoinHandle`] into a [`SessionLoopTermination`]
/// that any number of callers can clone and await.
///
/// The join result is discarded: the returned future resolves to `()` whether
/// the task returned normally or the join reported an error.
pub(crate) fn session_loop_termination_from_handle(
    handle: JoinHandle<()>,
) -> SessionLoopTermination {
    let wait_for_loop = async move {
        drop(handle.await);
    };
    wait_for_loop.boxed().shared()
}
/// Context for an initialized model agent
///
/// A session has at most 1 running task at a time, and can be interrupted by user input.