mirror of
https://github.com/openai/codex.git
synced 2026-05-05 11:57:33 +00:00
feat(app-server): propagate traces across tasks and core ops (#14387)
## Summary This PR keeps app-server RPC request trace context alive for the full lifetime of the work that request kicks off (e.g. for `thread/start`, this is `app-server rpc handler -> tokio background task -> core op submissions`). Previously we lose trace lineage once the request handler returns or hands work off to background tasks. This approach is especially relevant for `thread/start` and other RPC handlers that run in a non-blocking way. In the near future we'll most likely want to make all app-server handlers run in a non-blocking way by default, and only queue operations that must operate in order (e.g. thread RPCs per thread?), so we want to make sure tracing in app-server just generally works. Depends on https://github.com/openai/codex/pull/14300 **Before** <img width="155" height="207" alt="image" src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af" /> **After** <img width="299" height="337" alt="image" src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea" /> ## What changed - Keep request-scoped trace context around until we send the final response or error, or the connection closes. - Thread that trace context through detached `thread/start` work so background startup stays attached to the originating request. - Pass request trace context through to downstream core operations, including: - thread creation - resume/fork flows - turn submission - review - interrupt - realtime conversation operations - Add tracing tests that verify: - remote W3C trace context is preserved for `thread/start` - remote W3C trace context is preserved for `turn/start` - downstream core spans stay under the originating request span - request-scoped tracing state is cleaned up correctly - Clean up shutdown behavior so detached background tasks and spawned threads are drained before process exit.
This commit is contained in:
@@ -104,6 +104,7 @@ use codex_protocol::protocol::TurnAbortReason;
|
||||
use codex_protocol::protocol::TurnContextItem;
|
||||
use codex_protocol::protocol::TurnContextNetworkItem;
|
||||
use codex_protocol::protocol::TurnStartedEvent;
|
||||
use codex_protocol::protocol::W3cTraceContext;
|
||||
use codex_protocol::request_permissions::PermissionGrantScope;
|
||||
use codex_protocol::request_permissions::RequestPermissionsArgs;
|
||||
use codex_protocol::request_permissions::RequestPermissionsEvent;
|
||||
@@ -118,6 +119,7 @@ use codex_utils_stream_parser::ProposedPlanSegment;
|
||||
use codex_utils_stream_parser::extract_proposed_plan_text;
|
||||
use codex_utils_stream_parser::strip_citations;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::future::Shared;
|
||||
use futures::prelude::*;
|
||||
use futures::stream::FuturesOrdered;
|
||||
use rmcp::model::ListResourceTemplatesResult;
|
||||
@@ -330,8 +332,13 @@ pub struct Codex {
|
||||
// Last known status of the agent.
|
||||
pub(crate) agent_status: watch::Receiver<AgentStatus>,
|
||||
pub(crate) session: Arc<Session>,
|
||||
// Shared future for the background submission loop completion so multiple
|
||||
// callers can wait for shutdown.
|
||||
pub(crate) session_loop_termination: SessionLoopTermination,
|
||||
}
|
||||
|
||||
pub(crate) type SessionLoopTermination = Shared<BoxFuture<'static, ()>>;
|
||||
|
||||
/// Wrapper returned by [`Codex::spawn`] containing the spawned [`Codex`],
|
||||
/// the submission id for the initial `ConfigureSession` request and the
|
||||
/// unique session id.
|
||||
@@ -342,6 +349,24 @@ pub struct CodexSpawnOk {
|
||||
pub conversation_id: ThreadId,
|
||||
}
|
||||
|
||||
pub(crate) struct CodexSpawnArgs {
|
||||
pub(crate) config: Config,
|
||||
pub(crate) auth_manager: Arc<AuthManager>,
|
||||
pub(crate) models_manager: Arc<ModelsManager>,
|
||||
pub(crate) skills_manager: Arc<SkillsManager>,
|
||||
pub(crate) plugins_manager: Arc<PluginsManager>,
|
||||
pub(crate) mcp_manager: Arc<McpManager>,
|
||||
pub(crate) file_watcher: Arc<FileWatcher>,
|
||||
pub(crate) conversation_history: InitialHistory,
|
||||
pub(crate) session_source: SessionSource,
|
||||
pub(crate) agent_control: AgentControl,
|
||||
pub(crate) dynamic_tools: Vec<DynamicToolSpec>,
|
||||
pub(crate) persist_extended_history: bool,
|
||||
pub(crate) metrics_service_name: Option<String>,
|
||||
pub(crate) inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
|
||||
pub(crate) parent_trace: Option<W3cTraceContext>,
|
||||
}
|
||||
|
||||
pub(crate) const INITIAL_SUBMIT_ID: &str = "";
|
||||
pub(crate) const SUBMISSION_CHANNEL_CAPACITY: usize = 512;
|
||||
const CYBER_VERIFY_URL: &str = "https://chatgpt.com/cyber";
|
||||
@@ -349,23 +374,48 @@ const CYBER_SAFETY_URL: &str = "https://developers.openai.com/codex/concepts/cyb
|
||||
|
||||
impl Codex {
|
||||
/// Spawn a new [`Codex`] and initialize the session.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn spawn(
|
||||
mut config: Config,
|
||||
auth_manager: Arc<AuthManager>,
|
||||
models_manager: Arc<ModelsManager>,
|
||||
skills_manager: Arc<SkillsManager>,
|
||||
plugins_manager: Arc<PluginsManager>,
|
||||
mcp_manager: Arc<McpManager>,
|
||||
file_watcher: Arc<FileWatcher>,
|
||||
conversation_history: InitialHistory,
|
||||
session_source: SessionSource,
|
||||
agent_control: AgentControl,
|
||||
dynamic_tools: Vec<DynamicToolSpec>,
|
||||
persist_extended_history: bool,
|
||||
metrics_service_name: Option<String>,
|
||||
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
|
||||
) -> CodexResult<CodexSpawnOk> {
|
||||
pub(crate) async fn spawn(args: CodexSpawnArgs) -> CodexResult<CodexSpawnOk> {
|
||||
let parent_trace = match args.parent_trace {
|
||||
Some(trace) => {
|
||||
if codex_otel::context_from_w3c_trace_context(&trace).is_some() {
|
||||
Some(trace)
|
||||
} else {
|
||||
warn!("ignoring invalid thread spawn trace carrier");
|
||||
None
|
||||
}
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
let thread_spawn_span = info_span!("thread_spawn", otel.name = "thread_spawn");
|
||||
if let Some(trace) = parent_trace.as_ref() {
|
||||
let _ = set_parent_from_w3c_trace_context(&thread_spawn_span, trace);
|
||||
}
|
||||
Self::spawn_internal(CodexSpawnArgs {
|
||||
parent_trace,
|
||||
..args
|
||||
})
|
||||
.instrument(thread_spawn_span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn spawn_internal(args: CodexSpawnArgs) -> CodexResult<CodexSpawnOk> {
|
||||
let CodexSpawnArgs {
|
||||
mut config,
|
||||
auth_manager,
|
||||
models_manager,
|
||||
skills_manager,
|
||||
plugins_manager,
|
||||
mcp_manager,
|
||||
file_watcher,
|
||||
conversation_history,
|
||||
session_source,
|
||||
agent_control,
|
||||
dynamic_tools,
|
||||
persist_extended_history,
|
||||
metrics_service_name,
|
||||
inherited_shell_snapshot,
|
||||
parent_trace: _,
|
||||
} = args;
|
||||
let (tx_sub, rx_sub) = async_channel::bounded(SUBMISSION_CHANNEL_CAPACITY);
|
||||
let (tx_event, rx_event) = async_channel::unbounded();
|
||||
|
||||
@@ -557,15 +607,18 @@ impl Codex {
|
||||
let thread_id = session.conversation_id;
|
||||
|
||||
// This task will run until Op::Shutdown is received.
|
||||
let session_loop_span = info_span!("session_loop", thread_id = %thread_id);
|
||||
tokio::spawn(
|
||||
submission_loop(Arc::clone(&session), config, rx_sub).instrument(session_loop_span),
|
||||
);
|
||||
let session_for_loop = Arc::clone(&session);
|
||||
let session_loop_handle = tokio::spawn(async move {
|
||||
submission_loop(session_for_loop, config, rx_sub)
|
||||
.instrument(info_span!("session_loop", thread_id = %thread_id))
|
||||
.await;
|
||||
});
|
||||
let codex = Codex {
|
||||
tx_sub,
|
||||
rx_event,
|
||||
agent_status: agent_status_rx,
|
||||
session,
|
||||
session_loop_termination: session_loop_termination_from_handle(session_loop_handle),
|
||||
};
|
||||
|
||||
#[allow(deprecated)]
|
||||
@@ -578,11 +631,19 @@ impl Codex {
|
||||
|
||||
/// Submit the `op` wrapped in a `Submission` with a unique ID.
|
||||
pub async fn submit(&self, op: Op) -> CodexResult<String> {
|
||||
self.submit_with_trace(op, None).await
|
||||
}
|
||||
|
||||
pub async fn submit_with_trace(
|
||||
&self,
|
||||
op: Op,
|
||||
trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<String> {
|
||||
let id = Uuid::now_v7().to_string();
|
||||
let sub = Submission {
|
||||
id: id.clone(),
|
||||
op,
|
||||
trace: None,
|
||||
trace,
|
||||
};
|
||||
self.submit_with_id(sub).await?;
|
||||
Ok(id)
|
||||
@@ -601,6 +662,17 @@ impl Codex {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn shutdown_and_wait(&self) -> CodexResult<()> {
|
||||
let session_loop_termination = self.session_loop_termination.clone();
|
||||
match self.submit(Op::Shutdown).await {
|
||||
Ok(_) => {}
|
||||
Err(CodexErr::InternalAgentDied) => {}
|
||||
Err(err) => return Err(err),
|
||||
}
|
||||
session_loop_termination.await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn next_event(&self) -> CodexResult<Event> {
|
||||
let event = self
|
||||
.rx_event
|
||||
@@ -648,6 +720,21 @@ impl Codex {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn completed_session_loop_termination() -> SessionLoopTermination {
|
||||
futures::future::ready(()).boxed().shared()
|
||||
}
|
||||
|
||||
pub(crate) fn session_loop_termination_from_handle(
|
||||
handle: JoinHandle<()>,
|
||||
) -> SessionLoopTermination {
|
||||
async move {
|
||||
let _ = handle.await;
|
||||
}
|
||||
.boxed()
|
||||
.shared()
|
||||
}
|
||||
|
||||
/// Context for an initialized model agent
|
||||
///
|
||||
/// A session has at most 1 running task at a time, and can be interrupted by user input.
|
||||
|
||||
Reference in New Issue
Block a user