feat(app-server): propagate traces across tasks and core ops (#14387)

## Summary

This PR keeps app-server RPC request trace context alive for the full
lifetime of the work that request kicks off (e.g. for `thread/start`,
this is `app-server rpc handler -> tokio background task -> core op
submissions`). Previously we lost trace lineage once the request handler
returned or handed work off to background tasks.

This approach is especially relevant for `thread/start` and other RPC
handlers that run in a non-blocking way. In the near future we'll most
likely want to make all app-server handlers run in a non-blocking way by
default, and only queue operations that must operate in order (e.g.
thread RPCs per thread?), so we want to make sure tracing in app-server
just generally works.

Depends on https://github.com/openai/codex/pull/14300

**Before**
<img width="155" height="207" alt="image"
src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af"
/>


**After**
<img width="299" height="337" alt="image"
src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea"
/>

## What changed

- Keep request-scoped trace context around until we send the final
response or error, or the connection closes.
- Thread that trace context through detached `thread/start` work so
background startup stays attached to the originating request.
- Pass request trace context through to downstream core operations,
including:
  - thread creation
  - resume/fork flows
  - turn submission
  - review
  - interrupt
  - realtime conversation operations
- Add tracing tests that verify:
  - remote W3C trace context is preserved for `thread/start`
  - remote W3C trace context is preserved for `turn/start`
  - downstream core spans stay under the originating request span
  - request-scoped tracing state is cleaned up correctly
- Clean up shutdown behavior so detached background tasks and spawned
threads are drained before process exit.
This commit is contained in:
Owen Lin
2026-03-11 20:18:31 -07:00
committed by GitHub
parent bf5e997b31
commit 5bc82c5b93
24 changed files with 1524 additions and 308 deletions

View File

@@ -13,6 +13,7 @@ use crate::outgoing_message::ConnectionId;
use crate::outgoing_message::ConnectionRequestId;
use crate::outgoing_message::OutgoingMessageSender;
use crate::outgoing_message::OutgoingNotification;
use crate::outgoing_message::RequestContext;
use crate::outgoing_message::ThreadScopedOutgoingMessageSender;
use crate::thread_status::ThreadWatchManager;
use crate::thread_status::resolve_thread_status;
@@ -203,6 +204,7 @@ use codex_core::connectors::filter_disallowed_connectors;
use codex_core::connectors::merge_plugin_apps;
use codex_core::default_client::set_default_client_residency_requirement;
use codex_core::error::CodexErr;
use codex_core::error::Result as CodexResult;
use codex_core::exec::ExecExpiration;
use codex_core::exec::ExecParams;
use codex_core::exec_env::create_env;
@@ -269,6 +271,7 @@ use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::SessionConfiguredEvent;
use codex_protocol::protocol::SessionMetaLine;
use codex_protocol::protocol::USER_MESSAGE_BEGIN;
use codex_protocol::protocol::W3cTraceContext;
use codex_protocol::user_input::MAX_USER_INPUT_TEXT_CHARS;
use codex_protocol::user_input::UserInput as CoreInputItem;
use codex_rmcp_client::perform_oauth_login_return_url;
@@ -296,7 +299,9 @@ use tokio::sync::broadcast;
use tokio::sync::oneshot;
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tokio_util::task::TaskTracker;
use toml::Value as TomlValue;
use tracing::Instrument;
use tracing::error;
use tracing::info;
use tracing::warn;
@@ -386,6 +391,7 @@ pub(crate) struct CodexMessageProcessor {
command_exec_manager: CommandExecManager,
pending_fuzzy_searches: Arc<Mutex<HashMap<String, Arc<AtomicBool>>>>,
fuzzy_search_sessions: Arc<Mutex<HashMap<String, FuzzyFileSearchSession>>>,
background_tasks: TaskTracker,
feedback: CodexFeedback,
log_db: Option<LogDbLayer>,
}
@@ -500,6 +506,7 @@ impl CodexMessageProcessor {
command_exec_manager: CommandExecManager::default(),
pending_fuzzy_searches: Arc::new(Mutex::new(HashMap::new())),
fuzzy_search_sessions: Arc::new(Mutex::new(HashMap::new())),
background_tasks: TaskTracker::new(),
feedback,
log_db,
}
@@ -620,6 +627,7 @@ impl CodexMessageProcessor {
connection_id: ConnectionId,
request: ClientRequest,
app_server_client_name: Option<String>,
request_context: RequestContext,
) {
let to_connection_request_id = |request_id| ConnectionRequestId {
connection_id,
@@ -632,8 +640,12 @@ impl CodexMessageProcessor {
}
// === v2 Thread/Turn APIs ===
ClientRequest::ThreadStart { request_id, params } => {
self.thread_start(to_connection_request_id(request_id), params)
.await;
self.thread_start(
to_connection_request_id(request_id),
params,
request_context,
)
.await;
}
ClientRequest::ThreadUnsubscribe { request_id, params } => {
self.thread_unsubscribe(to_connection_request_id(request_id), params)
@@ -1806,7 +1818,12 @@ impl CodexMessageProcessor {
}
}
async fn thread_start(&self, request_id: ConnectionRequestId, params: ThreadStartParams) {
async fn thread_start(
&self,
request_id: ConnectionRequestId,
params: ThreadStartParams,
request_context: RequestContext,
) {
let ThreadStartParams {
model,
model_provider,
@@ -1847,8 +1864,8 @@ impl CodexMessageProcessor {
fallback_model_provider: self.config.model_provider_id.clone(),
codex_home: self.config.codex_home.clone(),
};
tokio::spawn(async move {
let request_trace = request_context.request_trace();
let thread_start_task = async move {
Self::thread_start_task(
listener_task_context,
cli_overrides,
@@ -1860,9 +1877,53 @@ impl CodexMessageProcessor {
persist_extended_history,
service_name,
experimental_raw_events,
request_trace,
)
.await;
});
};
self.background_tasks
.spawn(thread_start_task.instrument(request_context.span()));
}
/// Close the background-task tracker and wait for all detached tasks
/// (e.g. `thread/start` work spawned via `background_tasks.spawn`) to finish.
///
/// Bounded at 10 seconds; on timeout this logs a warning and returns rather
/// than blocking process shutdown indefinitely.
pub(crate) async fn drain_background_tasks(&self) {
// Closing the tracker stops accepting new spawns so `wait()` can complete.
self.background_tasks.close();
if tokio::time::timeout(Duration::from_secs(10), self.background_tasks.wait())
.await
.is_err()
{
warn!("timed out waiting for background tasks to shut down; proceeding");
}
}
/// Ask the thread manager to shut down all threads, bounded at 10 seconds,
/// then log any threads that either rejected the shutdown submission or
/// failed to stop within the deadline. Best-effort: failures are only warned.
pub(crate) async fn shutdown_threads(&self) {
let report = self
.thread_manager
.shutdown_all_threads_bounded(Duration::from_secs(10))
.await;
// Threads that never accepted the Shutdown op.
for thread_id in report.submit_failed {
warn!("failed to submit Shutdown to thread {thread_id}");
}
// Threads that accepted Shutdown but did not stop in time.
for thread_id in report.timed_out {
warn!("timed out waiting for thread {thread_id} to shut down");
}
}
/// Look up the W3C trace context recorded for the given in-flight request
/// on the outgoing-message side, if one is still tracked.
///
/// Returns `None` when no trace context is associated with the request
/// (e.g. the client sent none, or the request's tracing state was cleaned up).
async fn request_trace_context(
&self,
request_id: &ConnectionRequestId,
) -> Option<codex_protocol::protocol::W3cTraceContext> {
self.outgoing.request_trace_context(request_id).await
}
/// Submit a core `Op` to `thread` on behalf of `request_id`, attaching the
/// request's trace context (when available) so downstream core spans stay
/// parented under the originating RPC request span.
///
/// Returns the submission id from the underlying `submit_with_trace` call.
async fn submit_core_op(
&self,
request_id: &ConnectionRequestId,
thread: &CodexThread,
op: Op,
) -> CodexResult<String> {
thread
.submit_with_trace(op, self.request_trace_context(request_id).await)
.await
}
#[allow(clippy::too_many_arguments)]
@@ -1877,6 +1938,7 @@ impl CodexMessageProcessor {
persist_extended_history: bool,
service_name: Option<String>,
experimental_raw_events: bool,
request_trace: Option<W3cTraceContext>,
) {
let config = match derive_config_from_params(
&cli_overrides,
@@ -1934,6 +1996,7 @@ impl CodexMessageProcessor {
core_dynamic_tools,
persist_extended_history,
service_name,
request_trace,
)
.await
{
@@ -2199,7 +2262,10 @@ impl CodexMessageProcessor {
};
if let Ok(thread) = self.thread_manager.get_thread(thread_id).await {
if let Err(err) = thread.submit(Op::SetThreadName { name }).await {
if let Err(err) = self
.submit_core_op(&request_id, thread.as_ref(), Op::SetThreadName { name })
.await
{
self.send_internal_error(request_id, format!("failed to set thread name: {err}"))
.await;
return;
@@ -2784,7 +2850,14 @@ impl CodexMessageProcessor {
return;
}
if let Err(err) = thread.submit(Op::ThreadRollback { num_turns }).await {
if let Err(err) = self
.submit_core_op(
&request_id,
thread.as_ref(),
Op::ThreadRollback { num_turns },
)
.await
{
// No ThreadRollback event will arrive if an error occurs.
// Clean up and reply immediately.
let thread_state = self.thread_state_manager.thread_state(thread_id).await;
@@ -2812,7 +2885,10 @@ impl CodexMessageProcessor {
}
};
match thread.submit(Op::Compact).await {
match self
.submit_core_op(&request_id, thread.as_ref(), Op::Compact)
.await
{
Ok(_) => {
self.outgoing
.send_response(request_id, ThreadCompactStartResponse {})
@@ -2840,7 +2916,10 @@ impl CodexMessageProcessor {
}
};
match thread.submit(Op::CleanBackgroundTerminals).await {
match self
.submit_core_op(&request_id, thread.as_ref(), Op::CleanBackgroundTerminals)
.await
{
Ok(_) => {
self.outgoing
.send_response(request_id, ThreadBackgroundTerminalsCleanResponse {})
@@ -3298,6 +3377,7 @@ impl CodexMessageProcessor {
thread_history,
self.auth_manager.clone(),
persist_extended_history,
self.request_trace_context(&request_id).await,
)
.await
{
@@ -3823,6 +3903,7 @@ impl CodexMessageProcessor {
config,
rollout_path.clone(),
persist_extended_history,
self.request_trace_context(&request_id).await,
)
.await
{
@@ -4694,26 +4775,10 @@ impl CodexMessageProcessor {
}
async fn wait_for_thread_shutdown(thread: &Arc<CodexThread>) -> ThreadShutdownResult {
match thread.submit(Op::Shutdown).await {
Ok(_) => {
let wait_for_shutdown = async {
loop {
if matches!(thread.agent_status().await, AgentStatus::Shutdown) {
break;
}
tokio::time::sleep(Duration::from_millis(50)).await;
}
};
if tokio::time::timeout(Duration::from_secs(10), wait_for_shutdown)
.await
.is_err()
{
ThreadShutdownResult::TimedOut
} else {
ThreadShutdownResult::Complete
}
}
Err(_) => ThreadShutdownResult::SubmitFailed,
match tokio::time::timeout(Duration::from_secs(10), thread.shutdown_and_wait()).await {
Ok(Ok(())) => ThreadShutdownResult::Complete,
Ok(Err(_)) => ThreadShutdownResult::SubmitFailed,
Err(_) => ThreadShutdownResult::TimedOut,
}
}
@@ -5799,28 +5864,36 @@ impl CodexMessageProcessor {
// If any overrides are provided, update the session turn context first.
if has_any_overrides {
let _ = thread
.submit(Op::OverrideTurnContext {
cwd: params.cwd,
approval_policy: params.approval_policy.map(AskForApproval::to_core),
sandbox_policy: params.sandbox_policy.map(|p| p.to_core()),
windows_sandbox_level: None,
model: params.model,
effort: params.effort.map(Some),
summary: params.summary,
service_tier: params.service_tier,
collaboration_mode,
personality: params.personality,
})
let _ = self
.submit_core_op(
&request_id,
thread.as_ref(),
Op::OverrideTurnContext {
cwd: params.cwd,
approval_policy: params.approval_policy.map(AskForApproval::to_core),
sandbox_policy: params.sandbox_policy.map(|p| p.to_core()),
windows_sandbox_level: None,
model: params.model,
effort: params.effort.map(Some),
summary: params.summary,
service_tier: params.service_tier,
collaboration_mode,
personality: params.personality,
},
)
.await;
}
// Start the turn by submitting the user input. Return its submission id as turn_id.
let turn_id = thread
.submit(Op::UserInput {
items: mapped_items,
final_output_json_schema: params.output_schema,
})
let turn_id = self
.submit_core_op(
&request_id,
thread.as_ref(),
Op::UserInput {
items: mapped_items,
final_output_json_schema: params.output_schema,
},
)
.await;
match turn_id {
@@ -5977,11 +6050,15 @@ impl CodexMessageProcessor {
return;
};
let submit = thread
.submit(Op::RealtimeConversationStart(ConversationStartParams {
prompt: params.prompt,
session_id: params.session_id,
}))
let submit = self
.submit_core_op(
&request_id,
thread.as_ref(),
Op::RealtimeConversationStart(ConversationStartParams {
prompt: params.prompt,
session_id: params.session_id,
}),
)
.await;
match submit {
@@ -6012,10 +6089,14 @@ impl CodexMessageProcessor {
return;
};
let submit = thread
.submit(Op::RealtimeConversationAudio(ConversationAudioParams {
frame: params.audio.into(),
}))
let submit = self
.submit_core_op(
&request_id,
thread.as_ref(),
Op::RealtimeConversationAudio(ConversationAudioParams {
frame: params.audio.into(),
}),
)
.await;
match submit {
@@ -6046,10 +6127,12 @@ impl CodexMessageProcessor {
return;
};
let submit = thread
.submit(Op::RealtimeConversationText(ConversationTextParams {
text: params.text,
}))
let submit = self
.submit_core_op(
&request_id,
thread.as_ref(),
Op::RealtimeConversationText(ConversationTextParams { text: params.text }),
)
.await;
match submit {
@@ -6080,7 +6163,9 @@ impl CodexMessageProcessor {
return;
};
let submit = thread.submit(Op::RealtimeConversationClose).await;
let submit = self
.submit_core_op(&request_id, thread.as_ref(), Op::RealtimeConversationClose)
.await;
match submit {
Ok(_) => {
@@ -6143,7 +6228,13 @@ impl CodexMessageProcessor {
display_text: &str,
parent_thread_id: String,
) -> std::result::Result<(), JSONRPCErrorError> {
let turn_id = parent_thread.submit(Op::Review { review_request }).await;
let turn_id = self
.submit_core_op(
request_id,
parent_thread.as_ref(),
Op::Review { review_request },
)
.await;
match turn_id {
Ok(turn_id) => {
@@ -6197,7 +6288,13 @@ impl CodexMessageProcessor {
..
} = self
.thread_manager
.fork_thread(usize::MAX, config, rollout_path, false)
.fork_thread(
usize::MAX,
config,
rollout_path,
false,
self.request_trace_context(request_id).await,
)
.await
.map_err(|err| JSONRPCErrorError {
code: INTERNAL_ERROR_CODE,
@@ -6252,8 +6349,12 @@ impl CodexMessageProcessor {
);
}
let turn_id = review_thread
.submit(Op::Review { review_request })
let turn_id = self
.submit_core_op(
request_id,
review_thread.as_ref(),
Op::Review { review_request },
)
.await
.map_err(|err| JSONRPCErrorError {
code: INTERNAL_ERROR_CODE,
@@ -6351,7 +6452,9 @@ impl CodexMessageProcessor {
}
// Submit the interrupt; we'll respond upon TurnAborted.
let _ = thread.submit(Op::Interrupt).await;
let _ = self
.submit_core_op(&request_id, thread.as_ref(), Op::Interrupt)
.await;
}
async fn ensure_conversation_listener(