mirror of
https://github.com/openai/codex.git
synced 2026-05-01 01:47:18 +00:00
feat(app-server): propagate traces across tasks and core ops (#14387)
## Summary This PR keeps app-server RPC request trace context alive for the full lifetime of the work that request kicks off (e.g. for `thread/start`, this is `app-server rpc handler -> tokio background task -> core op submissions`). Previously we lost trace lineage once the request handler returned or handed work off to background tasks. This approach is especially relevant for `thread/start` and other RPC handlers that run in a non-blocking way. In the near future we'll most likely want to make all app-server handlers run in a non-blocking way by default, and only queue operations that must operate in order (e.g. thread RPCs per thread?), so we want to make sure tracing in app-server just generally works. Depends on https://github.com/openai/codex/pull/14300 **Before** <img width="155" height="207" alt="image" src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af" /> **After** <img width="299" height="337" alt="image" src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea" /> ## What changed - Keep request-scoped trace context around until we send the final response or error, or the connection closes. - Thread that trace context through detached `thread/start` work so background startup stays attached to the originating request. - Pass request trace context through to downstream core operations, including: - thread creation - resume/fork flows - turn submission - review - interrupt - realtime conversation operations - Add tracing tests that verify: - remote W3C trace context is preserved for `thread/start` - remote W3C trace context is preserved for `turn/start` - downstream core spans stay under the originating request span - request-scoped tracing state is cleaned up correctly - Clean up shutdown behavior so detached background tasks and spawned threads are drained before process exit.
This commit is contained in:
@@ -13,6 +13,7 @@ use crate::outgoing_message::ConnectionId;
|
||||
use crate::outgoing_message::ConnectionRequestId;
|
||||
use crate::outgoing_message::OutgoingMessageSender;
|
||||
use crate::outgoing_message::OutgoingNotification;
|
||||
use crate::outgoing_message::RequestContext;
|
||||
use crate::outgoing_message::ThreadScopedOutgoingMessageSender;
|
||||
use crate::thread_status::ThreadWatchManager;
|
||||
use crate::thread_status::resolve_thread_status;
|
||||
@@ -203,6 +204,7 @@ use codex_core::connectors::filter_disallowed_connectors;
|
||||
use codex_core::connectors::merge_plugin_apps;
|
||||
use codex_core::default_client::set_default_client_residency_requirement;
|
||||
use codex_core::error::CodexErr;
|
||||
use codex_core::error::Result as CodexResult;
|
||||
use codex_core::exec::ExecExpiration;
|
||||
use codex_core::exec::ExecParams;
|
||||
use codex_core::exec_env::create_env;
|
||||
@@ -269,6 +271,7 @@ use codex_protocol::protocol::RolloutItem;
|
||||
use codex_protocol::protocol::SessionConfiguredEvent;
|
||||
use codex_protocol::protocol::SessionMetaLine;
|
||||
use codex_protocol::protocol::USER_MESSAGE_BEGIN;
|
||||
use codex_protocol::protocol::W3cTraceContext;
|
||||
use codex_protocol::user_input::MAX_USER_INPUT_TEXT_CHARS;
|
||||
use codex_protocol::user_input::UserInput as CoreInputItem;
|
||||
use codex_rmcp_client::perform_oauth_login_return_url;
|
||||
@@ -296,7 +299,9 @@ use tokio::sync::broadcast;
|
||||
use tokio::sync::oneshot;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tokio_util::task::TaskTracker;
|
||||
use toml::Value as TomlValue;
|
||||
use tracing::Instrument;
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
@@ -386,6 +391,7 @@ pub(crate) struct CodexMessageProcessor {
|
||||
command_exec_manager: CommandExecManager,
|
||||
pending_fuzzy_searches: Arc<Mutex<HashMap<String, Arc<AtomicBool>>>>,
|
||||
fuzzy_search_sessions: Arc<Mutex<HashMap<String, FuzzyFileSearchSession>>>,
|
||||
background_tasks: TaskTracker,
|
||||
feedback: CodexFeedback,
|
||||
log_db: Option<LogDbLayer>,
|
||||
}
|
||||
@@ -500,6 +506,7 @@ impl CodexMessageProcessor {
|
||||
command_exec_manager: CommandExecManager::default(),
|
||||
pending_fuzzy_searches: Arc::new(Mutex::new(HashMap::new())),
|
||||
fuzzy_search_sessions: Arc::new(Mutex::new(HashMap::new())),
|
||||
background_tasks: TaskTracker::new(),
|
||||
feedback,
|
||||
log_db,
|
||||
}
|
||||
@@ -620,6 +627,7 @@ impl CodexMessageProcessor {
|
||||
connection_id: ConnectionId,
|
||||
request: ClientRequest,
|
||||
app_server_client_name: Option<String>,
|
||||
request_context: RequestContext,
|
||||
) {
|
||||
let to_connection_request_id = |request_id| ConnectionRequestId {
|
||||
connection_id,
|
||||
@@ -632,8 +640,12 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
// === v2 Thread/Turn APIs ===
|
||||
ClientRequest::ThreadStart { request_id, params } => {
|
||||
self.thread_start(to_connection_request_id(request_id), params)
|
||||
.await;
|
||||
self.thread_start(
|
||||
to_connection_request_id(request_id),
|
||||
params,
|
||||
request_context,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
ClientRequest::ThreadUnsubscribe { request_id, params } => {
|
||||
self.thread_unsubscribe(to_connection_request_id(request_id), params)
|
||||
@@ -1806,7 +1818,12 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
async fn thread_start(&self, request_id: ConnectionRequestId, params: ThreadStartParams) {
|
||||
async fn thread_start(
|
||||
&self,
|
||||
request_id: ConnectionRequestId,
|
||||
params: ThreadStartParams,
|
||||
request_context: RequestContext,
|
||||
) {
|
||||
let ThreadStartParams {
|
||||
model,
|
||||
model_provider,
|
||||
@@ -1847,8 +1864,8 @@ impl CodexMessageProcessor {
|
||||
fallback_model_provider: self.config.model_provider_id.clone(),
|
||||
codex_home: self.config.codex_home.clone(),
|
||||
};
|
||||
|
||||
tokio::spawn(async move {
|
||||
let request_trace = request_context.request_trace();
|
||||
let thread_start_task = async move {
|
||||
Self::thread_start_task(
|
||||
listener_task_context,
|
||||
cli_overrides,
|
||||
@@ -1860,9 +1877,53 @@ impl CodexMessageProcessor {
|
||||
persist_extended_history,
|
||||
service_name,
|
||||
experimental_raw_events,
|
||||
request_trace,
|
||||
)
|
||||
.await;
|
||||
});
|
||||
};
|
||||
self.background_tasks
|
||||
.spawn(thread_start_task.instrument(request_context.span()));
|
||||
}
|
||||
|
||||
/// Closes the `background_tasks` tracker and waits for the tasks it tracks.
///
/// The wait is wrapped in a 10-second `tokio::time::timeout`. If the timeout
/// elapses, a warning is logged and the method returns anyway, so callers are
/// never blocked for longer than that bound by this method.
// NOTE(review): presumably `wait()` can only resolve once the tracker has been
// closed, which is why `close()` is called first — confirm against the
// `TaskTracker` documentation.
pub(crate) async fn drain_background_tasks(&self) {
|
||||
self.background_tasks.close();
|
||||
if tokio::time::timeout(Duration::from_secs(10), self.background_tasks.wait())
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
warn!("timed out waiting for background tasks to shut down; proceeding");
|
||||
}
|
||||
}
|
||||
|
||||
/// Asks the thread manager to shut down all threads and logs the failures.
///
/// Calls `thread_manager.shutdown_all_threads_bounded` with a 10-second
/// duration, then emits one warning per thread id listed in the returned
/// report's `submit_failed` and `timed_out` collections. Nothing is returned
/// to the caller; failures are only surfaced through the log.
// NOTE(review): whether the 10 seconds is a per-thread or an overall bound is
// decided inside `shutdown_all_threads_bounded` — confirm there.
pub(crate) async fn shutdown_threads(&self) {
|
||||
let report = self
|
||||
.thread_manager
|
||||
.shutdown_all_threads_bounded(Duration::from_secs(10))
|
||||
.await;
|
||||
for thread_id in report.submit_failed {
|
||||
warn!("failed to submit Shutdown to thread {thread_id}");
|
||||
}
|
||||
for thread_id in report.timed_out {
|
||||
warn!("timed out waiting for thread {thread_id} to shut down");
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the W3C trace context associated with `request_id`, if any.
///
/// This is a thin wrapper that forwards to
/// `self.outgoing.request_trace_context` and returns its result unchanged.
// NOTE(review): `None` presumably means no trace context is currently recorded
// for this request — confirm against the outgoing sender.
async fn request_trace_context(
|
||||
&self,
|
||||
request_id: &ConnectionRequestId,
|
||||
) -> Option<codex_protocol::protocol::W3cTraceContext> {
|
||||
self.outgoing.request_trace_context(request_id).await
|
||||
}
|
||||
|
||||
/// Submits `op` to `thread`, attaching the trace context of `request_id`.
///
/// The trace context is looked up through `request_trace_context` and passed,
/// together with `op`, to `CodexThread::submit_with_trace`. The result of that
/// call is returned unchanged.
// NOTE(review): the `Ok` string appears to be the submission id (call sites
// bind it as `turn_id`) — confirm against `submit_with_trace`.
async fn submit_core_op(
|
||||
&self,
|
||||
request_id: &ConnectionRequestId,
|
||||
thread: &CodexThread,
|
||||
op: Op,
|
||||
) -> CodexResult<String> {
|
||||
thread
|
||||
.submit_with_trace(op, self.request_trace_context(request_id).await)
|
||||
.await
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -1877,6 +1938,7 @@ impl CodexMessageProcessor {
|
||||
persist_extended_history: bool,
|
||||
service_name: Option<String>,
|
||||
experimental_raw_events: bool,
|
||||
request_trace: Option<W3cTraceContext>,
|
||||
) {
|
||||
let config = match derive_config_from_params(
|
||||
&cli_overrides,
|
||||
@@ -1934,6 +1996,7 @@ impl CodexMessageProcessor {
|
||||
core_dynamic_tools,
|
||||
persist_extended_history,
|
||||
service_name,
|
||||
request_trace,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -2199,7 +2262,10 @@ impl CodexMessageProcessor {
|
||||
};
|
||||
|
||||
if let Ok(thread) = self.thread_manager.get_thread(thread_id).await {
|
||||
if let Err(err) = thread.submit(Op::SetThreadName { name }).await {
|
||||
if let Err(err) = self
|
||||
.submit_core_op(&request_id, thread.as_ref(), Op::SetThreadName { name })
|
||||
.await
|
||||
{
|
||||
self.send_internal_error(request_id, format!("failed to set thread name: {err}"))
|
||||
.await;
|
||||
return;
|
||||
@@ -2784,7 +2850,14 @@ impl CodexMessageProcessor {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Err(err) = thread.submit(Op::ThreadRollback { num_turns }).await {
|
||||
if let Err(err) = self
|
||||
.submit_core_op(
|
||||
&request_id,
|
||||
thread.as_ref(),
|
||||
Op::ThreadRollback { num_turns },
|
||||
)
|
||||
.await
|
||||
{
|
||||
// No ThreadRollback event will arrive if an error occurs.
|
||||
// Clean up and reply immediately.
|
||||
let thread_state = self.thread_state_manager.thread_state(thread_id).await;
|
||||
@@ -2812,7 +2885,10 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
};
|
||||
|
||||
match thread.submit(Op::Compact).await {
|
||||
match self
|
||||
.submit_core_op(&request_id, thread.as_ref(), Op::Compact)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
self.outgoing
|
||||
.send_response(request_id, ThreadCompactStartResponse {})
|
||||
@@ -2840,7 +2916,10 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
};
|
||||
|
||||
match thread.submit(Op::CleanBackgroundTerminals).await {
|
||||
match self
|
||||
.submit_core_op(&request_id, thread.as_ref(), Op::CleanBackgroundTerminals)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
self.outgoing
|
||||
.send_response(request_id, ThreadBackgroundTerminalsCleanResponse {})
|
||||
@@ -3298,6 +3377,7 @@ impl CodexMessageProcessor {
|
||||
thread_history,
|
||||
self.auth_manager.clone(),
|
||||
persist_extended_history,
|
||||
self.request_trace_context(&request_id).await,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -3823,6 +3903,7 @@ impl CodexMessageProcessor {
|
||||
config,
|
||||
rollout_path.clone(),
|
||||
persist_extended_history,
|
||||
self.request_trace_context(&request_id).await,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -4694,26 +4775,10 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
|
||||
async fn wait_for_thread_shutdown(thread: &Arc<CodexThread>) -> ThreadShutdownResult {
|
||||
match thread.submit(Op::Shutdown).await {
|
||||
Ok(_) => {
|
||||
let wait_for_shutdown = async {
|
||||
loop {
|
||||
if matches!(thread.agent_status().await, AgentStatus::Shutdown) {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
};
|
||||
if tokio::time::timeout(Duration::from_secs(10), wait_for_shutdown)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
ThreadShutdownResult::TimedOut
|
||||
} else {
|
||||
ThreadShutdownResult::Complete
|
||||
}
|
||||
}
|
||||
Err(_) => ThreadShutdownResult::SubmitFailed,
|
||||
match tokio::time::timeout(Duration::from_secs(10), thread.shutdown_and_wait()).await {
|
||||
Ok(Ok(())) => ThreadShutdownResult::Complete,
|
||||
Ok(Err(_)) => ThreadShutdownResult::SubmitFailed,
|
||||
Err(_) => ThreadShutdownResult::TimedOut,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5799,28 +5864,36 @@ impl CodexMessageProcessor {
|
||||
|
||||
// If any overrides are provided, update the session turn context first.
|
||||
if has_any_overrides {
|
||||
let _ = thread
|
||||
.submit(Op::OverrideTurnContext {
|
||||
cwd: params.cwd,
|
||||
approval_policy: params.approval_policy.map(AskForApproval::to_core),
|
||||
sandbox_policy: params.sandbox_policy.map(|p| p.to_core()),
|
||||
windows_sandbox_level: None,
|
||||
model: params.model,
|
||||
effort: params.effort.map(Some),
|
||||
summary: params.summary,
|
||||
service_tier: params.service_tier,
|
||||
collaboration_mode,
|
||||
personality: params.personality,
|
||||
})
|
||||
let _ = self
|
||||
.submit_core_op(
|
||||
&request_id,
|
||||
thread.as_ref(),
|
||||
Op::OverrideTurnContext {
|
||||
cwd: params.cwd,
|
||||
approval_policy: params.approval_policy.map(AskForApproval::to_core),
|
||||
sandbox_policy: params.sandbox_policy.map(|p| p.to_core()),
|
||||
windows_sandbox_level: None,
|
||||
model: params.model,
|
||||
effort: params.effort.map(Some),
|
||||
summary: params.summary,
|
||||
service_tier: params.service_tier,
|
||||
collaboration_mode,
|
||||
personality: params.personality,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// Start the turn by submitting the user input. Return its submission id as turn_id.
|
||||
let turn_id = thread
|
||||
.submit(Op::UserInput {
|
||||
items: mapped_items,
|
||||
final_output_json_schema: params.output_schema,
|
||||
})
|
||||
let turn_id = self
|
||||
.submit_core_op(
|
||||
&request_id,
|
||||
thread.as_ref(),
|
||||
Op::UserInput {
|
||||
items: mapped_items,
|
||||
final_output_json_schema: params.output_schema,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
match turn_id {
|
||||
@@ -5977,11 +6050,15 @@ impl CodexMessageProcessor {
|
||||
return;
|
||||
};
|
||||
|
||||
let submit = thread
|
||||
.submit(Op::RealtimeConversationStart(ConversationStartParams {
|
||||
prompt: params.prompt,
|
||||
session_id: params.session_id,
|
||||
}))
|
||||
let submit = self
|
||||
.submit_core_op(
|
||||
&request_id,
|
||||
thread.as_ref(),
|
||||
Op::RealtimeConversationStart(ConversationStartParams {
|
||||
prompt: params.prompt,
|
||||
session_id: params.session_id,
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
|
||||
match submit {
|
||||
@@ -6012,10 +6089,14 @@ impl CodexMessageProcessor {
|
||||
return;
|
||||
};
|
||||
|
||||
let submit = thread
|
||||
.submit(Op::RealtimeConversationAudio(ConversationAudioParams {
|
||||
frame: params.audio.into(),
|
||||
}))
|
||||
let submit = self
|
||||
.submit_core_op(
|
||||
&request_id,
|
||||
thread.as_ref(),
|
||||
Op::RealtimeConversationAudio(ConversationAudioParams {
|
||||
frame: params.audio.into(),
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
|
||||
match submit {
|
||||
@@ -6046,10 +6127,12 @@ impl CodexMessageProcessor {
|
||||
return;
|
||||
};
|
||||
|
||||
let submit = thread
|
||||
.submit(Op::RealtimeConversationText(ConversationTextParams {
|
||||
text: params.text,
|
||||
}))
|
||||
let submit = self
|
||||
.submit_core_op(
|
||||
&request_id,
|
||||
thread.as_ref(),
|
||||
Op::RealtimeConversationText(ConversationTextParams { text: params.text }),
|
||||
)
|
||||
.await;
|
||||
|
||||
match submit {
|
||||
@@ -6080,7 +6163,9 @@ impl CodexMessageProcessor {
|
||||
return;
|
||||
};
|
||||
|
||||
let submit = thread.submit(Op::RealtimeConversationClose).await;
|
||||
let submit = self
|
||||
.submit_core_op(&request_id, thread.as_ref(), Op::RealtimeConversationClose)
|
||||
.await;
|
||||
|
||||
match submit {
|
||||
Ok(_) => {
|
||||
@@ -6143,7 +6228,13 @@ impl CodexMessageProcessor {
|
||||
display_text: &str,
|
||||
parent_thread_id: String,
|
||||
) -> std::result::Result<(), JSONRPCErrorError> {
|
||||
let turn_id = parent_thread.submit(Op::Review { review_request }).await;
|
||||
let turn_id = self
|
||||
.submit_core_op(
|
||||
request_id,
|
||||
parent_thread.as_ref(),
|
||||
Op::Review { review_request },
|
||||
)
|
||||
.await;
|
||||
|
||||
match turn_id {
|
||||
Ok(turn_id) => {
|
||||
@@ -6197,7 +6288,13 @@ impl CodexMessageProcessor {
|
||||
..
|
||||
} = self
|
||||
.thread_manager
|
||||
.fork_thread(usize::MAX, config, rollout_path, false)
|
||||
.fork_thread(
|
||||
usize::MAX,
|
||||
config,
|
||||
rollout_path,
|
||||
false,
|
||||
self.request_trace_context(request_id).await,
|
||||
)
|
||||
.await
|
||||
.map_err(|err| JSONRPCErrorError {
|
||||
code: INTERNAL_ERROR_CODE,
|
||||
@@ -6252,8 +6349,12 @@ impl CodexMessageProcessor {
|
||||
);
|
||||
}
|
||||
|
||||
let turn_id = review_thread
|
||||
.submit(Op::Review { review_request })
|
||||
let turn_id = self
|
||||
.submit_core_op(
|
||||
request_id,
|
||||
review_thread.as_ref(),
|
||||
Op::Review { review_request },
|
||||
)
|
||||
.await
|
||||
.map_err(|err| JSONRPCErrorError {
|
||||
code: INTERNAL_ERROR_CODE,
|
||||
@@ -6351,7 +6452,9 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
|
||||
// Submit the interrupt; we'll respond upon TurnAborted.
|
||||
let _ = thread.submit(Op::Interrupt).await;
|
||||
let _ = self
|
||||
.submit_core_op(&request_id, thread.as_ref(), Op::Interrupt)
|
||||
.await;
|
||||
}
|
||||
|
||||
async fn ensure_conversation_listener(
|
||||
|
||||
Reference in New Issue
Block a user