diff --git a/codex-rs/app-server/src/request_processors/thread_processor.rs b/codex-rs/app-server/src/request_processors/thread_processor.rs index 4c14229729..9b5cef069d 100644 --- a/codex-rs/app-server/src/request_processors/thread_processor.rs +++ b/codex-rs/app-server/src/request_processors/thread_processor.rs @@ -960,6 +960,7 @@ impl ThreadRequestProcessor { experimental_raw_events: bool, request_trace: Option, ) -> Result<(), JSONRPCErrorError> { + let thread_start_started_at = std::time::Instant::now(); let requested_cwd = typesafe_overrides.cwd.clone(); let mut config = config_manager .load_with_overrides(config_overrides.clone(), typesafe_overrides.clone()) @@ -1054,6 +1055,7 @@ impl ThreadRequestProcessor { .collect() }; let core_dynamic_tool_count = core_dynamic_tools.len(); + let create_thread_started_at = std::time::Instant::now(); let NewThread { thread_id, thread, @@ -1088,6 +1090,12 @@ impl ThreadRequestProcessor { CodexErr::InvalidRequest(message) => invalid_request(message), err => internal_error(format!("error creating thread: {err}")), })?; + let session_telemetry = thread.session_telemetry(); + session_telemetry.record_startup_phase( + "thread_start_create_thread", + create_thread_started_at.elapsed(), + Some("ready"), + ); Self::set_app_server_client_info( thread.as_ref(), @@ -1189,6 +1197,11 @@ impl ThreadRequestProcessor { otel.name = "app_server.thread_start.notify_started", )) .await; + session_telemetry.record_startup_phase( + "thread_start_total", + thread_start_started_at.elapsed(), + Some("ready"), + ); Ok(()) } diff --git a/codex-rs/core/src/codex_thread.rs b/codex-rs/core/src/codex_thread.rs index a0dd95c37b..c110d30f66 100644 --- a/codex-rs/core/src/codex_thread.rs +++ b/codex-rs/core/src/codex_thread.rs @@ -6,6 +6,7 @@ use crate::session::Codex; use crate::session::SessionSettingsUpdate; use crate::session::SteerInputError; use codex_features::Feature; +use codex_otel::SessionTelemetry; use codex_protocol::config_types::ApprovalsReviewer; use codex_protocol::config_types::CollaborationMode; use codex_protocol::config_types::Personality; @@ -125,6 +126,11 @@ impl CodexThread { self.codex.submit(op).await } + /// Returns the session telemetry handle for thread-scoped production instrumentation. + pub fn session_telemetry(&self) -> SessionTelemetry { + self.codex.session.services.session_telemetry.clone() + } + pub async fn shutdown_and_wait(&self) -> CodexResult<()> { self.codex.shutdown_and_wait().await } diff --git a/codex-rs/core/src/session_startup_prewarm.rs b/codex-rs/core/src/session_startup_prewarm.rs index 93d14ff7d6..0fac4b1c9a 100644 --- a/codex-rs/core/src/session_startup_prewarm.rs +++ b/codex-rs/core/src/session_startup_prewarm.rs @@ -52,6 +52,7 @@ impl SessionStartupPrewarmHandle { session_telemetry: &SessionTelemetry, cancellation_token: &CancellationToken, ) -> SessionStartupPrewarmResolution { + let resolve_started_at = Instant::now(); let Self { mut task, started_at, @@ -78,6 +79,11 @@ impl SessionStartupPrewarmHandle { } None => { task.abort(); + session_telemetry.record_startup_phase( + "startup_prewarm_resolve", + resolve_started_at.elapsed(), + Some("cancelled"), + ); session_telemetry.record_duration( STARTUP_PREWARM_AGE_AT_FIRST_TURN_METRIC, age_at_first_turn, @@ -92,6 +98,16 @@ impl SessionStartupPrewarmHandle { } } }; + let status = match &resolution { + SessionStartupPrewarmResolution::Cancelled => "cancelled", + SessionStartupPrewarmResolution::Ready(_) => "ready", + SessionStartupPrewarmResolution::Unavailable { status, .. } => status, + }; + session_telemetry.record_startup_phase( + "startup_prewarm_resolve", + resolve_started_at.elapsed(), + Some(status), + ); match resolution { SessionStartupPrewarmResolution::Cancelled => { @@ -165,6 +181,11 @@ impl Session { let result = schedule_startup_prewarm_inner(startup_prewarm_session, base_instructions).await; let status = if result.is_ok() { "ready" } else { "failed" }; + session_telemetry.record_startup_phase( + "startup_prewarm_total", + started_at.elapsed(), + Some(status), + ); session_telemetry.record_duration( STARTUP_PREWARM_DURATION_METRIC, started_at.elapsed(), @@ -200,10 +221,17 @@ async fn schedule_startup_prewarm_inner( session: Arc, base_instructions: String, ) -> CodexResult { + let prewarm_started_at = Instant::now(); let startup_turn_context = session .new_default_turn_with_sub_id(INITIAL_SUBMIT_ID.to_owned()) .await; + startup_turn_context.session_telemetry.record_startup_phase( + "startup_prewarm_create_turn_context", + prewarm_started_at.elapsed(), + /*status*/ None, + ); let startup_cancellation_token = CancellationToken::new(); + let built_tools_started_at = Instant::now(); let startup_router = built_tools( session.as_ref(), startup_turn_context.as_ref(), @@ -213,6 +241,12 @@ async fn schedule_startup_prewarm_inner( &startup_cancellation_token, ) .await?; + startup_turn_context.session_telemetry.record_startup_phase( + "startup_prewarm_build_tools", + built_tools_started_at.elapsed(), + /*status*/ None, + ); + let build_prompt_started_at = Instant::now(); let startup_prompt = build_prompt( Vec::new(), startup_router.as_ref(), @@ -221,10 +255,16 @@ async fn schedule_startup_prewarm_inner( text: base_instructions, }, ); + startup_turn_context.session_telemetry.record_startup_phase( + "startup_prewarm_build_prompt", + build_prompt_started_at.elapsed(), + /*status*/ None, + ); let startup_turn_metadata_header = startup_turn_context .turn_metadata_state .current_header_value(); let mut client_session = session.services.model_client.new_session(); + let websocket_warmup_started_at = Instant::now(); client_session .prewarm_websocket( &startup_prompt, @@ -236,6 +276,11 @@ async fn schedule_startup_prewarm_inner( startup_turn_metadata_header.as_deref(), ) .await?; + startup_turn_context.session_telemetry.record_startup_phase( + "startup_prewarm_websocket_warmup", + websocket_warmup_started_at.elapsed(), + /*status*/ None, + ); Ok(client_session) } diff --git a/codex-rs/core/src/turn_timing.rs b/codex-rs/core/src/turn_timing.rs index 74c3c59d80..3d35afae95 100644 --- a/codex-rs/core/src/turn_timing.rs +++ b/codex-rs/core/src/turn_timing.rs @@ -4,7 +4,6 @@ use std::time::SystemTime; use std::time::UNIX_EPOCH; use codex_otel::TURN_TTFM_DURATION_METRIC; -use codex_otel::TURN_TTFT_DURATION_METRIC; use codex_protocol::items::TurnItem; use codex_protocol::models::ResponseItem; use tokio::sync::Mutex; @@ -21,9 +20,7 @@ pub(crate) async fn record_turn_ttft_metric(turn_context: &TurnContext, event: & else { return; }; - turn_context - .session_telemetry - .record_duration(TURN_TTFT_DURATION_METRIC, duration, &[]); + turn_context.session_telemetry.record_turn_ttft(duration); } pub(crate) async fn record_turn_ttfm_metric(turn_context: &TurnContext, item: &TurnItem) { diff --git a/codex-rs/otel/src/events/session_telemetry.rs b/codex-rs/otel/src/events/session_telemetry.rs index d6839d8882..e377419a6b 100644 --- a/codex-rs/otel/src/events/session_telemetry.rs +++ b/codex-rs/otel/src/events/session_telemetry.rs @@ -18,9 +18,11 @@ use crate::metrics::RESPONSES_API_OVERHEAD_DURATION_METRIC; use crate::metrics::Result as MetricsResult; use crate::metrics::SSE_EVENT_COUNT_METRIC; use crate::metrics::SSE_EVENT_DURATION_METRIC; +use crate::metrics::STARTUP_PHASE_DURATION_METRIC; use crate::metrics::SessionMetricTagValues; use crate::metrics::TOOL_CALL_COUNT_METRIC; use crate::metrics::TOOL_CALL_DURATION_METRIC; +use crate::metrics::TURN_TTFT_DURATION_METRIC; use crate::metrics::WEBSOCKET_EVENT_COUNT_METRIC; use crate::metrics::WEBSOCKET_EVENT_DURATION_METRIC; use crate::metrics::WEBSOCKET_REQUEST_COUNT_METRIC; @@ -183,6 +185,45 @@ impl SessionTelemetry { } } + /// Records a coarse startup phase for production latency breakdowns. + pub fn record_startup_phase( + &self, + phase: &'static str, + duration: Duration, + status: Option<&'static str>, + ) { + let tags = match status { + Some(status) => vec![("phase", phase), ("status", status)], + None => vec![("phase", phase)], + }; + self.record_duration(STARTUP_PHASE_DURATION_METRIC, duration, &tags); + log_and_trace_event!( + self, + common: { + event.name = "codex.startup_phase", + startup.phase = phase, + startup.status = status, + duration_ms = %duration.as_millis(), + }, + log: {}, + trace: {}, + ); + } + + /// Records time to first token as both a metric and a production telemetry event. + pub fn record_turn_ttft(&self, duration: Duration) { + self.record_duration(TURN_TTFT_DURATION_METRIC, duration, &[]); + log_and_trace_event!( + self, + common: { + event.name = "codex.turn_ttft", + duration_ms = %duration.as_millis(), + }, + log: {}, + trace: {}, + ); + } + pub fn start_timer(&self, name: &str, tags: &[(&str, &str)]) -> Result { let Some(metrics) = &self.metrics else { return Err(MetricsError::ExporterDisabled); diff --git a/codex-rs/otel/src/metrics/names.rs b/codex-rs/otel/src/metrics/names.rs index 67ac6e19ed..82fae06734 100644 --- a/codex-rs/otel/src/metrics/names.rs +++ b/codex-rs/otel/src/metrics/names.rs @@ -38,6 +38,8 @@ pub const CURATED_PLUGINS_STARTUP_SYNC_METRIC: &str = "codex.plugins.startup_syn pub const CURATED_PLUGINS_STARTUP_SYNC_FINAL_METRIC: &str = "codex.plugins.startup_sync.final"; pub const HOOK_RUN_METRIC: &str = "codex.hooks.run"; pub const HOOK_RUN_DURATION_METRIC: &str = "codex.hooks.run.duration_ms"; +/// Duration for coarse startup phases, tagged by low-cardinality phase and status. +pub const STARTUP_PHASE_DURATION_METRIC: &str = "codex.startup.phase.duration_ms"; /// Total runtime of a startup prewarm attempt until it completes, tagged by final status. pub const STARTUP_PREWARM_DURATION_METRIC: &str = "codex.startup_prewarm.duration_ms"; /// Age of the startup prewarm attempt when the first real turn resolves it, tagged by outcome.