From 03e6c5f60048b81f443f3573b1578320c2189fbc Mon Sep 17 00:00:00 2001 From: Anton Panasenko Date: Fri, 22 May 2026 17:38:22 -0700 Subject: [PATCH] fix(remote-control): cap reconnect backoff (#24164) ## Why Remote-control websocket reconnects currently use the shared exponential backoff helper without a local ceiling, so a long failure streak can stretch retries out indefinitely and leave the runtime behavior hard to inspect from logs. ## What Changed Cap the remote-control reconnect delay at 30 seconds, then reset the reconnect attempt counter once that capped delay is emitted so the next failure starts from the initial jittered delay again. The reconnect failure log now records the attempt number, chosen delay, and whether the cap triggered a reset, with a separate info log when the backoff counter is reset after the cap. ## Verification `just test -p codex-app-server-transport` Related issue: N/A --- .../src/transport/remote_control/websocket.rs | 52 +++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/codex-rs/app-server-transport/src/transport/remote_control/websocket.rs b/codex-rs/app-server-transport/src/transport/remote_control/websocket.rs index 74ef9c2301..1e696c2e46 100644 --- a/codex-rs/app-server-transport/src/transport/remote_control/websocket.rs +++ b/codex-rs/app-server-transport/src/transport/remote_control/websocket.rs @@ -64,6 +64,8 @@ const REMOTE_CONTROL_WEBSOCKET_PONG_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60); const REMOTE_CONTROL_ACCOUNT_ID_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1); +const REMOTE_CONTROL_RECONNECT_BACKOFF_CAP: std::time::Duration = + std::time::Duration::from_secs(30); struct BoundedOutboundBuffer { buffer_by_stream: HashMap<(ClientId, StreamId), VecDeque>, @@ -514,12 +516,23 @@ impl RemoteControlWebsocket { } else { self.status_publisher .publish_status(RemoteControlConnectionStatus::Errored); + let reconnect_attempt = self.reconnect_attempt.saturating_add(1); + let (reconnect_delay, reconnect_backoff_reset) = + next_reconnect_delay(&mut self.reconnect_attempt); warn!( - "failed to connect to app-server remote control websocket: {}, err: {}", - remote_control_target.websocket_url, err + websocket_url = %remote_control_target.websocket_url, + error = %err, + reconnect_attempt, + reconnect_delay = ?reconnect_delay, + reconnect_backoff_reset, + "failed to connect to app-server remote control websocket" ); - let reconnect_delay = backoff(self.reconnect_attempt); - self.reconnect_attempt += 1; + if reconnect_backoff_reset { + info!( + reconnect_backoff_cap = ?REMOTE_CONTROL_RECONNECT_BACKOFF_CAP, + "reset app-server remote control websocket reconnect backoff after cap" + ); + } reconnect_delay }; tokio::select! { @@ -1036,6 +1049,17 @@ pub(crate) async fn load_remote_control_auth( }) } +fn next_reconnect_delay(reconnect_attempt: &mut u64) -> (std::time::Duration, bool) { + let reconnect_delay = backoff(*reconnect_attempt).min(REMOTE_CONTROL_RECONNECT_BACKOFF_CAP); + let reconnect_backoff_reset = reconnect_delay == REMOTE_CONTROL_RECONNECT_BACKOFF_CAP; + *reconnect_attempt = if reconnect_backoff_reset { + 0 + } else { + (*reconnect_attempt).saturating_add(1) + }; + (reconnect_delay, reconnect_backoff_reset) +} + pub(super) async fn connect_remote_control_websocket( remote_control_target: &RemoteControlTarget, state_db: Option<&StateRuntime>, @@ -1326,6 +1350,26 @@ mod tests { const TEST_HTTP_ACCEPT_TIMEOUT: Duration = Duration::from_secs(5); const TEST_INSTALLATION_ID: &str = "11111111-1111-4111-8111-111111111111"; + #[test] + fn next_reconnect_delay_resets_after_cap() { + let mut reconnect_attempt = 9; + + let (reconnect_delay, reconnect_backoff_reset) = + next_reconnect_delay(&mut reconnect_attempt); + + assert_eq!(reconnect_delay, REMOTE_CONTROL_RECONNECT_BACKOFF_CAP); + assert!(reconnect_backoff_reset); + assert_eq!(reconnect_attempt, 0); + + let (reconnect_delay, reconnect_backoff_reset) = + next_reconnect_delay(&mut reconnect_attempt); + + assert!(reconnect_delay >= Duration::from_millis(180)); + assert!(reconnect_delay <= Duration::from_millis(220)); + assert!(!reconnect_backoff_reset); + assert_eq!(reconnect_attempt, 1); + } + fn remote_control_status_channel() -> ( RemoteControlStatusPublisher, watch::Receiver,