mirror of
https://github.com/openai/codex.git
synced 2026-05-23 20:44:50 +00:00
fix(remote-control): cap reconnect backoff (#24164)
## Why Remote-control websocket reconnects currently use the shared exponential backoff helper without a local ceiling, so a long failure streak can stretch retries out indefinitely and leave the runtime behavior hard to inspect from logs. ## What Changed Cap the remote-control reconnect delay at 30 seconds, then reset the reconnect attempt counter once that capped delay is emitted so the next failure starts from the initial jittered delay again. The reconnect failure log now records the attempt number, chosen delay, and whether the cap triggered a reset, with a separate info log when the backoff counter is reset after the cap. ## Verification `just test -p codex-app-server-transport` Related issue: N/A
This commit is contained in:
@@ -64,6 +64,8 @@ const REMOTE_CONTROL_WEBSOCKET_PONG_TIMEOUT: std::time::Duration =
|
||||
std::time::Duration::from_secs(60);
|
||||
const REMOTE_CONTROL_ACCOUNT_ID_RETRY_INTERVAL: std::time::Duration =
|
||||
std::time::Duration::from_secs(1);
|
||||
const REMOTE_CONTROL_RECONNECT_BACKOFF_CAP: std::time::Duration =
|
||||
std::time::Duration::from_secs(30);
|
||||
|
||||
struct BoundedOutboundBuffer {
|
||||
buffer_by_stream: HashMap<(ClientId, StreamId), VecDeque<ServerEnvelope>>,
|
||||
@@ -514,12 +516,23 @@ impl RemoteControlWebsocket {
|
||||
} else {
|
||||
self.status_publisher
|
||||
.publish_status(RemoteControlConnectionStatus::Errored);
|
||||
let reconnect_attempt = self.reconnect_attempt.saturating_add(1);
|
||||
let (reconnect_delay, reconnect_backoff_reset) =
|
||||
next_reconnect_delay(&mut self.reconnect_attempt);
|
||||
warn!(
|
||||
"failed to connect to app-server remote control websocket: {}, err: {}",
|
||||
remote_control_target.websocket_url, err
|
||||
websocket_url = %remote_control_target.websocket_url,
|
||||
error = %err,
|
||||
reconnect_attempt,
|
||||
reconnect_delay = ?reconnect_delay,
|
||||
reconnect_backoff_reset,
|
||||
"failed to connect to app-server remote control websocket"
|
||||
);
|
||||
let reconnect_delay = backoff(self.reconnect_attempt);
|
||||
self.reconnect_attempt += 1;
|
||||
if reconnect_backoff_reset {
|
||||
info!(
|
||||
reconnect_backoff_cap = ?REMOTE_CONTROL_RECONNECT_BACKOFF_CAP,
|
||||
"reset app-server remote control websocket reconnect backoff after cap"
|
||||
);
|
||||
}
|
||||
reconnect_delay
|
||||
};
|
||||
tokio::select! {
|
||||
@@ -1036,6 +1049,17 @@ pub(crate) async fn load_remote_control_auth(
|
||||
})
|
||||
}
|
||||
|
||||
fn next_reconnect_delay(reconnect_attempt: &mut u64) -> (std::time::Duration, bool) {
|
||||
let reconnect_delay = backoff(*reconnect_attempt).min(REMOTE_CONTROL_RECONNECT_BACKOFF_CAP);
|
||||
let reconnect_backoff_reset = reconnect_delay == REMOTE_CONTROL_RECONNECT_BACKOFF_CAP;
|
||||
*reconnect_attempt = if reconnect_backoff_reset {
|
||||
0
|
||||
} else {
|
||||
(*reconnect_attempt).saturating_add(1)
|
||||
};
|
||||
(reconnect_delay, reconnect_backoff_reset)
|
||||
}
|
||||
|
||||
pub(super) async fn connect_remote_control_websocket(
|
||||
remote_control_target: &RemoteControlTarget,
|
||||
state_db: Option<&StateRuntime>,
|
||||
@@ -1326,6 +1350,26 @@ mod tests {
|
||||
const TEST_HTTP_ACCEPT_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
const TEST_INSTALLATION_ID: &str = "11111111-1111-4111-8111-111111111111";
|
||||
|
||||
#[test]
|
||||
fn next_reconnect_delay_resets_after_cap() {
|
||||
let mut reconnect_attempt = 9;
|
||||
|
||||
let (reconnect_delay, reconnect_backoff_reset) =
|
||||
next_reconnect_delay(&mut reconnect_attempt);
|
||||
|
||||
assert_eq!(reconnect_delay, REMOTE_CONTROL_RECONNECT_BACKOFF_CAP);
|
||||
assert!(reconnect_backoff_reset);
|
||||
assert_eq!(reconnect_attempt, 0);
|
||||
|
||||
let (reconnect_delay, reconnect_backoff_reset) =
|
||||
next_reconnect_delay(&mut reconnect_attempt);
|
||||
|
||||
assert!(reconnect_delay >= Duration::from_millis(180));
|
||||
assert!(reconnect_delay <= Duration::from_millis(220));
|
||||
assert!(!reconnect_backoff_reset);
|
||||
assert_eq!(reconnect_attempt, 1);
|
||||
}
|
||||
|
||||
fn remote_control_status_channel() -> (
|
||||
RemoteControlStatusPublisher,
|
||||
watch::Receiver<RemoteControlStatusChangedNotification>,
|
||||
|
||||
Reference in New Issue
Block a user