chore: stop app-server auth refresh storms after permanent token failure (#15530)

built from #14256. PR description from @etraut-openai: This PR addresses a hole in [PR 11802](https://github.com/openai/codex/pull/11802). The previous PR assumed that app server clients would respond to token refresh failures by presenting the user with an error ("you must log in again") and then not making further attempts to call network endpoints using the expired token. While they do present the user with this error, they don't prevent further attempts to call network endpoints and can repeatedly call `getAuthStatus(refreshToken=true)` resulting in many failed calls to the token refresh endpoint. There are three solutions I considered here: 1. Change the getAuthStatus app server call to return a null auth if the caller specified "refreshToken" on input and the refresh attempt fails. This will cause clients to immediately log out the user and return them to the log in screen. This is a really bad user experience. It's also a breaking change in the app server contract that could break third-party clients. 2. Augment the getAuthStatus app server call to return an additional field that indicates the state of "token could not be refreshed". This is a non-breaking change to the app server API, but it requires non-trivial changes for all clients to properly handle this new field properly. 3. Change the getAuthStatus implementation to handle the case where a token refresh fails by marking the AuthManager's in-memory access and refresh tokens as "poisoned" so it they are no longer used. This is the simplest fix that requires no client changes. I chose option 3. Here's Codex's explanation of this change: When an app-server client asks `getAuthStatus(refreshToken=true)`, we may try to refresh a stale ChatGPT access token. If that refresh fails permanently (for example `refresh_token_reused`, expired, or revoked), the old behavior was bad in two ways: 1. We kept the in-memory auth snapshot alive as if it were still usable. 2. Later auth checks could retry refresh again and again, creating a storm of doomed `/oauth/token` requests and repeatedly surfacing the same failure. This is especially painful for app-server clients because they poll auth status and can keep driving the refresh path without any real chance of recovery. This change makes permanent refresh failures terminal for the current managed auth snapshot without changing the app-server API contract. What changed: - `AuthManager` now poisons the current managed auth snapshot in memory after a permanent refresh failure, keyed to the unchanged `AuthDotJson`. - Once poisoned, later refresh attempts for that same snapshot fail fast locally without calling the auth service again. - The poison is cleared automatically when auth materially changes, such as a new login, logout, or reload of different auth state from storage. - `getAuthStatus(includeToken=true)` now omits `authToken` after a permanent refresh failure instead of handing out the stale cached bearer token. This keeps the current auth method visible to clients, avoids forcing an immediate logout flow, and stops repeated refresh attempts for credentials that cannot recover. --------- Co-authored-by: Eric Traut <etraut@openai.com>
2026-04-29 08:56:38 +00:00 · 2026-03-24 12:39:58 -07:00
parent 7dc2cd2ebe
commit 88694e8417
5 changed files with 587 additions and 17 deletions
--- a/codex-rs/core/tests/suite/auth_refresh.rs
+++ b/codex-rs/core/tests/suite/auth_refresh.rs
@@ -545,6 +545,153 @@ async fn refresh_token_returns_permanent_error_for_expired_refresh_token() -> Re
    Ok(())
 }

+#[serial_test::serial(auth_refresh)]
+#[tokio::test]
+async fn refresh_token_does_not_retry_after_permanent_failure() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/oauth/token"))
+        .respond_with(ResponseTemplate::new(401).set_body_json(json!({
+            "error": {
+                "code": "refresh_token_reused"
+            }
+        })))
+        .expect(1)
+        .mount(&server)
+        .await;
+
+    let ctx = RefreshTokenTestContext::new(&server)?;
+    let initial_last_refresh = Utc::now() - Duration::days(1);
+    let initial_tokens = build_tokens(INITIAL_ACCESS_TOKEN, INITIAL_REFRESH_TOKEN);
+    let initial_auth = AuthDotJson {
+        auth_mode: Some(AuthMode::Chatgpt),
+        openai_api_key: None,
+        tokens: Some(initial_tokens.clone()),
+        last_refresh: Some(initial_last_refresh),
+    };
+    ctx.write_auth(&initial_auth)?;
+
+    let first_err = ctx
+        .auth_manager
+        .refresh_token()
+        .await
+        .err()
+        .context("first refresh should fail")?;
+    assert_eq!(
+        first_err.failed_reason(),
+        Some(RefreshTokenFailedReason::Exhausted)
+    );
+
+    let second_err = ctx
+        .auth_manager
+        .refresh_token()
+        .await
+        .err()
+        .context("second refresh should fail without retrying")?;
+    assert_eq!(
+        second_err.failed_reason(),
+        Some(RefreshTokenFailedReason::Exhausted)
+    );
+
+    let stored = ctx.load_auth()?;
+    assert_eq!(stored, initial_auth);
+    let cached_auth = ctx
+        .auth_manager
+        .auth()
+        .await
+        .context("auth should remain cached")?;
+    let cached = cached_auth
+        .get_token_data()
+        .context("token data should remain cached")?;
+    assert_eq!(cached, initial_tokens);
+
+    server.verify().await;
+    Ok(())
+}
+
+#[serial_test::serial(auth_refresh)]
+#[tokio::test]
+async fn refresh_token_reloads_changed_auth_after_permanent_failure() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/oauth/token"))
+        .respond_with(ResponseTemplate::new(401).set_body_json(json!({
+            "error": {
+                "code": "refresh_token_reused"
+            }
+        })))
+        .expect(1)
+        .mount(&server)
+        .await;
+
+    let ctx = RefreshTokenTestContext::new(&server)?;
+    let initial_last_refresh = Utc::now() - Duration::days(1);
+    let initial_tokens = build_tokens(INITIAL_ACCESS_TOKEN, INITIAL_REFRESH_TOKEN);
+    let initial_auth = AuthDotJson {
+        auth_mode: Some(AuthMode::Chatgpt),
+        openai_api_key: None,
+        tokens: Some(initial_tokens.clone()),
+        last_refresh: Some(initial_last_refresh),
+    };
+    ctx.write_auth(&initial_auth)?;
+
+    let first_err = ctx
+        .auth_manager
+        .refresh_token()
+        .await
+        .err()
+        .context("first refresh should fail")?;
+    assert_eq!(
+        first_err.failed_reason(),
+        Some(RefreshTokenFailedReason::Exhausted)
+    );
+
+    let fresh_refresh = Utc::now() - Duration::hours(1);
+    let disk_tokens = build_tokens("disk-access-token", "disk-refresh-token");
+    let disk_auth = AuthDotJson {
+        auth_mode: Some(AuthMode::Chatgpt),
+        openai_api_key: None,
+        tokens: Some(disk_tokens.clone()),
+        last_refresh: Some(fresh_refresh),
+    };
+    save_auth(
+        ctx.codex_home.path(),
+        &disk_auth,
+        AuthCredentialsStoreMode::File,
+    )?;
+
+    ctx.auth_manager
+        .refresh_token()
+        .await
+        .context("refresh should reload changed auth without retrying")?;
+
+    let stored = ctx.load_auth()?;
+    assert_eq!(stored, disk_auth);
+
+    let cached_auth = ctx
+        .auth_manager
+        .auth_cached()
+        .context("auth should be cached")?;
+    let cached = cached_auth
+        .get_token_data()
+        .context("token data should reload from disk")?;
+    assert_eq!(cached, disk_tokens);
+
+    let requests = server.received_requests().await.unwrap_or_default();
+    assert_eq!(
+        requests.len(),
+        1,
+        "expected only the initial refresh request"
+    );
+
+    server.verify().await;
+    Ok(())
+}
+
 #[serial_test::serial(auth_refresh)]
 #[tokio::test]
 async fn refresh_token_returns_transient_error_on_server_failure() -> Result<()> {