Make cloud_requirements fail close (#13063)

Make it fail-close only for CLI for now
Will extend this for app-server later
This commit is contained in:
alexsong-oai
2026-02-27 18:22:05 -08:00
committed by GitHub
parent e6032eb0b7
commit e2fef7a3d2
6 changed files with 158 additions and 65 deletions

View File

@@ -4,9 +4,9 @@
//! from the local filesystem. It only applies to Business (aka Enterprise CBP) or Enterprise ChatGPT
//! customers.
//!
//! Today, fetching is best-effort: on error or timeout, Codex continues without cloud requirements.
//! We expect to tighten this so that Enterprise ChatGPT customers must successfully fetch these
//! requirements before Codex will run.
//! Fetching fails closed for eligible ChatGPT Business and Enterprise accounts. When cloud
//! requirements cannot be loaded for those accounts, Codex fails configuration loading rather than
//! continuing without them.
use async_trait::async_trait;
use base64::Engine;
@@ -17,6 +17,7 @@ use chrono::Utc;
use codex_backend_client::Client as BackendClient;
use codex_core::AuthManager;
use codex_core::auth::CodexAuth;
use codex_core::config_loader::CloudRequirementsLoadError;
use codex_core::config_loader::CloudRequirementsLoader;
use codex_core::config_loader::ConfigRequirementsToml;
use codex_core::util::backoff;
@@ -220,16 +221,34 @@ impl CloudRequirementsService {
}
}
async fn fetch_with_timeout(&self) -> Option<ConfigRequirementsToml> {
async fn fetch_with_timeout(
&self,
) -> Result<Option<ConfigRequirementsToml>, CloudRequirementsLoadError> {
let _timer =
codex_otel::start_global_timer("codex.cloud_requirements.fetch.duration_ms", &[]);
let started_at = Instant::now();
let result = timeout(self.timeout, self.fetch())
.await
.inspect_err(|_| {
tracing::warn!("Timed out waiting for cloud requirements; continuing without them");
let message = format!(
"Timed out waiting for cloud requirements after {}s",
self.timeout.as_secs()
);
tracing::error!("{message}");
if let Some(metrics) = codex_otel::metrics::global() {
let _ = metrics.counter(
"codex.cloud_requirements.load_failure",
1,
&[("trigger", "startup")],
);
}
})
.ok()?;
.map_err(|_| {
CloudRequirementsLoadError::new(format!(
"timed out waiting for cloud requirements after {}s",
self.timeout.as_secs()
))
})??;
match result.as_ref() {
Some(requirements) => {
@@ -247,18 +266,20 @@ impl CloudRequirementsService {
}
}
result
Ok(result)
}
async fn fetch(&self) -> Option<ConfigRequirementsToml> {
let auth = self.auth_manager.auth().await?;
async fn fetch(&self) -> Result<Option<ConfigRequirementsToml>, CloudRequirementsLoadError> {
let Some(auth) = self.auth_manager.auth().await else {
return Ok(None);
};
if !auth.is_chatgpt_auth()
|| !matches!(
auth.account_plan_type(),
Some(PlanType::Business | PlanType::Enterprise)
)
{
return None;
return Ok(None);
}
let token_data = auth.get_token_data().ok();
let chatgpt_user_id = token_data
@@ -273,7 +294,7 @@ impl CloudRequirementsService {
path = %self.cache_path.display(),
"Using cached cloud requirements"
);
return signed_payload.requirements();
return Ok(signed_payload.requirements());
}
Err(cache_load_status) => {
self.log_cache_load_status(&cache_load_status);
@@ -281,7 +302,15 @@ impl CloudRequirementsService {
}
self.fetch_with_retries(&auth, chatgpt_user_id, account_id)
.await?
.await
.ok_or_else(|| {
let message = "failed to load your workspace-managed config";
tracing::error!(
path = %self.cache_path.display(),
"{message}"
);
CloudRequirementsLoadError::new(message)
})
}
async fn fetch_with_retries(
@@ -311,7 +340,7 @@ impl CloudRequirementsService {
Some(contents) => match parse_cloud_requirements(contents) {
Ok(requirements) => requirements,
Err(err) => {
tracing::warn!(error = %err, "Failed to parse cloud requirements");
tracing::error!(error = %err, "Failed to parse cloud requirements");
return None;
}
},
@@ -342,7 +371,7 @@ impl CloudRequirementsService {
Ok(true) => {}
Ok(false) => break,
Err(_) => {
tracing::warn!(
tracing::error!(
"Timed out refreshing cloud requirements cache from remote; keeping existing cache"
);
}
@@ -375,10 +404,17 @@ impl CloudRequirementsService {
.await
.is_none()
{
tracing::warn!(
tracing::error!(
path = %self.cache_path.display(),
"Failed to refresh cloud requirements cache from remote"
);
if let Some(metrics) = codex_otel::metrics::global() {
let _ = metrics.counter(
"codex.cloud_requirements.load_failure",
1,
&[("trigger", "refresh")],
);
}
}
true
}
@@ -522,10 +558,10 @@ pub fn cloud_requirements_loader(
existing_task.abort();
}
CloudRequirementsLoader::new(async move {
task.await
.inspect_err(|err| tracing::warn!(error = %err, "Cloud requirements task failed"))
.ok()
.flatten()
task.await.map_err(|err| {
tracing::error!(error = %err, "Cloud requirements task failed");
CloudRequirementsLoadError::new(format!("cloud requirements load failed: {err}"))
})?
})
}
@@ -692,7 +728,7 @@ mod tests {
CLOUD_REQUIREMENTS_TIMEOUT,
);
let result = service.fetch().await;
assert!(result.is_none());
assert_eq!(result, Ok(None));
}
#[tokio::test]
@@ -705,7 +741,7 @@ mod tests {
CLOUD_REQUIREMENTS_TIMEOUT,
);
let result = service.fetch().await;
assert!(result.is_none());
assert_eq!(result, Ok(None));
}
#[tokio::test]
@@ -721,7 +757,7 @@ mod tests {
);
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -729,7 +765,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
}
@@ -789,7 +825,11 @@ mod tests {
tokio::time::advance(CLOUD_REQUIREMENTS_TIMEOUT + Duration::from_millis(1)).await;
let result = handle.await.expect("cloud requirements task");
assert!(result.is_none());
let err = result.expect_err("cloud requirements timeout should fail closed");
assert!(
err.to_string()
.contains("timed out waiting for cloud requirements")
);
}
#[tokio::test(start_paused = true)]
@@ -812,7 +852,7 @@ mod tests {
assert_eq!(
handle.await.expect("cloud requirements task"),
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -820,7 +860,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 2);
}
@@ -839,7 +879,7 @@ mod tests {
CLOUD_REQUIREMENTS_TIMEOUT,
);
assert!(service.fetch().await.is_none());
assert!(service.fetch().await.is_err());
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 1);
}
@@ -868,7 +908,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -876,7 +916,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 0);
}
@@ -895,7 +935,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -903,7 +943,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
let path = codex_home.path().join(CLOUD_REQUIREMENTS_CACHE_FILENAME);
@@ -942,7 +982,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::OnRequest]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -950,7 +990,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 1);
}
@@ -988,7 +1028,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::OnRequest]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -996,7 +1036,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 1);
}
@@ -1038,7 +1078,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -1046,7 +1086,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 1);
}
@@ -1089,7 +1129,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -1097,7 +1137,7 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 1);
}
@@ -1171,7 +1211,7 @@ mod tests {
CLOUD_REQUIREMENTS_TIMEOUT,
);
assert!(service.fetch().await.is_none());
assert_eq!(service.fetch().await, Ok(None));
assert_eq!(fetcher.request_count.load(Ordering::SeqCst), 1);
}
@@ -1196,7 +1236,14 @@ mod tests {
tokio::time::advance(Duration::from_secs(5)).await;
tokio::task::yield_now().await;
assert!(handle.await.expect("cloud requirements task").is_none());
let err = handle
.await
.expect("cloud requirements task")
.expect_err("cloud requirements retry exhaustion should fail closed");
assert_eq!(
err.to_string(),
"failed to load your workspace-managed config"
);
assert_eq!(
fetcher.request_count.load(Ordering::SeqCst),
CLOUD_REQUIREMENTS_MAX_ATTEMPTS
@@ -1221,7 +1268,7 @@ mod tests {
assert_eq!(
service.fetch().await,
Some(ConfigRequirementsToml {
Ok(Some(ConfigRequirementsToml {
allowed_approval_policies: Some(vec![AskForApproval::Never]),
allowed_sandbox_modes: None,
allowed_web_search_modes: None,
@@ -1229,10 +1276,10 @@ mod tests {
rules: None,
enforce_residency: None,
network: None,
})
}))
);
service.refresh_cache().await;
assert!(service.refresh_cache().await);
let path = codex_home.path().join(CLOUD_REQUIREMENTS_CACHE_FILENAME);
let cache_file: CloudRequirementsCacheFile =