Add guardian approval MVP (#13692)

## Summary
- add the guardian reviewer flow for `on-request` approvals in command,
patch, sandbox-retry, and managed-network approval paths
- keep guardian behind `features.guardian_approval` instead of exposing
a public `approval_policy = guardian` mode
- route ordinary `OnRequest` approvals to the guardian subagent when the
feature is enabled, without changing the public approval-mode surface

## Public model
- public approval modes stay unchanged
- guardian is enabled via `features.guardian_approval`
- when that feature is on, `approval_policy = on-request` keeps the same
approval boundaries but sends those approval requests to the guardian
reviewer instead of the user
- `/experimental` only persists the feature flag; it does not rewrite
`approval_policy`
- CLI and app-server no longer expose a separate `guardian` approval
mode in this PR

## Guardian reviewer
- the reviewer runs as a normal subagent and reuses the existing
subagent/thread machinery
- it is locked to a read-only sandbox and `approval_policy = never`
- it does not inherit user/project exec-policy rules
- it prefers `gpt-5.4` when the current provider exposes it, otherwise
falls back to the parent turn's active model
- it fails closed on timeout, startup failure, malformed output, or any
other review error
- it currently auto-approves only when `risk_score < 80`

## Review context and policy
- guardian mirrors `OnRequest` approval semantics rather than
introducing a separate approval policy
- explicit `require_escalated` requests follow the same approval surface
as `OnRequest`; the difference is only who reviews them
- managed-network allowlist misses that enter the approval flow are also
reviewed by guardian
- the review prompt includes bounded recent transcript history plus
recent tool call/result evidence
- transcript entries and planned-action strings are truncated with
explicit `<guardian_truncated ... />` markers so large payloads stay
bounded
- apply-patch reviews include the full patch content (without
duplicating the structured `changes` payload)
- the guardian request layout is snapshot-tested using the same
model-visible Responses request formatter used elsewhere in core

## Guardian network behavior
- the guardian subagent inherits the parent session's managed-network
allowlist when one exists, so it can use the same approved network
surface while reviewing
- exact session-scoped network approvals are copied into the guardian
session with protocol/port scope preserved
- those copied approvals are now seeded before the guardian's first turn
is submitted, so inherited approvals are available during any immediate
review-time checks

## Out of scope / follow-ups
- the sandbox-permission validation split was pulled into a separate PR
and is not part of this diff
- a future follow-up can enable `serde_json` preserve-order in
`codex-core` and then simplify the guardian action rendering further

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Charley Cunningham
2026-03-07 05:40:10 -08:00
committed by GitHub
parent cf143bf71e
commit e84ee33cc0
34 changed files with 2477 additions and 139 deletions

View File

@@ -404,6 +404,8 @@ const APPROVAL_POLICY_ON_REQUEST_RULE: &str =
include_str!("prompts/permissions/approval_policy/on_request_rule.md");
const APPROVAL_POLICY_ON_REQUEST_RULE_REQUEST_PERMISSION: &str =
include_str!("prompts/permissions/approval_policy/on_request_rule_request_permission.md");
// Prompt text embedded at compile time from `guardian.md`. It is appended
// (after a blank line) to the `OnRequest` approval instructions when the
// guardian approval flag is enabled.
const GUARDIAN_APPROVAL_FEATURE: &str =
include_str!("prompts/permissions/approval_policy/guardian.md");
const SANDBOX_MODE_DANGER_FULL_ACCESS: &str =
include_str!("prompts/permissions/sandbox_mode/danger_full_access.md");
@@ -421,6 +423,7 @@ impl DeveloperInstructions {
pub fn from(
approval_policy: AskForApproval,
guardian_approval_enabled: bool,
exec_policy: &Policy,
request_permission_enabled: bool,
) -> DeveloperInstructions {
@@ -444,7 +447,14 @@ impl DeveloperInstructions {
AskForApproval::Never => APPROVAL_POLICY_NEVER.to_string(),
AskForApproval::UnlessTrusted => APPROVAL_POLICY_UNLESS_TRUSTED.to_string(),
AskForApproval::OnFailure => APPROVAL_POLICY_ON_FAILURE.to_string(),
AskForApproval::OnRequest => on_request_instructions(),
AskForApproval::OnRequest => {
let mut instructions = on_request_instructions();
if guardian_approval_enabled {
instructions.push_str("\n\n");
instructions.push_str(GUARDIAN_APPROVAL_FEATURE);
}
instructions
}
AskForApproval::Reject(reject_config) => {
let on_request_instructions = on_request_instructions();
let sandbox_approval = reject_config.sandbox_approval;
@@ -507,6 +517,7 @@ impl DeveloperInstructions {
pub fn from_policy(
sandbox_policy: &SandboxPolicy,
approval_policy: AskForApproval,
guardian_approval_enabled: bool,
exec_policy: &Policy,
cwd: &Path,
request_permission_enabled: bool,
@@ -531,6 +542,7 @@ impl DeveloperInstructions {
sandbox_mode,
network_access,
approval_policy,
guardian_approval_enabled,
exec_policy,
writable_roots,
request_permission_enabled,
@@ -555,6 +567,7 @@ impl DeveloperInstructions {
sandbox_mode: SandboxMode,
network_access: NetworkAccess,
approval_policy: AskForApproval,
guardian_approval_enabled: bool,
exec_policy: &Policy,
writable_roots: Option<Vec<WritableRoot>>,
request_permission_enabled: bool,
@@ -568,6 +581,7 @@ impl DeveloperInstructions {
))
.concat(DeveloperInstructions::from(
approval_policy,
guardian_approval_enabled,
exec_policy,
request_permission_enabled,
))
@@ -1625,6 +1639,7 @@ mod tests {
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
false,
&Policy::empty(),
None,
false,
@@ -1654,6 +1669,7 @@ mod tests {
let instructions = DeveloperInstructions::from_policy(
&policy,
AskForApproval::UnlessTrusted,
false,
&Policy::empty(),
&PathBuf::from("/tmp"),
false,
@@ -1676,6 +1692,7 @@ mod tests {
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
false,
&exec_policy,
None,
false,
@@ -1693,6 +1710,7 @@ mod tests {
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
false,
&Policy::empty(),
None,
true,
@@ -1703,6 +1721,23 @@ mod tests {
assert!(text.contains("additional_permissions"));
}
#[test]
fn includes_guardian_feature_guidance_for_on_request_when_enabled() {
    // Build the instructions for `OnRequest` with the boolean flag that
    // follows the approval policy set to `true` (presumably
    // `guardian_approval_enabled` -- confirm against the constructor
    // signature) and render them straight to text.
    let rendered = DeveloperInstructions::from_permissions_with_network(
        SandboxMode::WorkspaceWrite,
        NetworkAccess::Enabled,
        AskForApproval::OnRequest,
        true,
        &Policy::empty(),
        None,
        false,
    )
    .into_text();

    // Both guardian-specific phrases must appear in the rendered prompt.
    for expected_phrase in ["guardian subagent", "approval prompts"] {
        assert!(rendered.contains(expected_phrase));
    }
}
#[test]
fn render_command_prefix_list_sorts_by_len_then_total_len_then_alphabetical() {
let prefixes = vec![