Add guardian approval MVP (#13692)

## Summary
- add the guardian reviewer flow for `on-request` approvals in command,
patch, sandbox-retry, and managed-network approval paths
- keep guardian behind `features.guardian_approval` instead of exposing
a public `approval_policy = guardian` mode
- route ordinary `OnRequest` approvals to the guardian subagent when the
feature is enabled, without changing the public approval-mode surface

## Public model
- public approval modes stay unchanged
- guardian is enabled via `features.guardian_approval`
- when that feature is on, `approval_policy = on-request` keeps the same
approval boundaries but sends those approval requests to the guardian
reviewer instead of the user
- `/experimental` only persists the feature flag; it does not rewrite
`approval_policy`
- CLI and app-server no longer expose a separate `guardian` approval
mode in this PR

## Guardian reviewer
- the reviewer runs as a normal subagent and reuses the existing
subagent/thread machinery
- it is locked to a read-only sandbox and `approval_policy = never`
- it does not inherit user/project exec-policy rules
- it prefers `gpt-5.4` when the current provider exposes it, otherwise
falls back to the parent turn's active model
- it fail-closes on timeout, startup failure, malformed output, or any
other review error
- it currently auto-approves only when `risk_score < 80`

## Review context and policy
- guardian mirrors `OnRequest` approval semantics rather than
introducing a separate approval policy
- explicit `require_escalated` requests follow the same approval surface
as `OnRequest`; the difference is only who reviews them
- managed-network allowlist misses that enter the approval flow are also
reviewed by guardian
- the review prompt includes bounded recent transcript history plus
recent tool call/result evidence
- transcript entries and planned-action strings are truncated with
explicit `<guardian_truncated ... />` markers so large payloads stay
bounded
- apply-patch reviews include the full patch content (without
duplicating the structured `changes` payload)
- the guardian request layout is snapshot-tested using the same
model-visible Responses request formatter used elsewhere in core

## Guardian network behavior
- the guardian subagent inherits the parent session's managed-network
allowlist when one exists, so it can use the same approved network
surface while reviewing
- exact session-scoped network approvals are copied into the guardian
session with protocol/port scope preserved
- those copied approvals are now seeded before the guardian's first turn
is submitted, so inherited approvals are available during any immediate
review-time checks

## Out of scope / follow-ups
- the sandbox-permission validation split was pulled into a separate PR
and is not part of this diff
- a future follow-up can enable `serde_json` preserve-order in
`codex-core` and then simplify the guardian action rendering further

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Charley Cunningham
2026-03-07 05:40:10 -08:00
committed by GitHub
parent cf143bf71e
commit e84ee33cc0
34 changed files with 2477 additions and 139 deletions

View File

@@ -0,0 +1,272 @@
use super::*;
use crate::config_loader::ConfigLayerEntry;
use crate::config_loader::ConfigRequirements;
use crate::config_loader::ConfigRequirementsToml;
use crate::exec::ExecParams;
use crate::exec_policy::ExecPolicyManager;
use crate::features::Feature;
use crate::guardian::GUARDIAN_SUBAGENT_NAME;
use crate::protocol::AskForApproval;
use crate::sandboxing::SandboxPermissions;
use crate::turn_diff_tracker::TurnDiffTracker;
use codex_app_server_protocol::ConfigLayerSource;
use codex_execpolicy::Decision;
use codex_execpolicy::Evaluation;
use codex_execpolicy::RuleMatch;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::NetworkPermissions;
use codex_protocol::models::PermissionProfile;
use codex_utils_absolute_path::AbsolutePathBuf;
use pretty_assertions::assert_eq;
use serde::Deserialize;
use std::collections::HashMap;
use std::fs;
use std::sync::Arc;
use tempfile::tempdir;
#[tokio::test]
async fn guardian_allows_shell_additional_permissions_requests_past_policy_validation() {
let (mut session, mut turn_context_raw) = make_session_and_context().await;
turn_context_raw
.approval_policy
.set(AskForApproval::OnRequest)
.expect("test setup should allow updating approval policy");
turn_context_raw
.features
.enable(Feature::GuardianApproval)
.expect("test setup should allow enabling guardian approvals");
session
.features
.enable(Feature::RequestPermissions)
.expect("test setup should allow enabling request permissions");
turn_context_raw
.sandbox_policy
.set(SandboxPolicy::DangerFullAccess)
.expect("test setup should allow updating sandbox policy");
let session = Arc::new(session);
let turn_context = Arc::new(turn_context_raw);
let params = ExecParams {
command: if cfg!(windows) {
vec![
"cmd.exe".to_string(),
"/C".to_string(),
"echo hi".to_string(),
]
} else {
vec![
"/bin/sh".to_string(),
"-c".to_string(),
"echo hi".to_string(),
]
},
cwd: turn_context.cwd.clone(),
expiration: 1000.into(),
env: HashMap::new(),
network: None,
sandbox_permissions: SandboxPermissions::WithAdditionalPermissions,
windows_sandbox_level: turn_context.windows_sandbox_level,
justification: Some("test".to_string()),
arg0: None,
};
let handler = ShellHandler;
let resp = handler
.handle(ToolInvocation {
session: Arc::clone(&session),
turn: Arc::clone(&turn_context),
tracker: Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new())),
call_id: "test-call".to_string(),
tool_name: "shell".to_string(),
payload: ToolPayload::Function {
arguments: serde_json::json!({
"command": params.command.clone(),
"workdir": Some(turn_context.cwd.to_string_lossy().to_string()),
"timeout_ms": params.expiration.timeout_ms(),
"sandbox_permissions": params.sandbox_permissions,
"additional_permissions": PermissionProfile {
network: Some(NetworkPermissions {
enabled: Some(true),
}),
file_system: None,
macos: None,
},
"justification": params.justification.clone(),
})
.to_string(),
},
})
.await;
let output = match resp.expect("expected Ok result") {
ToolOutput::Function {
body: FunctionCallOutputBody::Text(content),
..
} => content,
_ => panic!("unexpected tool output"),
};
#[derive(Deserialize, PartialEq, Eq, Debug)]
struct ResponseExecMetadata {
exit_code: i32,
}
#[derive(Deserialize)]
struct ResponseExecOutput {
output: String,
metadata: ResponseExecMetadata,
}
let exec_output: ResponseExecOutput =
serde_json::from_str(&output).expect("valid exec output json");
assert_eq!(exec_output.metadata, ResponseExecMetadata { exit_code: 0 });
assert!(exec_output.output.contains("hi"));
}
#[tokio::test]
async fn guardian_allows_unified_exec_additional_permissions_requests_past_policy_validation() {
let (mut session, mut turn_context_raw) = make_session_and_context().await;
turn_context_raw
.approval_policy
.set(AskForApproval::OnRequest)
.expect("test setup should allow updating approval policy");
turn_context_raw
.features
.enable(Feature::GuardianApproval)
.expect("test setup should allow enabling guardian approvals");
session
.features
.enable(Feature::RequestPermissions)
.expect("test setup should allow enabling request permissions");
let session = Arc::new(session);
let turn_context = Arc::new(turn_context_raw);
let tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new()));
let handler = UnifiedExecHandler;
let resp = handler
.handle(ToolInvocation {
session: Arc::clone(&session),
turn: Arc::clone(&turn_context),
tracker: Arc::clone(&tracker),
call_id: "exec-call".to_string(),
tool_name: "exec_command".to_string(),
payload: ToolPayload::Function {
arguments: serde_json::json!({
"cmd": "echo hi",
"sandbox_permissions": SandboxPermissions::WithAdditionalPermissions,
"justification": "need additional sandbox permissions",
})
.to_string(),
},
})
.await;
let Err(FunctionCallError::RespondToModel(output)) = resp else {
panic!("expected validation error result");
};
assert_eq!(
output,
"missing `additional_permissions`; provide at least one of `network`, `file_system`, or `macos` when using `with_additional_permissions`"
);
}
#[tokio::test]
async fn guardian_subagent_does_not_inherit_parent_exec_policy_rules() {
let codex_home = tempdir().expect("create codex home");
let project_dir = tempdir().expect("create project dir");
let rules_dir = project_dir.path().join("rules");
fs::create_dir_all(&rules_dir).expect("create rules dir");
fs::write(
rules_dir.join("deny.rules"),
r#"prefix_rule(pattern=["rm"], decision="forbidden")"#,
)
.expect("write policy file");
let mut config = build_test_config(codex_home.path()).await;
config.cwd = project_dir.path().to_path_buf();
config.config_layer_stack = ConfigLayerStack::new(
vec![ConfigLayerEntry::new(
ConfigLayerSource::Project {
dot_codex_folder: AbsolutePathBuf::from_absolute_path(project_dir.path())
.expect("absolute project path"),
},
toml::Value::Table(Default::default()),
)],
ConfigRequirements::default(),
ConfigRequirementsToml::default(),
)
.expect("config layer stack");
let command = [vec!["rm".to_string()]];
let parent_exec_policy = ExecPolicyManager::load(&config.config_layer_stack)
.await
.expect("load parent exec policy");
assert_eq!(
parent_exec_policy
.current()
.check_multiple(command.iter(), &|_| Decision::Allow),
Evaluation {
decision: Decision::Forbidden,
matched_rules: vec![RuleMatch::PrefixRuleMatch {
matched_prefix: vec!["rm".to_string()],
decision: Decision::Forbidden,
resolved_program: None,
justification: None,
}],
}
);
let auth_manager = AuthManager::from_auth_for_testing(CodexAuth::from_api_key("Test API Key"));
let models_manager = Arc::new(ModelsManager::new(
config.codex_home.clone(),
auth_manager.clone(),
None,
CollaborationModesConfig::default(),
));
let plugins_manager = Arc::new(PluginsManager::new(config.codex_home.clone()));
let skills_manager = Arc::new(SkillsManager::new(
config.codex_home.clone(),
Arc::clone(&plugins_manager),
));
let mcp_manager = Arc::new(McpManager::new(Arc::clone(&plugins_manager)));
let file_watcher = Arc::new(FileWatcher::noop());
let CodexSpawnOk { codex, .. } = Codex::spawn(
config,
auth_manager,
models_manager,
skills_manager,
plugins_manager,
mcp_manager,
file_watcher,
InitialHistory::New,
SessionSource::SubAgent(SubAgentSource::Other(GUARDIAN_SUBAGENT_NAME.to_string())),
AgentControl::default(),
Vec::new(),
false,
None,
None,
)
.await
.expect("spawn guardian subagent");
assert_eq!(
codex
.session
.services
.exec_policy
.current()
.check_multiple(command.iter(), &|_| Decision::Allow),
Evaluation {
decision: Decision::Allow,
matched_rules: vec![RuleMatch::HeuristicsRuleMatch {
command: vec!["rm".to_string()],
decision: Decision::Allow,
}],
}
);
drop(codex);
}