fix(core) Filter non-matching prefix rules (#12314)

## Summary
`gpt-5.3-codex` really likes to write complicated shell scripts, and
suggest a partial prefix_rule that wouldn't actually approve the
command. We should only show the `prefix_rule` suggestion from the model
if it would actually fully approve the command the user is seeing.

This will technically cause more instances of overly-specific
suggestions when we fallback, but I think the UX is clearer,
particularly when the model doesn't necessarily understand the current
limitations of execpolicy parsing.

## Testing
 - [x] Add unit tests
 - [x] Add integration tests
This commit is contained in:
Dylan Hurd
2026-02-20 22:02:35 -08:00
committed by GitHub
parent 1779feb6a7
commit a8b4b569fb
2 changed files with 278 additions and 13 deletions

View File

@@ -184,6 +184,16 @@ fn shell_event(
command: &str,
timeout_ms: u64,
sandbox_permissions: SandboxPermissions,
) -> Result<Value> {
shell_event_with_prefix_rule(call_id, command, timeout_ms, sandbox_permissions, None)
}
fn shell_event_with_prefix_rule(
call_id: &str,
command: &str,
timeout_ms: u64,
sandbox_permissions: SandboxPermissions,
prefix_rule: Option<Vec<String>>,
) -> Result<Value> {
let mut args = json!({
"command": command,
@@ -192,6 +202,9 @@ fn shell_event(
if sandbox_permissions.requires_escalated_permissions() {
args["sandbox_permissions"] = json!(sandbox_permissions);
}
if let Some(prefix_rule) = prefix_rule {
args["prefix_rule"] = json!(prefix_rule);
}
let args_str = serde_json::to_string(&args)?;
Ok(ev_function_call(call_id, "shell_command", &args_str))
}
@@ -1928,6 +1941,171 @@ async fn approving_execpolicy_amendment_persists_policy_and_skips_future_prompts
Ok(())
}
#[tokio::test(flavor = "current_thread")]
#[cfg(unix)]
async fn invalid_requested_prefix_rule_falls_back_for_compound_command() -> Result<()> {
let server = start_mock_server().await;
let approval_policy = AskForApproval::OnRequest;
let sandbox_policy = SandboxPolicy::new_read_only_policy();
let sandbox_policy_for_config = sandbox_policy.clone();
let mut builder = test_codex().with_config(move |config| {
config.permissions.approval_policy = Constrained::allow_any(approval_policy);
config.permissions.sandbox_policy = Constrained::allow_any(sandbox_policy_for_config);
});
let test = builder.build(&server).await?;
let call_id = "invalid-prefix-rule";
let command =
"touch /tmp/codex-fallback-rule-test.txt && echo hello > /tmp/codex-fallback-rule-test.txt";
let event = shell_event_with_prefix_rule(
call_id,
command,
1_000,
SandboxPermissions::RequireEscalated,
Some(vec!["touch".to_string()]),
)?;
let _ = mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-invalid-prefix-1"),
event,
ev_completed("resp-invalid-prefix-1"),
]),
)
.await;
submit_turn(
&test,
"invalid-prefix-rule",
approval_policy,
sandbox_policy.clone(),
)
.await?;
let approval = expect_exec_approval(&test, command).await;
let amendment = approval
.proposed_execpolicy_amendment
.expect("should have a proposed execpolicy amendment");
assert!(amendment.command.contains(&command.to_string()));
Ok(())
}
#[tokio::test(flavor = "current_thread")]
#[cfg(unix)]
async fn approving_fallback_rule_for_compound_command_works() -> Result<()> {
let server = start_mock_server().await;
let approval_policy = AskForApproval::OnRequest;
let sandbox_policy = SandboxPolicy::new_read_only_policy();
let sandbox_policy_for_config = sandbox_policy.clone();
let mut builder = test_codex().with_config(move |config| {
config.permissions.approval_policy = Constrained::allow_any(approval_policy);
config.permissions.sandbox_policy = Constrained::allow_any(sandbox_policy_for_config);
});
let test = builder.build(&server).await?;
let call_id = "invalid-prefix-rule";
let command =
"touch /tmp/codex-fallback-rule-test.txt && echo hello > /tmp/codex-fallback-rule-test.txt";
let event = shell_event_with_prefix_rule(
call_id,
command,
1_000,
SandboxPermissions::RequireEscalated,
Some(vec!["touch".to_string()]),
)?;
let _ = mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-invalid-prefix-1"),
event,
ev_completed("resp-invalid-prefix-1"),
]),
)
.await;
submit_turn(
&test,
"invalid-prefix-rule",
approval_policy,
sandbox_policy.clone(),
)
.await?;
let approval = expect_exec_approval(&test, command).await;
let approval_id = approval.effective_approval_id();
let amendment = approval
.proposed_execpolicy_amendment
.expect("should have a proposed execpolicy amendment");
assert!(amendment.command.contains(&command.to_string()));
test.codex
.submit(Op::ExecApproval {
id: approval_id,
turn_id: None,
decision: ReviewDecision::ApprovedExecpolicyAmendment {
proposed_execpolicy_amendment: amendment.clone(),
},
})
.await?;
wait_for_completion(&test).await;
let call_id = "invalid-prefix-rule-again";
let command =
"touch /tmp/codex-fallback-rule-test.txt && echo hello > /tmp/codex-fallback-rule-test.txt";
let event = shell_event_with_prefix_rule(
call_id,
command,
1_000,
SandboxPermissions::RequireEscalated,
Some(vec!["touch".to_string()]),
)?;
let _ = mount_sse_once(
&server,
sse(vec![
ev_response_created("resp-invalid-prefix-1"),
event,
ev_completed("resp-invalid-prefix-1"),
]),
)
.await;
let second_results = mount_sse_once(
&server,
sse(vec![
ev_assistant_message("msg-invalid-prefix-1", "done"),
ev_completed("resp-invalid-prefix-2"),
]),
)
.await;
submit_turn(
&test,
"invalid-prefix-rule",
approval_policy,
sandbox_policy.clone(),
)
.await?;
wait_for_completion_without_approval(&test).await;
let second_output = parse_result(
&second_results
.single_request()
.function_call_output(call_id),
);
assert_eq!(second_output.exit_code.unwrap_or(0), 0);
assert!(
second_output.stdout.is_empty(),
"unexpected stdout: {}",
second_output.stdout
);
Ok(())
}
// todo(dylan) add ScenarioSpec support for rules
#[tokio::test(flavor = "current_thread")]
#[cfg(unix)]