fix: preserve split filesystem semantics in linux sandbox (#14173)

## Stack

   fix: fail closed for unsupported split windows sandboxing #14172
-> fix: preserve split filesystem semantics in linux sandbox #14173
   fix: align core approvals with split sandbox policies #14171
   refactor: centralize filesystem permissions precedence #14174

## Summary
## Summary
- Preserve Linux split filesystem carveouts in bubblewrap by applying
mount masks in the right order, so narrower rules still win under
broader writable roots.
- Preserve unreadable ancestors of writable roots by masking them first
and then rebinding the narrower writable descendants.
- Stop rejecting legacy-plus-split Linux configs that are
sandbox-equivalent after `cwd` resolution by comparing semantics instead
of raw legacy structs.
- Fail closed when callers provide partial split policies, mismatched
legacy-plus-split policies, or force `--use-legacy-landlock` for
split-only shapes that legacy Landlock cannot enforce.
- Add Linux regressions for overlapping writable, read-only, and denied
paths, and document the supported split-policy enforcement path.

## Example
Given a split filesystem policy like:

```toml
[permissions.dev.filesystem]
":root" = "read"
"/code" = "write"
"/code/.git" = "read"
"/code/secrets" = "none"
"/code/secrets/tmp" = "write"
```

this PR makes Linux enforce the intended result under bubblewrap:

- `/code` stays writable
- `/code/.git` stays read-only
- `/code/secrets` stays denied
- `/code/secrets/tmp` can still be reopened as writable if explicitly
allowed

Before this, Linux could lose one of those carveouts depending on mount
order or legacy-policy fallback. This PR keeps the split-policy
semantics intact and rejects configurations that legacy Landlock cannot
represent safely.
This commit is contained in:
viyatb-oai
2026-03-12 10:56:32 -07:00
committed by GitHub
parent 4e99c0f179
commit 774965f1e8
4 changed files with 745 additions and 110 deletions

View File

@@ -10,6 +10,7 @@
//! - seccomp + `PR_SET_NO_NEW_PRIVS` applied in-process, and
//! - bubblewrap used to construct the filesystem view before exec.
use std::collections::BTreeSet;
use std::collections::HashSet;
use std::fs::File;
use std::os::fd::AsRawFd;
use std::path::Path;
@@ -182,12 +183,14 @@ fn create_bwrap_flags(
/// `--tmpfs /` and layer scoped `--ro-bind` mounts.
/// 2. `--dev /dev` mounts a minimal writable `/dev` with standard device nodes
/// (including `/dev/urandom`) even under a read-only root.
/// 3. `--bind <root> <root>` re-enables writes for allowed roots, including
/// 3. Unreadable ancestors of writable roots are masked first so narrower
/// writable descendants can be rebound afterward.
/// 4. `--bind <root> <root>` re-enables writes for allowed roots, including
/// writable subpaths under `/dev` (for example, `/dev/shm`).
/// 4. `--ro-bind <subpath> <subpath>` re-applies read-only protections under
/// 5. `--ro-bind <subpath> <subpath>` re-applies read-only protections under
/// those writable roots so protected subpaths win.
/// 5. Explicit unreadable roots are masked last so deny carveouts still win
/// even when the readable baseline includes `/`.
/// 6. Remaining explicit unreadable roots are masked last so deny carveouts
/// still win even when the readable baseline includes `/`.
fn create_filesystem_args(
file_system_sandbox_policy: &FileSystemSandboxPolicy,
cwd: &Path,
@@ -258,81 +261,98 @@ fn create_filesystem_args(
args
};
let mut preserved_files = Vec::new();
for writable_root in &writable_roots {
let root = writable_root.root.as_path();
args.push("--bind".to_string());
args.push(path_to_string(root));
args.push(path_to_string(root));
}
// Re-apply read-only subpaths after the writable binds so they win.
let allowed_write_paths: Vec<PathBuf> = writable_roots
.iter()
.map(|writable_root| writable_root.root.as_path().to_path_buf())
.collect();
for subpath in collect_read_only_subpaths(&writable_roots) {
if let Some(symlink_path) = find_symlink_in_path(&subpath, &allowed_write_paths) {
args.push("--ro-bind".to_string());
args.push("/dev/null".to_string());
args.push(path_to_string(&symlink_path));
continue;
let unreadable_paths: HashSet<PathBuf> = unreadable_roots
.iter()
.map(|path| path.as_path().to_path_buf())
.collect();
let mut sorted_writable_roots = writable_roots;
sorted_writable_roots.sort_by_key(|writable_root| path_depth(writable_root.root.as_path()));
let mut unreadable_ancestors_of_writable_roots: Vec<PathBuf> = unreadable_roots
.iter()
.filter(|path| {
let unreadable_root = path.as_path();
!allowed_write_paths
.iter()
.any(|root| unreadable_root.starts_with(root))
&& allowed_write_paths
.iter()
.any(|root| root.starts_with(unreadable_root))
})
.map(|path| path.as_path().to_path_buf())
.collect();
unreadable_ancestors_of_writable_roots.sort_by_key(|path| path_depth(path));
for unreadable_root in &unreadable_ancestors_of_writable_roots {
append_unreadable_root_args(
&mut args,
&mut preserved_files,
unreadable_root,
&allowed_write_paths,
)?;
}
for writable_root in &sorted_writable_roots {
let root = writable_root.root.as_path();
if let Some(masking_root) = unreadable_ancestors_of_writable_roots
.iter()
.filter(|unreadable_root| root.starts_with(unreadable_root))
.max_by_key(|unreadable_root| path_depth(unreadable_root))
{
append_mount_target_parent_dir_args(&mut args, root, masking_root);
}
args.push("--bind".to_string());
args.push(path_to_string(root));
args.push(path_to_string(root));
let mut read_only_subpaths: Vec<PathBuf> = writable_root
.read_only_subpaths
.iter()
.map(|path| path.as_path().to_path_buf())
.filter(|path| !unreadable_paths.contains(path))
.collect();
read_only_subpaths.sort_by_key(|path| path_depth(path));
for subpath in read_only_subpaths {
append_read_only_subpath_args(&mut args, &subpath, &allowed_write_paths);
}
if !subpath.exists() {
// Keep this in the per-subpath loop: each protected subpath can have
// a different first missing component that must be blocked
// independently (for example, `/repo/.git` vs `/repo/.codex`).
if let Some(first_missing_component) = find_first_non_existent_component(&subpath)
&& is_within_allowed_write_paths(&first_missing_component, &allowed_write_paths)
{
args.push("--ro-bind".to_string());
args.push("/dev/null".to_string());
args.push(path_to_string(&first_missing_component));
}
continue;
}
if is_within_allowed_write_paths(&subpath, &allowed_write_paths) {
args.push("--ro-bind".to_string());
args.push(path_to_string(&subpath));
args.push(path_to_string(&subpath));
let mut nested_unreadable_roots: Vec<PathBuf> = unreadable_roots
.iter()
.filter(|path| path.as_path().starts_with(root))
.map(|path| path.as_path().to_path_buf())
.collect();
nested_unreadable_roots.sort_by_key(|path| path_depth(path));
for unreadable_root in nested_unreadable_roots {
append_unreadable_root_args(
&mut args,
&mut preserved_files,
&unreadable_root,
&allowed_write_paths,
)?;
}
}
if !unreadable_roots.is_empty() {
// Apply explicit deny carveouts after all readable and writable mounts
// so they win even when the broader baseline includes `/` or a writable
// parent path.
let null_file = File::open("/dev/null")?;
let null_fd = null_file.as_raw_fd().to_string();
for unreadable_root in unreadable_roots {
let unreadable_root = unreadable_root.as_path();
if unreadable_root.is_dir() {
// Bubblewrap cannot bind `/dev/null` over a directory, so mask
// denied directories by overmounting them with an empty tmpfs
// and then remounting that tmpfs read-only.
args.push("--perms".to_string());
args.push("000".to_string());
args.push("--tmpfs".to_string());
args.push(path_to_string(unreadable_root));
args.push("--remount-ro".to_string());
args.push(path_to_string(unreadable_root));
continue;
}
// For files, bind a stable null-file payload over the original path
// so later reads do not expose host contents. `--ro-bind-data`
// expects a live fd number, so keep the backing file open until we
// exec bubblewrap below.
args.push("--perms".to_string());
args.push("000".to_string());
args.push("--ro-bind-data".to_string());
args.push(null_fd.clone());
args.push(path_to_string(unreadable_root));
}
preserved_files.push(null_file);
let mut rootless_unreadable_roots: Vec<PathBuf> = unreadable_roots
.iter()
.filter(|path| {
let unreadable_root = path.as_path();
!allowed_write_paths
.iter()
.any(|root| unreadable_root.starts_with(root) || root.starts_with(unreadable_root))
})
.map(|path| path.as_path().to_path_buf())
.collect();
rootless_unreadable_roots.sort_by_key(|path| path_depth(path));
for unreadable_root in rootless_unreadable_roots {
append_unreadable_root_args(
&mut args,
&mut preserved_files,
&unreadable_root,
&allowed_write_paths,
)?;
}
Ok(BwrapArgs {
@@ -341,17 +361,6 @@ fn create_filesystem_args(
})
}
/// Collect unique read-only subpaths across all writable roots.
fn collect_read_only_subpaths(writable_roots: &[WritableRoot]) -> Vec<PathBuf> {
let mut subpaths: BTreeSet<PathBuf> = BTreeSet::new();
for writable_root in writable_roots {
for subpath in &writable_root.read_only_subpaths {
subpaths.insert(subpath.as_path().to_path_buf());
}
}
subpaths.into_iter().collect()
}
/// Validate that writable roots exist before constructing mounts.
///
/// Bubblewrap requires bind mount targets to exist. We fail fast with a clear
@@ -373,6 +382,107 @@ fn path_to_string(path: &Path) -> String {
path.to_string_lossy().to_string()
}
fn path_depth(path: &Path) -> usize {
path.components().count()
}
fn append_mount_target_parent_dir_args(args: &mut Vec<String>, mount_target: &Path, anchor: &Path) {
let mount_target_dir = if mount_target.is_dir() {
mount_target
} else {
match mount_target.parent() {
Some(parent) => parent,
None => return,
}
};
let mut mount_target_dirs: Vec<PathBuf> = mount_target_dir
.ancestors()
.take_while(|path| *path != anchor)
.map(Path::to_path_buf)
.collect();
mount_target_dirs.reverse();
for mount_target_dir in mount_target_dirs {
args.push("--dir".to_string());
args.push(path_to_string(&mount_target_dir));
}
}
fn append_read_only_subpath_args(
args: &mut Vec<String>,
subpath: &Path,
allowed_write_paths: &[PathBuf],
) {
if let Some(symlink_path) = find_symlink_in_path(subpath, allowed_write_paths) {
args.push("--ro-bind".to_string());
args.push("/dev/null".to_string());
args.push(path_to_string(&symlink_path));
return;
}
if !subpath.exists() {
if let Some(first_missing_component) = find_first_non_existent_component(subpath)
&& is_within_allowed_write_paths(&first_missing_component, allowed_write_paths)
{
args.push("--ro-bind".to_string());
args.push("/dev/null".to_string());
args.push(path_to_string(&first_missing_component));
}
return;
}
if is_within_allowed_write_paths(subpath, allowed_write_paths) {
args.push("--ro-bind".to_string());
args.push(path_to_string(subpath));
args.push(path_to_string(subpath));
}
}
fn append_unreadable_root_args(
args: &mut Vec<String>,
preserved_files: &mut Vec<File>,
unreadable_root: &Path,
allowed_write_paths: &[PathBuf],
) -> Result<()> {
if let Some(symlink_path) = find_symlink_in_path(unreadable_root, allowed_write_paths) {
args.push("--ro-bind".to_string());
args.push("/dev/null".to_string());
args.push(path_to_string(&symlink_path));
return Ok(());
}
if !unreadable_root.exists() {
if let Some(first_missing_component) = find_first_non_existent_component(unreadable_root)
&& is_within_allowed_write_paths(&first_missing_component, allowed_write_paths)
{
args.push("--ro-bind".to_string());
args.push("/dev/null".to_string());
args.push(path_to_string(&first_missing_component));
}
return Ok(());
}
if unreadable_root.is_dir() {
args.push("--perms".to_string());
args.push("000".to_string());
args.push("--tmpfs".to_string());
args.push(path_to_string(unreadable_root));
args.push("--remount-ro".to_string());
args.push(path_to_string(unreadable_root));
return Ok(());
}
if preserved_files.is_empty() {
preserved_files.push(File::open("/dev/null")?);
}
let null_fd = preserved_files[0].as_raw_fd().to_string();
args.push("--perms".to_string());
args.push("000".to_string());
args.push("--ro-bind-data".to_string());
args.push(null_fd);
args.push(path_to_string(unreadable_root));
Ok(())
}
/// Returns true when `path` is under any allowed writable root.
fn is_within_allowed_write_paths(path: &Path, allowed_write_paths: &[PathBuf]) -> bool {
allowed_write_paths
@@ -647,10 +757,227 @@ mod tests {
writable_root_str.as_str(),
]
}));
assert!(
args.args.windows(3).any(|window| {
window == ["--ro-bind", blocked_str.as_str(), blocked_str.as_str()]
let blocked_mask_index = args
.args
.windows(6)
.position(|window| {
window
== [
"--perms",
"000",
"--tmpfs",
blocked_str.as_str(),
"--remount-ro",
blocked_str.as_str(),
]
})
.expect("blocked directory should be remounted unreadable");
let writable_root_bind_index = args
.args
.windows(3)
.position(|window| {
window
== [
"--bind",
writable_root_str.as_str(),
writable_root_str.as_str(),
]
})
.expect("writable root should be rebound writable");
assert!(
writable_root_bind_index < blocked_mask_index,
"expected unreadable carveout to be re-applied after writable bind: {:#?}",
args.args
);
}
#[test]
fn split_policy_reenables_nested_writable_subpaths_after_read_only_parent() {
let temp_dir = TempDir::new().expect("temp dir");
let writable_root = temp_dir.path().join("workspace");
let docs = writable_root.join("docs");
let docs_public = docs.join("public");
std::fs::create_dir_all(&docs_public).expect("create docs/public");
let writable_root =
AbsolutePathBuf::from_absolute_path(&writable_root).expect("absolute writable root");
let docs = AbsolutePathBuf::from_absolute_path(&docs).expect("absolute docs");
let docs_public =
AbsolutePathBuf::from_absolute_path(&docs_public).expect("absolute docs/public");
let policy = FileSystemSandboxPolicy::restricted(vec![
FileSystemSandboxEntry {
path: FileSystemPath::Path {
path: writable_root,
},
access: FileSystemAccessMode::Write,
},
FileSystemSandboxEntry {
path: FileSystemPath::Path { path: docs.clone() },
access: FileSystemAccessMode::Read,
},
FileSystemSandboxEntry {
path: FileSystemPath::Path {
path: docs_public.clone(),
},
access: FileSystemAccessMode::Write,
},
]);
let args = create_filesystem_args(&policy, temp_dir.path()).expect("filesystem args");
let docs_str = path_to_string(docs.as_path());
let docs_public_str = path_to_string(docs_public.as_path());
let docs_ro_index = args
.args
.windows(3)
.position(|window| window == ["--ro-bind", docs_str.as_str(), docs_str.as_str()])
.expect("docs should be remounted read-only");
let docs_public_rw_index = args
.args
.windows(3)
.position(|window| {
window == ["--bind", docs_public_str.as_str(), docs_public_str.as_str()]
})
.expect("docs/public should be rebound writable");
assert!(
docs_ro_index < docs_public_rw_index,
"expected read-only parent remount before nested writable bind: {:#?}",
args.args
);
}
#[test]
fn split_policy_reenables_writable_subpaths_after_unreadable_parent() {
let temp_dir = TempDir::new().expect("temp dir");
let blocked = temp_dir.path().join("blocked");
let allowed = blocked.join("allowed");
std::fs::create_dir_all(&allowed).expect("create blocked/allowed");
let blocked = AbsolutePathBuf::from_absolute_path(&blocked).expect("absolute blocked");
let allowed = AbsolutePathBuf::from_absolute_path(&allowed).expect("absolute allowed");
let policy = FileSystemSandboxPolicy::restricted(vec![
FileSystemSandboxEntry {
path: FileSystemPath::Special {
value: FileSystemSpecialPath::Root,
},
access: FileSystemAccessMode::Read,
},
FileSystemSandboxEntry {
path: FileSystemPath::Path {
path: blocked.clone(),
},
access: FileSystemAccessMode::None,
},
FileSystemSandboxEntry {
path: FileSystemPath::Path {
path: allowed.clone(),
},
access: FileSystemAccessMode::Write,
},
]);
let args = create_filesystem_args(&policy, temp_dir.path()).expect("filesystem args");
let blocked_str = path_to_string(blocked.as_path());
let allowed_str = path_to_string(allowed.as_path());
let blocked_none_index = args
.args
.windows(4)
.position(|window| window == ["--perms", "000", "--tmpfs", blocked_str.as_str()])
.expect("blocked should be masked first");
let allowed_dir_index = args
.args
.windows(2)
.position(|window| window == ["--dir", allowed_str.as_str()])
.expect("allowed mount target should be recreated");
let allowed_bind_index = args
.args
.windows(3)
.position(|window| window == ["--bind", allowed_str.as_str(), allowed_str.as_str()])
.expect("allowed path should be rebound writable");
assert!(
blocked_none_index < allowed_dir_index && allowed_dir_index < allowed_bind_index,
"expected unreadable parent mask before recreating and rebinding writable child: {:#?}",
args.args
);
}
#[test]
fn split_policy_reenables_writable_files_after_unreadable_parent() {
let temp_dir = TempDir::new().expect("temp dir");
let blocked = temp_dir.path().join("blocked");
let allowed_dir = blocked.join("allowed");
let allowed_file = allowed_dir.join("note.txt");
std::fs::create_dir_all(&allowed_dir).expect("create blocked/allowed");
std::fs::write(&allowed_file, "ok").expect("create note");
let blocked = AbsolutePathBuf::from_absolute_path(&blocked).expect("absolute blocked");
let allowed_dir =
AbsolutePathBuf::from_absolute_path(&allowed_dir).expect("absolute allowed dir");
let allowed_file =
AbsolutePathBuf::from_absolute_path(&allowed_file).expect("absolute allowed file");
let policy = FileSystemSandboxPolicy::restricted(vec![
FileSystemSandboxEntry {
path: FileSystemPath::Special {
value: FileSystemSpecialPath::Root,
},
access: FileSystemAccessMode::Read,
},
FileSystemSandboxEntry {
path: FileSystemPath::Path {
path: blocked.clone(),
},
access: FileSystemAccessMode::None,
},
FileSystemSandboxEntry {
path: FileSystemPath::Path {
path: allowed_file.clone(),
},
access: FileSystemAccessMode::Write,
},
]);
let args = create_filesystem_args(&policy, temp_dir.path()).expect("filesystem args");
let blocked_str = path_to_string(blocked.as_path());
let allowed_dir_str = path_to_string(allowed_dir.as_path());
let allowed_file_str = path_to_string(allowed_file.as_path());
assert!(
args.args
.windows(2)
.any(|window| window == ["--dir", allowed_dir_str.as_str()]),
"expected ancestor directory to be recreated: {:#?}",
args.args
);
assert!(
!args
.args
.windows(2)
.any(|window| window == ["--dir", allowed_file_str.as_str()]),
"writable file target should not be converted into a directory: {:#?}",
args.args
);
let blocked_none_index = args
.args
.windows(4)
.position(|window| window == ["--perms", "000", "--tmpfs", blocked_str.as_str()])
.expect("blocked should be masked first");
let allowed_bind_index = args
.args
.windows(3)
.position(|window| {
window
== [
"--bind",
allowed_file_str.as_str(),
allowed_file_str.as_str(),
]
})
.expect("allowed file should be rebound writable");
assert!(
blocked_none_index < allowed_bind_index,
"expected unreadable parent mask before rebinding writable file child: {:#?}",
args.args
);
}