Split Linux metadata runtime modules

This commit is contained in:
Eva Wong
2026-04-29 16:40:41 -07:00
parent f476338f93
commit 3eab33bf0f
10 changed files with 1393 additions and 1324 deletions

View File

@@ -0,0 +1,418 @@
use std::fs::File;
use std::io::Read;
use std::os::fd::FromRawFd;
use std::path::Path;
use std::sync::atomic::AtomicI32;
use std::sync::atomic::Ordering;
use crate::bwrap::BwrapArgs;
use crate::file_system_protected_metadata_cleanup::cleanup_protected_create_targets;
use crate::file_system_protected_metadata_cleanup::cleanup_synthetic_mount_targets;
use crate::file_system_protected_metadata_cleanup::register_protected_create_targets;
use crate::file_system_protected_metadata_cleanup::register_synthetic_mount_targets;
use crate::file_system_protected_metadata_monitor::CreateMonitor;
use crate::launcher::exec_bwrap;
static BWRAP_CHILD_PID: AtomicI32 = AtomicI32::new(0);
static PENDING_FORWARDED_SIGNAL: AtomicI32 = AtomicI32::new(0);
const FORWARDED_SIGNALS: &[libc::c_int] =
&[libc::SIGHUP, libc::SIGINT, libc::SIGQUIT, libc::SIGTERM];
pub(crate) fn run_or_exec_bwrap(bwrap_args: BwrapArgs) -> ! {
let enforcement = &bwrap_args.file_system_permissions_enforcement;
if enforcement.synthetic_mount_targets.is_empty()
&& enforcement.protected_create_targets.is_empty()
{
exec_bwrap(bwrap_args.args, bwrap_args.preserved_files);
}
run_bwrap_with_file_system_protected_metadata_runtime(bwrap_args);
}
fn run_bwrap_with_file_system_protected_metadata_runtime(bwrap_args: BwrapArgs) -> ! {
let BwrapArgs {
args,
preserved_files,
file_system_permissions_enforcement,
} = bwrap_args;
let synthetic_mount_targets = file_system_permissions_enforcement.synthetic_mount_targets;
let protected_create_targets = file_system_permissions_enforcement.protected_create_targets;
let setup_signal_mask = ForwardedSignalMask::block();
let synthetic_mount_registrations = register_synthetic_mount_targets(&synthetic_mount_targets);
let protected_create_registrations =
register_protected_create_targets(&protected_create_targets);
let exec_start_pipe = create_exec_start_pipe(!protected_create_targets.is_empty());
let parent_pid = unsafe { libc::getpid() };
let pid = unsafe { libc::fork() };
if pid < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to fork for bubblewrap: {err}");
}
if pid == 0 {
reset_forwarded_signal_handlers_to_default();
setup_signal_mask.restore();
let setpgid_res = unsafe { libc::setpgid(0, 0) };
if setpgid_res < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to place bubblewrap child in its own process group: {err}");
}
terminate_with_parent(parent_pid);
wait_for_parent_exec_start(exec_start_pipe[0], exec_start_pipe[1]);
exec_bwrap(args, preserved_files);
}
close_child_exec_start_read(exec_start_pipe[0]);
let protected_create_monitor = CreateMonitor::start(
&protected_create_targets,
report_file_system_protected_metadata_violation,
);
let signal_forwarders = install_bwrap_signal_forwarders(pid);
release_child_exec_start(exec_start_pipe[1]);
setup_signal_mask.restore();
let status = wait_for_bwrap_child(pid);
let cleanup_signal_mask = ForwardedSignalMask::block();
BWRAP_CHILD_PID.store(0, Ordering::SeqCst);
let protected_create_monitor_violation = protected_create_monitor
.map(CreateMonitor::stop)
.unwrap_or(false);
cleanup_synthetic_mount_targets(&synthetic_mount_registrations);
let protected_create_violation = protected_create_monitor_violation
|| cleanup_protected_create_targets(
&protected_create_registrations,
report_file_system_protected_metadata_violation,
);
signal_forwarders.restore();
cleanup_signal_mask.restore();
exit_with_wait_status_or_policy_violation(status, protected_create_violation);
}
fn create_exec_start_pipe(enabled: bool) -> [libc::c_int; 2] {
if !enabled {
return [-1, -1];
}
let mut pipe = [-1, -1];
if unsafe { libc::pipe2(pipe.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to create bubblewrap exec start pipe: {err}");
}
pipe
}
fn wait_for_parent_exec_start(read_fd: libc::c_int, write_fd: libc::c_int) {
if write_fd >= 0 {
unsafe {
libc::close(write_fd);
}
}
if read_fd < 0 {
return;
}
let mut byte = [0_u8; 1];
loop {
let read = unsafe { libc::read(read_fd, byte.as_mut_ptr().cast(), byte.len()) };
if read >= 0 {
break;
}
let err = std::io::Error::last_os_error();
if err.kind() != std::io::ErrorKind::Interrupted {
break;
}
}
unsafe {
libc::close(read_fd);
}
}
fn close_child_exec_start_read(read_fd: libc::c_int) {
if read_fd >= 0 {
unsafe {
libc::close(read_fd);
}
}
}
fn release_child_exec_start(write_fd: libc::c_int) {
if write_fd < 0 {
return;
}
let byte = [0_u8; 1];
unsafe {
libc::write(write_fd, byte.as_ptr().cast(), byte.len());
libc::close(write_fd);
}
}
struct ForwardedSignalMask {
previous: libc::sigset_t,
}
struct ForwardedSignalHandlers {
previous: Vec<(libc::c_int, libc::sigaction)>,
}
impl ForwardedSignalMask {
fn block() -> Self {
let mut blocked: libc::sigset_t = unsafe { std::mem::zeroed() };
let mut previous: libc::sigset_t = unsafe { std::mem::zeroed() };
unsafe {
libc::sigemptyset(&mut blocked);
for signal in FORWARDED_SIGNALS {
libc::sigaddset(&mut blocked, *signal);
}
if libc::sigprocmask(libc::SIG_BLOCK, &blocked, &mut previous) < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to block bubblewrap forwarded signals: {err}");
}
}
Self { previous }
}
fn restore(&self) {
let mut restored = self.previous;
unsafe {
for signal in FORWARDED_SIGNALS {
libc::sigdelset(&mut restored, *signal);
}
if libc::sigprocmask(libc::SIG_SETMASK, &restored, std::ptr::null_mut()) < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to restore bubblewrap forwarded signals: {err}");
}
}
}
}
fn terminate_with_parent(parent_pid: libc::pid_t) {
let res = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) };
if res < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to set bubblewrap child parent-death signal: {err}");
}
if unsafe { libc::getppid() } != parent_pid {
unsafe {
libc::raise(libc::SIGTERM);
}
}
}
impl ForwardedSignalHandlers {
fn restore(self) {
BWRAP_CHILD_PID.store(0, Ordering::SeqCst);
PENDING_FORWARDED_SIGNAL.store(0, Ordering::SeqCst);
for (signal, previous_action) in self.previous {
unsafe {
if libc::sigaction(signal, &previous_action, std::ptr::null_mut()) < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to restore bubblewrap signal handler for {signal}: {err}");
}
}
}
}
}
fn install_bwrap_signal_forwarders(pid: libc::pid_t) -> ForwardedSignalHandlers {
BWRAP_CHILD_PID.store(pid, Ordering::SeqCst);
let mut previous = Vec::with_capacity(FORWARDED_SIGNALS.len());
for signal in FORWARDED_SIGNALS {
let mut action: libc::sigaction = unsafe { std::mem::zeroed() };
let mut previous_action: libc::sigaction = unsafe { std::mem::zeroed() };
action.sa_sigaction = forward_signal_to_bwrap_child as *const () as libc::sighandler_t;
unsafe {
libc::sigemptyset(&mut action.sa_mask);
if libc::sigaction(*signal, &action, &mut previous_action) < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to install bubblewrap signal forwarder for {signal}: {err}");
}
}
previous.push((*signal, previous_action));
}
replay_pending_forwarded_signal(pid);
ForwardedSignalHandlers { previous }
}
extern "C" fn forward_signal_to_bwrap_child(signal: libc::c_int) {
PENDING_FORWARDED_SIGNAL.store(signal, Ordering::SeqCst);
let pid = BWRAP_CHILD_PID.load(Ordering::SeqCst);
if pid > 0 {
send_signal_to_bwrap_child(pid, signal);
}
}
fn replay_pending_forwarded_signal(pid: libc::pid_t) {
let signal = PENDING_FORWARDED_SIGNAL.swap(0, Ordering::SeqCst);
if signal > 0 {
send_signal_to_bwrap_child(pid, signal);
}
}
fn send_signal_to_bwrap_child(pid: libc::pid_t, signal: libc::c_int) {
unsafe {
libc::kill(-pid, signal);
libc::kill(pid, signal);
}
}
fn reset_forwarded_signal_handlers_to_default() {
for signal in FORWARDED_SIGNALS {
unsafe {
if libc::signal(*signal, libc::SIG_DFL) == libc::SIG_ERR {
let err = std::io::Error::last_os_error();
panic!("failed to reset bubblewrap signal handler for {signal}: {err}");
}
}
}
}
fn wait_for_bwrap_child(pid: libc::pid_t) -> libc::c_int {
loop {
let mut status: libc::c_int = 0;
let wait_res = unsafe { libc::waitpid(pid, &mut status as *mut libc::c_int, 0) };
if wait_res >= 0 {
return status;
}
let err = std::io::Error::last_os_error();
if err.raw_os_error() == Some(libc::EINTR) {
continue;
}
panic!("waitpid failed for bubblewrap child: {err}");
}
}
fn exit_with_wait_status(status: libc::c_int) -> ! {
if libc::WIFEXITED(status) {
std::process::exit(libc::WEXITSTATUS(status));
}
if libc::WIFSIGNALED(status) {
let signal = libc::WTERMSIG(status);
unsafe {
libc::signal(signal, libc::SIG_DFL);
libc::kill(libc::getpid(), signal);
}
std::process::exit(128 + signal);
}
std::process::exit(1);
}
fn exit_with_wait_status_or_policy_violation(
status: libc::c_int,
protected_create_violation: bool,
) -> ! {
if protected_create_violation && libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 {
std::process::exit(1);
}
exit_with_wait_status(status);
}
/// Run a short-lived bubblewrap preflight in a child process and capture stderr.
///
/// Strategy:
/// - This is used only by `preflight_proc_mount_support`, which runs `/bin/true`
/// under bubblewrap with `--proc /proc`.
/// - The goal is to detect environments where mounting `/proc` fails (for
/// example, restricted containers), so we can retry the real run with
/// `--no-proc`.
/// - We capture stderr from that preflight to match known mount-failure text.
/// We do not stream it because this is a one-shot probe with a trivial
/// command, and reads are bounded to a fixed max size.
pub(crate) fn run_bwrap_in_child_capture_stderr(bwrap_args: BwrapArgs) -> String {
const MAX_PREFLIGHT_STDERR_BYTES: u64 = 64 * 1024;
let BwrapArgs {
args,
preserved_files,
file_system_permissions_enforcement,
} = bwrap_args;
let synthetic_mount_targets = file_system_permissions_enforcement.synthetic_mount_targets;
let protected_create_targets = file_system_permissions_enforcement.protected_create_targets;
let setup_signal_mask = ForwardedSignalMask::block();
let synthetic_mount_registrations = register_synthetic_mount_targets(&synthetic_mount_targets);
let protected_create_registrations =
register_protected_create_targets(&protected_create_targets);
let mut pipe_fds = [0; 2];
let pipe_res = unsafe { libc::pipe2(pipe_fds.as_mut_ptr(), libc::O_CLOEXEC) };
if pipe_res < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to create stderr pipe for bubblewrap: {err}");
}
let read_fd = pipe_fds[0];
let write_fd = pipe_fds[1];
let pid = unsafe { libc::fork() };
if pid < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to fork for bubblewrap: {err}");
}
if pid == 0 {
reset_forwarded_signal_handlers_to_default();
setup_signal_mask.restore();
// Child: redirect stderr to the pipe, then run bubblewrap.
unsafe {
close_fd_or_panic(read_fd, "close read end in bubblewrap child");
if libc::dup2(write_fd, libc::STDERR_FILENO) < 0 {
let err = std::io::Error::last_os_error();
panic!("failed to redirect stderr for bubblewrap: {err}");
}
close_fd_or_panic(write_fd, "close write end in bubblewrap child");
}
exec_bwrap(args, preserved_files);
}
let signal_forwarders = install_bwrap_signal_forwarders(pid);
setup_signal_mask.restore();
// Parent: close the write end and read stderr while the child runs.
close_fd_or_panic(write_fd, "close write end in bubblewrap parent");
// SAFETY: `read_fd` is a valid owned fd in the parent.
let mut read_file = unsafe { File::from_raw_fd(read_fd) };
let mut stderr_bytes = Vec::new();
let mut limited_reader = (&mut read_file).take(MAX_PREFLIGHT_STDERR_BYTES);
if let Err(err) = limited_reader.read_to_end(&mut stderr_bytes) {
panic!("failed to read bubblewrap stderr: {err}");
}
let status = wait_for_bwrap_child(pid);
let cleanup_signal_mask = ForwardedSignalMask::block();
BWRAP_CHILD_PID.store(0, Ordering::SeqCst);
cleanup_synthetic_mount_targets(&synthetic_mount_registrations);
cleanup_protected_create_targets(
&protected_create_registrations,
report_file_system_protected_metadata_violation,
);
signal_forwarders.restore();
cleanup_signal_mask.restore();
if libc::WIFSIGNALED(status) {
exit_with_wait_status(status);
}
String::from_utf8_lossy(&stderr_bytes).into_owned()
}
fn report_file_system_protected_metadata_violation(path: &Path) {
eprintln!(
"sandbox blocked creation of protected workspace metadata path {}",
path.display()
);
}
/// Close an owned file descriptor and panic with context on failure.
///
/// We use explicit close() checks here (instead of ignoring return codes)
/// because this code runs in low-level sandbox setup paths where fd leaks or
/// close errors can mask the root cause of later failures.
fn close_fd_or_panic(fd: libc::c_int, context: &str) {
let close_res = unsafe { libc::close(fd) };
if close_res < 0 {
let err = std::io::Error::last_os_error();
panic!("{context}: {err}");
}
}
#[cfg(test)]
#[path = "file_system_protected_metadata_runtime_tests.rs"]
mod tests;