From 4e8199b572f2035b7749cba276ece3a26630d23e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:18:21 +0200 Subject: Adding upstream version 1.67.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/rustix/src/thread/clock.rs | 14 +- vendor/rustix/src/thread/futex.rs | 6 +- vendor/rustix/src/thread/id.rs | 4 +- vendor/rustix/src/thread/mod.rs | 9 + vendor/rustix/src/thread/prctl.rs | 989 ++++++++++++++++++++++++++++++++++++++ vendor/rustix/src/thread/setns.rs | 89 ++++ 6 files changed, 1100 insertions(+), 11 deletions(-) create mode 100644 vendor/rustix/src/thread/prctl.rs create mode 100644 vendor/rustix/src/thread/setns.rs (limited to 'vendor/rustix/src/thread') diff --git a/vendor/rustix/src/thread/clock.rs b/vendor/rustix/src/thread/clock.rs index 206703088..57672fa17 100644 --- a/vendor/rustix/src/thread/clock.rs +++ b/vendor/rustix/src/thread/clock.rs @@ -1,6 +1,6 @@ -use crate::{imp, io}; +use crate::{backend, io}; -pub use imp::time::types::Timespec; +pub use backend::time::types::Timespec; #[cfg(not(any( target_os = "dragonfly", @@ -12,7 +12,7 @@ pub use imp::time::types::Timespec; target_os = "redox", target_os = "wasi", )))] -pub use imp::time::types::ClockId; +pub use backend::time::types::ClockId; /// `clock_nanosleep(id, 0, request, remain)`—Sleeps for a duration on a /// given clock. @@ -30,6 +30,7 @@ pub use imp::time::types::ClockId; target_os = "dragonfly", target_os = "emscripten", target_os = "freebsd", // FreeBSD 12 has clock_nanosleep, but libc targets FreeBSD 11. + target_os = "haiku", target_os = "ios", target_os = "macos", target_os = "openbsd", @@ -38,7 +39,7 @@ pub use imp::time::types::ClockId; )))] #[inline] pub fn clock_nanosleep_relative(id: ClockId, request: &Timespec) -> NanosleepRelativeResult { - imp::thread::syscalls::clock_nanosleep_relative(id, request) + backend::thread::syscalls::clock_nanosleep_relative(id, request) } /// `clock_nanosleep(id, TIMER_ABSTIME, request, NULL)`—Sleeps until an @@ -57,6 +58,7 @@ pub fn clock_nanosleep_relative(id: ClockId, request: &Timespec) -> NanosleepRel target_os = "dragonfly", target_os = "emscripten", target_os = "freebsd", // FreeBSD 12 has clock_nanosleep, but libc targets FreeBSD 11. + target_os = "haiku", target_os = "ios", target_os = "macos", target_os = "openbsd", @@ -65,7 +67,7 @@ pub fn clock_nanosleep_relative(id: ClockId, request: &Timespec) -> NanosleepRel )))] #[inline] pub fn clock_nanosleep_absolute(id: ClockId, request: &Timespec) -> io::Result<()> { - imp::thread::syscalls::clock_nanosleep_absolute(id, request) + backend::thread::syscalls::clock_nanosleep_absolute(id, request) } /// `nanosleep(request, remain)`—Sleeps for a duration. @@ -80,7 +82,7 @@ pub fn clock_nanosleep_absolute(id: ClockId, request: &Timespec) -> io::Result<( /// [Linux]: https://man7.org/linux/man-pages/man2/nanosleep.2.html #[inline] pub fn nanosleep(request: &Timespec) -> NanosleepRelativeResult { - imp::thread::syscalls::nanosleep(request) + backend::thread::syscalls::nanosleep(request) } /// A return type for `nanosleep` and `clock_nanosleep_relative`. diff --git a/vendor/rustix/src/thread/futex.rs b/vendor/rustix/src/thread/futex.rs index df5b561f1..7c4399f7a 100644 --- a/vendor/rustix/src/thread/futex.rs +++ b/vendor/rustix/src/thread/futex.rs @@ -7,9 +7,9 @@ #![allow(unsafe_code)] use crate::thread::Timespec; -use crate::{imp, io}; +use crate::{backend, io}; -pub use imp::thread::{FutexFlags, FutexOperation}; +pub use backend::thread::{FutexFlags, FutexOperation}; /// `futex(uaddr, op, val, utime, uaddr2, val3)` /// @@ -34,5 +34,5 @@ pub unsafe fn futex( uaddr2: *mut u32, val3: u32, ) -> io::Result { - imp::thread::syscalls::futex(uaddr, op, flags, val, utime, uaddr2, val3) + backend::thread::syscalls::futex(uaddr, op, flags, val, utime, uaddr2, val3) } diff --git a/vendor/rustix/src/thread/id.rs b/vendor/rustix/src/thread/id.rs index 964d2654c..0d2fef026 100644 --- a/vendor/rustix/src/thread/id.rs +++ b/vendor/rustix/src/thread/id.rs @@ -1,4 +1,4 @@ -use crate::imp; +use crate::backend; use crate::process::Pid; /// `gettid()`—Returns the thread ID. @@ -13,5 +13,5 @@ use crate::process::Pid; #[inline] #[must_use] pub fn gettid() -> Pid { - imp::thread::syscalls::gettid() + backend::thread::syscalls::gettid() } diff --git a/vendor/rustix/src/thread/mod.rs b/vendor/rustix/src/thread/mod.rs index ac48b435b..b1dc849d9 100644 --- a/vendor/rustix/src/thread/mod.rs +++ b/vendor/rustix/src/thread/mod.rs @@ -6,11 +6,16 @@ mod clock; mod futex; #[cfg(any(target_os = "android", target_os = "linux"))] mod id; +#[cfg(any(target_os = "android", target_os = "linux"))] +mod prctl; +#[cfg(any(target_os = "android", target_os = "linux"))] +mod setns; #[cfg(not(any( target_os = "dragonfly", target_os = "emscripten", target_os = "freebsd", + target_os = "haiku", target_os = "ios", target_os = "macos", target_os = "openbsd", @@ -24,3 +29,7 @@ pub use clock::{nanosleep, NanosleepRelativeResult, Timespec}; pub use futex::{futex, FutexFlags, FutexOperation}; #[cfg(any(target_os = "android", target_os = "linux"))] pub use id::gettid; +#[cfg(any(target_os = "android", target_os = "linux"))] +pub use prctl::*; +#[cfg(any(target_os = "android", target_os = "linux"))] +pub use setns::*; diff --git a/vendor/rustix/src/thread/prctl.rs b/vendor/rustix/src/thread/prctl.rs new file mode 100644 index 000000000..a2191f7c3 --- /dev/null +++ b/vendor/rustix/src/thread/prctl.rs @@ -0,0 +1,989 @@ +#![allow(unsafe_code)] + +use core::convert::TryFrom; +use core::mem::MaybeUninit; +use core::num::NonZeroU64; +use core::ptr; +use core::ptr::NonNull; +use core::sync::atomic::AtomicU8; + +use bitflags::bitflags; + +use crate::backend::c::{c_int, c_uint, c_void}; +use crate::backend::process::syscalls; +use crate::ffi::{CStr, CString}; +use crate::io; +use crate::process::{ + prctl_1arg, prctl_2args, prctl_3args, prctl_get_at_arg2_optional, Pid, + PointerAuthenticationKeys, +}; + +// +// PR_GET_KEEPCAPS/PR_SET_KEEPCAPS +// + +const PR_GET_KEEPCAPS: c_int = 7; + +/// Get the current state of the calling thread's `keep capabilities` flag. +/// +/// # References +/// - [`prctl(PR_GET_KEEPCAPS,...)`] +/// +/// [`prctl(PR_GET_KEEPCAPS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn get_keep_capabilities() -> io::Result { + unsafe { prctl_1arg(PR_GET_KEEPCAPS) }.map(|r| r != 0) +} + +const PR_SET_KEEPCAPS: c_int = 8; + +/// Set the state of the calling thread's `keep capabilities` flag. +/// +/// # References +/// - [`prctl(PR_SET_KEEPCAPS,...)`] +/// +/// [`prctl(PR_SET_KEEPCAPS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn set_keep_capabilities(enable: bool) -> io::Result<()> { + unsafe { prctl_2args(PR_SET_KEEPCAPS, enable as usize as *mut _) }.map(|_r| ()) +} + +// +// PR_GET_NAME/PR_SET_NAME +// + +const PR_GET_NAME: c_int = 16; + +/// Get the name of the calling thread. +/// +/// # References +/// - [`prctl(PR_GET_NAME,...)`] +/// +/// [`prctl(PR_GET_NAME,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn name() -> io::Result { + let mut buffer = [0_u8; 16]; + unsafe { prctl_2args(PR_GET_NAME, buffer.as_mut_ptr().cast())? }; + + let len = buffer.iter().position(|&x| x == 0_u8).unwrap_or(0); + CString::new(&buffer[..len]).map_err(|_r| io::Errno::ILSEQ) +} + +const PR_SET_NAME: c_int = 15; + +/// Set the name of the calling thread. +/// +/// # References +/// - [`prctl(PR_SET_NAME,...)`] +/// +/// [`prctl(PR_SET_NAME,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn set_name(name: &CStr) -> io::Result<()> { + unsafe { prctl_2args(PR_SET_NAME, name.as_ptr() as *mut _) }.map(|_r| ()) +} + +// +// PR_GET_SECCOMP/PR_SET_SECCOMP +// + +//const PR_GET_SECCOMP: c_int = 21; + +const SECCOMP_MODE_DISABLED: i32 = 0; +const SECCOMP_MODE_STRICT: i32 = 1; +const SECCOMP_MODE_FILTER: i32 = 2; + +/// `SECCOMP_MODE_*`. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(i32)] +pub enum SecureComputingMode { + /// Secure computing is not in use. + Disabled = SECCOMP_MODE_DISABLED, + /// Use hard-coded filter. + Strict = SECCOMP_MODE_STRICT, + /// Use user-supplied filter. + Filter = SECCOMP_MODE_FILTER, +} + +impl TryFrom for SecureComputingMode { + type Error = io::Errno; + + fn try_from(value: i32) -> Result { + match value { + SECCOMP_MODE_DISABLED => Ok(Self::Disabled), + SECCOMP_MODE_STRICT => Ok(Self::Strict), + SECCOMP_MODE_FILTER => Ok(Self::Filter), + _ => Err(io::Errno::RANGE), + } + } +} + +/* +/// Get the secure computing mode of the calling thread. +/// +/// If the caller is not in secure computing mode, this returns [`SecureComputingMode::Disabled`]. +/// If the caller is in strict secure computing mode, then this call will cause a `SIGKILL` signal +/// to be sent to the process. +/// If the caller is in filter mode, and this system call is allowed by the seccomp filters, +/// it returns [`SecureComputingMode::Filter`]; otherwise, the process is killed with +/// a `SIGKILL` signal. +/// +/// Since Linux 3.8, the Seccomp field of the `/proc/[pid]/status` file provides a method +/// of obtaining the same information, without the risk that the process is killed; see `proc(5)`. +/// +/// # References +/// - [`prctl(PR_GET_SECCOMP,...)`] +/// +/// [`prctl(PR_GET_SECCOMP,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn secure_computing_mode() -> io::Result { + unsafe { prctl_1arg(PR_GET_SECCOMP) }.and_then(TryInto::try_into) +} +*/ + +const PR_SET_SECCOMP: c_int = 22; + +/// Set the secure computing mode for the calling thread, to limit the available system calls. +/// +/// # References +/// - [`prctl(PR_SET_SECCOMP,...)`] +/// +/// [`prctl(PR_SET_SECCOMP,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn set_secure_computing_mode(mode: SecureComputingMode) -> io::Result<()> { + unsafe { prctl_2args(PR_SET_SECCOMP, mode as usize as *mut _) }.map(|_r| ()) +} + +// +// PR_CAPBSET_READ/PR_CAPBSET_DROP +// + +const PR_CAPBSET_READ: c_int = 23; + +const CAP_CHOWN: u32 = 0; +const CAP_DAC_OVERRIDE: u32 = 1; +const CAP_DAC_READ_SEARCH: u32 = 2; +const CAP_FOWNER: u32 = 3; +const CAP_FSETID: u32 = 4; +const CAP_KILL: u32 = 5; +const CAP_SETGID: u32 = 6; +const CAP_SETUID: u32 = 7; +const CAP_SETPCAP: u32 = 8; +const CAP_LINUX_IMMUTABLE: u32 = 9; +const CAP_NET_BIND_SERVICE: u32 = 10; +const CAP_NET_BROADCAST: u32 = 11; +const CAP_NET_ADMIN: u32 = 12; +const CAP_NET_RAW: u32 = 13; +const CAP_IPC_LOCK: u32 = 14; +const CAP_IPC_OWNER: u32 = 15; +const CAP_SYS_MODULE: u32 = 16; +const CAP_SYS_RAWIO: u32 = 17; +const CAP_SYS_CHROOT: u32 = 18; +const CAP_SYS_PTRACE: u32 = 19; +const CAP_SYS_PACCT: u32 = 20; +const CAP_SYS_ADMIN: u32 = 21; +const CAP_SYS_BOOT: u32 = 22; +const CAP_SYS_NICE: u32 = 23; +const CAP_SYS_RESOURCE: u32 = 24; +const CAP_SYS_TIME: u32 = 25; +const CAP_SYS_TTY_CONFIG: u32 = 26; +const CAP_MKNOD: u32 = 27; +const CAP_LEASE: u32 = 28; +const CAP_AUDIT_WRITE: u32 = 29; +const CAP_AUDIT_CONTROL: u32 = 30; +const CAP_SETFCAP: u32 = 31; +const CAP_MAC_OVERRIDE: u32 = 32; +const CAP_MAC_ADMIN: u32 = 33; +const CAP_SYSLOG: u32 = 34; +const CAP_WAKE_ALARM: u32 = 35; +const CAP_BLOCK_SUSPEND: u32 = 36; +const CAP_AUDIT_READ: u32 = 37; +const CAP_PERFMON: u32 = 38; +const CAP_BPF: u32 = 39; +const CAP_CHECKPOINT_RESTORE: u32 = 40; + +/// Linux per-thread capability. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(u32)] +pub enum Capability { + /// In a system with the `_POSIX_CHOWN_RESTRICTED` option defined, this overrides + /// the restriction of changing file ownership and group ownership. + ChangeOwnership = CAP_CHOWN, + /// Override all DAC access, including ACL execute access if `_POSIX_ACL` is defined. + /// Excluding DAC access covered by [`Capability::LinuxImmutable`]. + DACOverride = CAP_DAC_OVERRIDE, + /// Overrides all DAC restrictions regarding read and search on files and directories, + /// including ACL restrictions if `_POSIX_ACL` is defined. Excluding DAC access covered + /// by [`Capability::LinuxImmutable`]. + DACReadSearch = CAP_DAC_READ_SEARCH, + /// Overrides all restrictions about allowed operations on files, where file owner ID must be + /// equal to the user ID, except where [`Capability::FileSetID`] is applicable. + /// It doesn't override MAC and DAC restrictions. + FileOwner = CAP_FOWNER, + /// Overrides the following restrictions that the effective user ID shall match the file owner + /// ID when setting the `S_ISUID` and `S_ISGID` bits on that file; that the effective group ID + /// (or one of the supplementary group IDs) shall match the file owner ID when setting the + /// `S_ISGID` bit on that file; that the `S_ISUID` and `S_ISGID` bits are cleared on successful + /// return from `chown` (not implemented). + FileSetID = CAP_FSETID, + /// Overrides the restriction that the real or effective user ID of a process sending a signal + /// must match the real or effective user ID of the process receiving the signal. + Kill = CAP_KILL, + /// Allows `setgid` manipulation. Allows `setgroups`. Allows forged gids on socket + /// credentials passing. + SetGroupID = CAP_SETGID, + /// Allows `set*uid` manipulation (including fsuid). Allows forged pids on socket + /// credentials passing. + SetUserID = CAP_SETUID, + /// Without VFS support for capabilities: + /// - Transfer any capability in your permitted set to any pid. + /// - remove any capability in your permitted set from any pid. + /// With VFS support for capabilities (neither of above, but) + /// - Add any capability from current's capability bounding set to the current process' + /// inheritable set. + /// - Allow taking bits out of capability bounding set. + /// - Allow modification of the securebits for a process. + SetPermittedCapabilities = CAP_SETPCAP, + /// Allow modification of `S_IMMUTABLE` and `S_APPEND` file attributes. + LinuxImmutable = CAP_LINUX_IMMUTABLE, + /// Allows binding to TCP/UDP sockets below 1024. Allows binding to ATM VCIs below 32. + NetBindService = CAP_NET_BIND_SERVICE, + /// Allow broadcasting, listen to multicast. + NetBroadcast = CAP_NET_BROADCAST, + /// Allow interface configuration. Allow administration of IP firewall, masquerading and + /// accounting. Allow setting debug option on sockets. Allow modification of routing tables. + /// Allow setting arbitrary process / process group ownership on sockets. Allow binding to any + /// address for transparent proxying (also via [`Capability::NetRaw`]). Allow setting TOS + /// (type of service). Allow setting promiscuous mode. Allow clearing driver statistics. + /// Allow multicasting. Allow read/write of device-specific registers. Allow activation of ATM + /// control sockets. + NetAdmin = CAP_NET_ADMIN, + /// Allow use of `RAW` sockets. Allow use of `PACKET` sockets. Allow binding to any address for + /// transparent proxying (also via [`Capability::NetAdmin`]). + NetRaw = CAP_NET_RAW, + /// Allow locking of shared memory segments. Allow mlock and mlockall (which doesn't really have + /// anything to do with IPC). + IPCLock = CAP_IPC_LOCK, + /// Override IPC ownership checks. + IPCOwner = CAP_IPC_OWNER, + /// Insert and remove kernel modules - modify kernel without limit. + SystemModule = CAP_SYS_MODULE, + /// Allow ioperm/iopl access. Allow sending USB messages to any device via `/dev/bus/usb`. + SystemRawIO = CAP_SYS_RAWIO, + /// Allow use of `chroot`. + SystemChangeRoot = CAP_SYS_CHROOT, + /// Allow `ptrace` of any process. + SystemProcessTrace = CAP_SYS_PTRACE, + /// Allow configuration of process accounting. + SystemProcessAccounting = CAP_SYS_PACCT, + /// Allow configuration of the secure attention key. Allow administration of the random device. + /// Allow examination and configuration of disk quotas. Allow setting the domainname. + /// Allow setting the hostname. Allow `mount` and `umount`, setting up new smb connection. + /// Allow some autofs root ioctls. Allow nfsservctl. Allow `VM86_REQUEST_IRQ`. + /// Allow to read/write pci config on alpha. Allow `irix_prctl` on mips (setstacksize). + /// Allow flushing all cache on m68k (`sys_cacheflush`). Allow removing semaphores. + /// Used instead of [`Capability::ChangeOwnership`] to "chown" IPC message queues, semaphores + /// and shared memory. Allow locking/unlocking of shared memory segment. Allow turning swap + /// on/off. Allow forged pids on socket credentials passing. Allow setting readahead and + /// flushing buffers on block devices. Allow setting geometry in floppy driver. Allow turning + /// DMA on/off in `xd` driver. Allow administration of md devices (mostly the above, but some + /// extra ioctls). Allow tuning the ide driver. Allow access to the nvram device. Allow + /// administration of `apm_bios`, serial and bttv (TV) device. Allow manufacturer commands in + /// isdn CAPI support driver. Allow reading non-standardized portions of pci configuration + /// space. Allow DDI debug ioctl on sbpcd driver. Allow setting up serial ports. Allow sending + /// raw qic-117 commands. Allow enabling/disabling tagged queuing on SCSI controllers and + /// sending arbitrary SCSI commands. Allow setting encryption key on loopback filesystem. + /// Allow setting zone reclaim policy. Allow everything under + /// [`Capability::BerkeleyPacketFilters`] and [`Capability::PerformanceMonitoring`] for backward + /// compatibility. + SystemAdmin = CAP_SYS_ADMIN, + /// Allow use of `reboot`. + SystemBoot = CAP_SYS_BOOT, + /// Allow raising priority and setting priority on other (different UID) processes. Allow use of + /// FIFO and round-robin (realtime) scheduling on own processes and setting the scheduling + /// algorithm used by another process. Allow setting cpu affinity on other processes. + /// Allow setting realtime ioprio class. Allow setting ioprio class on other processes. + SystemNice = CAP_SYS_NICE, + /// Override resource limits. Set resource limits. Override quota limits. Override reserved + /// space on ext2 filesystem. Modify data journaling mode on ext3 filesystem (uses journaling + /// resources). NOTE: ext2 honors fsuid when checking for resource overrides, so you can + /// override using fsuid too. Override size restrictions on IPC message queues. Allow more than + /// 64hz interrupts from the real-time clock. Override max number of consoles on console + /// allocation. Override max number of keymaps. Control memory reclaim behavior. + SystemResource = CAP_SYS_RESOURCE, + /// Allow manipulation of system clock. Allow `irix_stime` on mips. Allow setting the real-time + /// clock. + SystemTime = CAP_SYS_TIME, + /// Allow configuration of tty devices. Allow `vhangup` of tty. + SystemTTYConfig = CAP_SYS_TTY_CONFIG, + /// Allow the privileged aspects of `mknod`. + MakeNode = CAP_MKNOD, + /// Allow taking of leases on files. + Lease = CAP_LEASE, + /// Allow writing the audit log via unicast netlink socket. + AuditWrite = CAP_AUDIT_WRITE, + /// Allow configuration of audit via unicast netlink socket. + AuditControl = CAP_AUDIT_CONTROL, + /// Set or remove capabilities on files. Map `uid=0` into a child user namespace. + SetFileCapabilities = CAP_SETFCAP, + /// Override MAC access. The base kernel enforces no MAC policy. An LSM may enforce a MAC + /// policy, and if it does and it chooses to implement capability based overrides of that + /// policy, this is the capability it should use to do so. + MACOverride = CAP_MAC_OVERRIDE, + /// Allow MAC configuration or state changes. The base kernel requires no MAC configuration. + /// An LSM may enforce a MAC policy, and if it does and it chooses to implement capability based + /// checks on modifications to that policy or the data required to maintain it, this is the + /// capability it should use to do so. + MACAdmin = CAP_MAC_ADMIN, + /// Allow configuring the kernel's `syslog` (`printk` behaviour). + SystemLog = CAP_SYSLOG, + /// Allow triggering something that will wake the system. + WakeAlarm = CAP_WAKE_ALARM, + /// Allow preventing system suspends. + BlockSuspend = CAP_BLOCK_SUSPEND, + /// Allow reading the audit log via multicast netlink socket. + AuditRead = CAP_AUDIT_READ, + /// Allow system performance and observability privileged operations using `perf_events`, + /// `i915_perf` and other kernel subsystems. + PerformanceMonitoring = CAP_PERFMON, + /// This capability allows the following BPF operations: + /// - Creating all types of BPF maps + /// - Advanced verifier features + /// - Indirect variable access + /// - Bounded loops + /// - BPF to BPF function calls + /// - Scalar precision tracking + /// - Larger complexity limits + /// - Dead code elimination + /// - And potentially other features + /// - Loading BPF Type Format (BTF) data + /// - Retrieve `xlated` and JITed code of BPF programs + /// - Use `bpf_spin_lock` helper + /// + /// [`Capability::PerformanceMonitoring`] relaxes the verifier checks further: + /// - BPF progs can use of pointer-to-integer conversions + /// - speculation attack hardening measures are bypassed + /// - `bpf_probe_read` to read arbitrary kernel memory is allowed + /// - `bpf_trace_printk` to print kernel memory is allowed + /// + /// [`Capability::SystemAdmin`] is required to use bpf_probe_write_user. + /// + /// [`Capability::SystemAdmin`] is required to iterate system wide loaded + /// programs, maps, links, BTFs and convert their IDs to file descriptors. + /// + /// [`Capability::PerformanceMonitoring`] and [`Capability::BerkeleyPacketFilters`] are required + /// to load tracing programs. + /// [`Capability::NetAdmin`] and [`Capability::BerkeleyPacketFilters`] are required to load + /// networking programs. + BerkeleyPacketFilters = CAP_BPF, + /// Allow checkpoint/restore related operations. Allow PID selection during `clone3`. + /// Allow writing to `ns_last_pid`. + CheckpointRestore = CAP_CHECKPOINT_RESTORE, +} + +/// Check if the specified capability is in the calling thread's capability bounding set. +/// +/// # References +/// - [`prctl(PR_CAPBSET_READ,...)`] +/// +/// [`prctl(PR_CAPBSET_READ,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn is_in_capability_bounding_set(capability: Capability) -> io::Result { + unsafe { prctl_2args(PR_CAPBSET_READ, capability as usize as *mut _) }.map(|r| r != 0) +} + +const PR_CAPBSET_DROP: c_int = 24; + +/// If the calling thread has the [`Capability::SetPermittedCapabilities`] capability within its +/// user namespace, then drop the specified capability from the thread's capability bounding set. +/// +/// # References +/// - [`prctl(PR_CAPBSET_DROP,...)`] +/// +/// [`prctl(PR_CAPBSET_DROP,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn remove_capability_from_capability_bounding_set(capability: Capability) -> io::Result<()> { + unsafe { prctl_2args(PR_CAPBSET_DROP, capability as usize as *mut _) }.map(|_r| ()) +} + +// +// PR_GET_SECUREBITS/PR_SET_SECUREBITS +// + +const PR_GET_SECUREBITS: c_int = 27; + +bitflags! { + /// `SECBIT_*`. + pub struct CapabilitiesSecureBits: u32 { + /// If this bit is set, then the kernel does not grant capabilities when + /// a `set-user-ID-root` program is executed, or when a process with an effective or real + /// UID of 0 calls `execve`. + const NO_ROOT = 1_u32 << 0; + /// Set [`NO_ROOT`] irreversibly. + const NO_ROOT_LOCKED = 1_u32 << 1; + /// Setting this flag stops the kernel from adjusting the process's permitted, effective, + /// and ambient capability sets when the thread's effective and filesystem UIDs are switched + /// between zero and nonzero values. + const NO_SETUID_FIXUP = 1_u32 << 2; + /// Set [`NO_SETUID_FIXUP`] irreversibly. + const NO_SETUID_FIXUP_LOCKED = 1_u32 << 3; + /// Setting this flag allows a thread that has one or more 0 UIDs to retain capabilities in + /// its permitted set when it switches all of its UIDs to nonzero values. + const KEEP_CAPS = 1_u32 << 4; + /// Set [`KEEP_CAPS`] irreversibly. + const KEEP_CAPS_LOCKED = 1_u32 << 5; + /// Setting this flag disallows raising ambient capabilities via the `prctl`'s + /// `PR_CAP_AMBIENT_RAISE` operation. + const NO_CAP_AMBIENT_RAISE = 1_u32 << 6; + /// Set [`NO_CAP_AMBIENT_RAISE`] irreversibly. + const NO_CAP_AMBIENT_RAISE_LOCKED = 1_u32 << 7; + } +} + +/// Get the `securebits` flags of the calling thread. +/// +/// # References +/// - [`prctl(PR_GET_SECUREBITS,...)`] +/// +/// [`prctl(PR_GET_SECUREBITS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn capabilities_secure_bits() -> io::Result { + let r = unsafe { prctl_1arg(PR_GET_SECUREBITS)? } as c_uint; + CapabilitiesSecureBits::from_bits(r).ok_or(io::Errno::RANGE) +} + +const PR_SET_SECUREBITS: c_int = 28; + +/// Set the `securebits` flags of the calling thread. +/// +/// # References +/// - [`prctl(PR_SET_SECUREBITS,...)`] +/// +/// [`prctl(PR_SET_SECUREBITS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn set_capabilities_secure_bits(bits: CapabilitiesSecureBits) -> io::Result<()> { + unsafe { prctl_2args(PR_SET_SECUREBITS, bits.bits() as usize as *mut _) }.map(|_r| ()) +} + +// +// PR_GET_TIMERSLACK/PR_SET_TIMERSLACK +// + +const PR_GET_TIMERSLACK: c_int = 30; + +/// Get the `current` timer slack value of the calling thread. +/// +/// # References +/// - [`prctl(PR_GET_TIMERSLACK,...)`] +/// +/// [`prctl(PR_GET_TIMERSLACK,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn current_timer_slack() -> io::Result { + unsafe { prctl_1arg(PR_GET_TIMERSLACK) }.map(|r| r as u64) +} + +const PR_SET_TIMERSLACK: c_int = 29; + +/// Sets the `current` timer slack value for the calling thread. +/// +/// # References +/// - [`prctl(PR_SET_TIMERSLACK,...)`] +/// +/// [`prctl(PR_SET_TIMERSLACK,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn set_current_timer_slack(value: Option) -> io::Result<()> { + let value = usize::try_from(value.map_or(0, NonZeroU64::get)).map_err(|_r| io::Errno::RANGE)?; + unsafe { prctl_2args(PR_SET_TIMERSLACK, value as *mut _) }.map(|_r| ()) +} + +// +// PR_GET_NO_NEW_PRIVS/PR_SET_NO_NEW_PRIVS +// + +const PR_GET_NO_NEW_PRIVS: c_int = 39; + +/// Get the value of the `no_new_privs` attribute for the calling thread. +/// +/// # References +/// - [`prctl(PR_GET_NO_NEW_PRIVS,...)`] +/// +/// [`prctl(PR_GET_NO_NEW_PRIVS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn no_new_privs() -> io::Result { + unsafe { prctl_1arg(PR_GET_NO_NEW_PRIVS) }.map(|r| r != 0) +} + +const PR_SET_NO_NEW_PRIVS: c_int = 38; + +/// Set the calling thread's `no_new_privs` attribute. +/// +/// # References +/// - [`prctl(PR_SET_NO_NEW_PRIVS,...)`] +/// +/// [`prctl(PR_SET_NO_NEW_PRIVS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn set_no_new_privs(no_new_privs: bool) -> io::Result<()> { + unsafe { prctl_2args(PR_SET_NO_NEW_PRIVS, no_new_privs as usize as *mut _) }.map(|_r| ()) +} + +// +// PR_GET_TID_ADDRESS +// + +const PR_GET_TID_ADDRESS: c_int = 40; + +/// Get the `clear_child_tid` address set by `set_tid_address` +/// and `clone`'s `CLONE_CHILD_CLEARTID` flag. +/// +/// # References +/// - [`prctl(PR_GET_TID_ADDRESS,...)`] +/// +/// [`prctl(PR_GET_TID_ADDRESS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn get_clear_child_tid_address() -> io::Result>> { + unsafe { prctl_get_at_arg2_optional::<*mut c_void>(PR_GET_TID_ADDRESS) }.map(NonNull::new) +} + +// +// PR_GET_THP_DISABLE/PR_SET_THP_DISABLE +// + +const PR_GET_THP_DISABLE: c_int = 42; + +/// Get the current setting of the `THP disable` flag for the calling thread. +/// +/// # References +/// - [`prctl(PR_GET_THP_DISABLE,...)`] +/// +/// [`prctl(PR_GET_THP_DISABLE,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn transparent_huge_pages_are_disabled() -> io::Result { + unsafe { prctl_1arg(PR_GET_THP_DISABLE) }.map(|r| r != 0) +} + +const PR_SET_THP_DISABLE: c_int = 41; + +/// Set the state of the `THP disable` flag for the calling thread. +/// +/// # References +/// - [`prctl(PR_SET_THP_DISABLE,...)`] +/// +/// [`prctl(PR_SET_THP_DISABLE,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn disable_transparent_huge_pages(thp_disable: bool) -> io::Result<()> { + unsafe { prctl_2args(PR_SET_THP_DISABLE, thp_disable as usize as *mut _) }.map(|_r| ()) +} + +// +// PR_CAP_AMBIENT +// + +const PR_CAP_AMBIENT: c_int = 47; + +const PR_CAP_AMBIENT_IS_SET: usize = 1; + +/// Check if the specified capability is in the ambient set. +/// +/// # References +/// - [`prctl(PR_CAP_AMBIENT,PR_CAP_AMBIENT_IS_SET,...)`] +/// +/// [`prctl(PR_CAP_AMBIENT,PR_CAP_AMBIENT_IS_SET,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn capability_is_in_ambient_capability_set(capability: Capability) -> io::Result { + let cap = capability as usize as *mut _; + unsafe { prctl_3args(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET as *mut _, cap) }.map(|r| r != 0) +} + +const PR_CAP_AMBIENT_CLEAR_ALL: usize = 4; + +/// Remove all capabilities from the ambient set. +/// +/// # References +/// - [`prctl(PR_CAP_AMBIENT,PR_CAP_AMBIENT_CLEAR_ALL,...)`] +/// +/// [`prctl(PR_CAP_AMBIENT,PR_CAP_AMBIENT_CLEAR_ALL,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn clear_ambient_capability_set() -> io::Result<()> { + unsafe { prctl_2args(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL as *mut _) }.map(|_r| ()) +} + +const PR_CAP_AMBIENT_RAISE: usize = 2; +const PR_CAP_AMBIENT_LOWER: usize = 3; + +/// Add or remove the specified capability to the ambient set. +/// +/// # References +/// - [`prctl(PR_CAP_AMBIENT,...)`] +/// +/// [`prctl(PR_CAP_AMBIENT,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn configure_capability_in_ambient_capability_set( + capability: Capability, + enable: bool, +) -> io::Result<()> { + let sub_operation = if enable { + PR_CAP_AMBIENT_RAISE + } else { + PR_CAP_AMBIENT_LOWER + }; + let cap = capability as usize as *mut _; + + unsafe { prctl_3args(PR_CAP_AMBIENT, sub_operation as *mut _, cap) }.map(|_r| ()) +} + +// +// PR_SVE_GET_VL/PR_SVE_SET_VL +// + +const PR_SVE_GET_VL: c_int = 51; + +const PR_SVE_VL_LEN_MASK: u32 = 0xffff; +const PR_SVE_VL_INHERIT: u32 = 1_u32 << 17; + +/// Scalable Vector Extension vector length configuration. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct SVEVectorLengthConfig { + /// Vector length in bytes. + pub vector_length_in_bytes: u32, + /// Vector length inherited across `execve`. + pub vector_length_inherited_across_execve: bool, +} + +/// Get the thread's current SVE vector length configuration. +/// +/// # References +/// - [`prctl(PR_SVE_GET_VL,...)`] +/// +/// [`prctl(PR_SVE_GET_VL,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn sve_vector_length_configuration() -> io::Result { + let bits = unsafe { prctl_1arg(PR_SVE_GET_VL)? } as c_uint; + Ok(SVEVectorLengthConfig { + vector_length_in_bytes: bits & PR_SVE_VL_LEN_MASK, + vector_length_inherited_across_execve: (bits & PR_SVE_VL_INHERIT) != 0, + }) +} + +const PR_SVE_SET_VL: c_int = 50; + +const PR_SVE_SET_VL_ONEXEC: u32 = 1_u32 << 18; + +/// Configure the thread's vector length of Scalable Vector Extension. +/// +/// # References +/// - [`prctl(PR_SVE_SET_VL,...)`] +/// +/// # Safety +/// +/// Please ensure the conditions necessary to safely call this function, +/// as detailed in the references above. +/// +/// [`prctl(PR_SVE_SET_VL,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub unsafe fn set_sve_vector_length_configuration( + vector_length_in_bytes: usize, + vector_length_inherited_across_execve: bool, + defer_change_to_next_execve: bool, +) -> io::Result<()> { + let vector_length_in_bytes = + u32::try_from(vector_length_in_bytes).map_err(|_r| io::Errno::RANGE)?; + + let mut bits = vector_length_in_bytes & PR_SVE_VL_LEN_MASK; + + if vector_length_inherited_across_execve { + bits |= PR_SVE_VL_INHERIT; + } + + if defer_change_to_next_execve { + bits |= PR_SVE_SET_VL_ONEXEC; + } + + prctl_2args(PR_SVE_SET_VL, bits as usize as *mut _).map(|_r| ()) +} + +// +// PR_PAC_RESET_KEYS +// + +const PR_PAC_RESET_KEYS: c_int = 54; + +/// Securely reset the thread's pointer authentication keys to fresh random values generated +/// by the kernel. +/// +/// # References +/// - [`prctl(PR_PAC_RESET_KEYS,...)`] +/// +/// # Safety +/// +/// Please ensure the conditions necessary to safely call this function, +/// as detailed in the references above. +/// +/// [`prctl(PR_PAC_RESET_KEYS,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub unsafe fn reset_pointer_authentication_keys( + keys: Option, +) -> io::Result<()> { + let keys = keys.as_ref().map_or(0_u32, PointerAuthenticationKeys::bits); + prctl_2args(PR_PAC_RESET_KEYS, keys as usize as *mut _).map(|_r| ()) +} + +// +// PR_GET_TAGGED_ADDR_CTRL/PR_SET_TAGGED_ADDR_CTRL +// + +const PR_GET_TAGGED_ADDR_CTRL: c_int = 56; + +const PR_MTE_TAG_SHIFT: u32 = 3; +const PR_MTE_TAG_MASK: u32 = 0xffff_u32 << PR_MTE_TAG_SHIFT; + +bitflags! { + /// Zero means addresses that are passed for the purpose of being dereferenced by the kernel must be untagged. + pub struct TaggedAddressMode: u32 { + /// Addresses that are passed for the purpose of being dereferenced by the kernel may be tagged. + const ENABLED = 1_u32 << 0; + /// Synchronous tag check fault mode. + const TCF_SYNC = 1_u32 << 1; + /// Asynchronous tag check fault mode. + const TCF_ASYNC = 1_u32 << 2; + } +} + +/// Get the current tagged address mode for the calling thread. +/// +/// # References +/// - [`prctl(PR_GET_TAGGED_ADDR_CTRL,...)`] +/// +/// [`prctl(PR_GET_TAGGED_ADDR_CTRL,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub fn current_tagged_address_mode() -> io::Result<(Option, u32)> { + let r = unsafe { prctl_1arg(PR_GET_TAGGED_ADDR_CTRL)? } as c_uint; + let mode = r & 0b111_u32; + let mte_tag = (r & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT; + Ok((TaggedAddressMode::from_bits(mode), mte_tag)) +} + +const PR_SET_TAGGED_ADDR_CTRL: c_int = 55; + +/// Controls support for passing tagged user-space addresses to the kernel. +/// +/// # References +/// - [`prctl(PR_SET_TAGGED_ADDR_CTRL,...)`] +/// +/// # Safety +/// +/// Please ensure the conditions necessary to safely call this function, +/// as detailed in the references above. +/// +/// [`prctl(PR_SET_TAGGED_ADDR_CTRL,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub unsafe fn set_current_tagged_address_mode( + mode: Option, + mte_tag: u32, +) -> io::Result<()> { + let config = mode.as_ref().map_or(0_u32, TaggedAddressMode::bits) + | ((mte_tag << PR_MTE_TAG_SHIFT) & PR_MTE_TAG_MASK); + prctl_2args(PR_SET_TAGGED_ADDR_CTRL, config as usize as *mut _).map(|_r| ()) +} + +// +// PR_SET_SYSCALL_USER_DISPATCH +// + +const PR_SET_SYSCALL_USER_DISPATCH: c_int = 59; + +const PR_SYS_DISPATCH_OFF: usize = 0; + +/// Disable Syscall User Dispatch mechanism. +/// +/// # References +/// - [`prctl(PR_SET_SYSCALL_USER_DISPATCH,PR_SYS_DISPATCH_OFF,...)`] +/// +/// # Safety +/// +/// Please ensure the conditions necessary to safely call this function, +/// as detailed in the references above. +/// +/// [`prctl(PR_SET_SYSCALL_USER_DISPATCH,PR_SYS_DISPATCH_OFF,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub unsafe fn disable_syscall_user_dispatch() -> io::Result<()> { + prctl_2args(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_OFF as *mut _).map(|_r| ()) +} + +const PR_SYS_DISPATCH_ON: usize = 1; + +/// Allow system calls to be executed. +const SYSCALL_DISPATCH_FILTER_ALLOW: u8 = 0; +/// Block system calls from executing. +const SYSCALL_DISPATCH_FILTER_BLOCK: u8 = 1; + +/// Value of the fast switch flag controlling system calls user dispatch mechanism without the need +/// to issue a syscall. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(u8)] +pub enum SysCallUserDispatchFastSwitch { + /// System calls are allowed to execute. + Allow = SYSCALL_DISPATCH_FILTER_ALLOW, + /// System calls are blocked from executing. + Block = SYSCALL_DISPATCH_FILTER_BLOCK, +} + +impl TryFrom for SysCallUserDispatchFastSwitch { + type Error = io::Errno; + + fn try_from(value: u8) -> Result { + match value { + SYSCALL_DISPATCH_FILTER_ALLOW => Ok(Self::Allow), + SYSCALL_DISPATCH_FILTER_BLOCK => Ok(Self::Block), + _ => Err(io::Errno::RANGE), + } + } +} + +/// Enable Syscall User Dispatch mechanism. +/// +/// # References +/// - [`prctl(PR_SET_SYSCALL_USER_DISPATCH,PR_SYS_DISPATCH_ON,...)`] +/// +/// # Safety +/// +/// Please ensure the conditions necessary to safely call this function, +/// as detailed in the references above. +/// +/// [`prctl(PR_SET_SYSCALL_USER_DISPATCH,PR_SYS_DISPATCH_ON,...)`]: https://man7.org/linux/man-pages/man2/prctl.2.html +#[inline] +pub unsafe fn enable_syscall_user_dispatch( + always_allowed_region: &[u8], + fast_switch_flag: &AtomicU8, +) -> io::Result<()> { + syscalls::prctl( + PR_SET_SYSCALL_USER_DISPATCH, + PR_SYS_DISPATCH_ON as *mut _, + always_allowed_region.as_ptr() as *mut _, + always_allowed_region.len() as *mut _, + fast_switch_flag as *const AtomicU8 as *mut _, + ) + .map(|_r| ()) +} + +// +// PR_SCHED_CORE +// + +const PR_SCHED_CORE: c_int = 62; + +const PR_SCHED_CORE_GET: usize = 0; + +const PR_SCHED_CORE_SCOPE_THREAD: u32 = 0; +const PR_SCHED_CORE_SCOPE_THREAD_GROUP: u32 = 1; +const PR_SCHED_CORE_SCOPE_PROCESS_GROUP: u32 = 2; + +/// `PR_SCHED_CORE_SCOPE_*`. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(u32)] +pub enum CoreSchedulingScope { + /// Operation will be performed for the thread. + Thread = PR_SCHED_CORE_SCOPE_THREAD, + /// Operation will be performed for all tasks in the task group of the process. + ThreadGroup = PR_SCHED_CORE_SCOPE_THREAD_GROUP, + /// Operation will be performed for all processes in the process group. + ProcessGroup = PR_SCHED_CORE_SCOPE_PROCESS_GROUP, +} + +impl TryFrom for CoreSchedulingScope { + type Error = io::Errno; + + fn try_from(value: u32) -> Result { + match value { + PR_SCHED_CORE_SCOPE_THREAD => Ok(Self::Thread), + PR_SCHED_CORE_SCOPE_THREAD_GROUP => Ok(Self::ThreadGroup), + PR_SCHED_CORE_SCOPE_PROCESS_GROUP => Ok(Self::ProcessGroup), + _ => Err(io::Errno::RANGE), + } + } +} + +/// Get core scheduling cookie of a process. +/// +/// # References +/// - [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_GET,...)`] +/// +/// [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_GET,...)`]: https://www.kernel.org/doc/html/v5.18/admin-guide/hw-vuln/core-scheduling.html +#[inline] +pub fn core_scheduling_cookie(pid: Pid, scope: CoreSchedulingScope) -> io::Result { + let mut value: MaybeUninit = MaybeUninit::uninit(); + unsafe { + syscalls::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_GET as *mut _, + pid.as_raw_nonzero().get() as usize as *mut _, + scope as usize as *mut _, + value.as_mut_ptr().cast(), + )?; + Ok(value.assume_init()) + } +} + +const PR_SCHED_CORE_CREATE: usize = 1; + +/// Create unique core scheduling cookie. +/// +/// # References +/// - [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_CREATE,...)`] +/// +/// [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_CREATE,...)`]: https://www.kernel.org/doc/html/v5.18/admin-guide/hw-vuln/core-scheduling.html +#[inline] +pub fn create_core_scheduling_cookie(pid: Pid, scope: CoreSchedulingScope) -> io::Result<()> { + unsafe { + syscalls::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_CREATE as *mut _, + pid.as_raw_nonzero().get() as usize as *mut _, + scope as usize as *mut _, + ptr::null_mut(), + ) + .map(|_r| ()) + } +} + +const PR_SCHED_CORE_SHARE_TO: usize = 2; + +/// Push core scheduling cookie to a process. +/// +/// # References +/// - [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_SHARE_TO,...)`] +/// +/// [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_SHARE_TO,...)`]: https://www.kernel.org/doc/html/v5.18/admin-guide/hw-vuln/core-scheduling.html +#[inline] +pub fn push_core_scheduling_cookie(pid: Pid, scope: CoreSchedulingScope) -> io::Result<()> { + unsafe { + syscalls::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_SHARE_TO as *mut _, + pid.as_raw_nonzero().get() as usize as *mut _, + scope as usize as *mut _, + ptr::null_mut(), + ) + .map(|_r| ()) + } +} + +const PR_SCHED_CORE_SHARE_FROM: usize = 3; + +/// Pull core scheduling cookie from a process. +/// +/// # References +/// - [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_SHARE_FROM,...)`] +/// +/// [`prctl(PR_SCHED_CORE,PR_SCHED_CORE_SHARE_FROM,...)`]: https://www.kernel.org/doc/html/v5.18/admin-guide/hw-vuln/core-scheduling.html +#[inline] +pub fn pull_core_scheduling_cookie(pid: Pid, scope: CoreSchedulingScope) -> io::Result<()> { + unsafe { + syscalls::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_SHARE_FROM as *mut _, + pid.as_raw_nonzero().get() as usize as *mut _, + scope as usize as *mut _, + ptr::null_mut(), + ) + .map(|_r| ()) + } +} diff --git a/vendor/rustix/src/thread/setns.rs b/vendor/rustix/src/thread/setns.rs new file mode 100644 index 000000000..0a5564ae1 --- /dev/null +++ b/vendor/rustix/src/thread/setns.rs @@ -0,0 +1,89 @@ +#![allow(unsafe_code)] + +use bitflags::bitflags; +use linux_raw_sys::general::{ + CLONE_NEWCGROUP, CLONE_NEWIPC, CLONE_NEWNET, CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWTIME, + CLONE_NEWUSER, CLONE_NEWUTS, +}; + +use crate::backend::c::c_int; +use crate::backend::thread::syscalls; +use crate::fd::BorrowedFd; +use crate::io; + +bitflags! { + /// Thread name space type. + pub struct ThreadNameSpaceType: u32 { + /// Time name space. + const TIME = CLONE_NEWTIME; + /// Mount name space. + const MOUNT = CLONE_NEWNS; + /// Control group (CGroup) name space. + const CONTROL_GROUP = CLONE_NEWCGROUP; + /// `Host name` and `NIS domain name` (UTS) name space. + const HOST_NAME_AND_NIS_DOMAIN_NAME = CLONE_NEWUTS; + /// Inter-process communication (IPC) name space. + const INTER_PROCESS_COMMUNICATION = CLONE_NEWIPC; + /// User name space. + const USER = CLONE_NEWUSER; + /// Process ID name space. + const PROCESS_ID = CLONE_NEWPID; + /// Network name space. + const NETWORK = CLONE_NEWNET; + } +} + +/// Type of name space referred to by a link. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(u32)] +pub enum LinkNameSpaceType { + /// Time name space. + Time = CLONE_NEWTIME, + /// Mount name space. + Mount = CLONE_NEWNS, + /// Control group (CGroup) name space. + ControlGroup = CLONE_NEWCGROUP, + /// `Host name` and `NIS domain name` (UTS) name space. + HostNameAndNISDomainName = CLONE_NEWUTS, + /// Inter-process communication (IPC) name space. + InterProcessCommunication = CLONE_NEWIPC, + /// User name space. + User = CLONE_NEWUSER, + /// Process ID name space. + ProcessID = CLONE_NEWPID, + /// Network name space. + Network = CLONE_NEWNET, +} + +/// Reassociate the calling thread with the namespace associated with link referred to by `fd`. +/// +/// `fd` must refer to one of the magic links in a `/proc/[pid]/ns/` directory, or a bind mount +/// to such a link. +/// +/// # References +/// - [`setns`] +/// +/// [`setns`]: https://man7.org/linux/man-pages/man2/setns.2.html +pub fn move_into_link_name_space( + fd: BorrowedFd, + allowed_type: Option, +) -> io::Result<()> { + let allowed_type = allowed_type.map_or(0, |t| t as c_int); + syscalls::setns(fd, allowed_type).map(|_r| ()) +} + +/// Atomically move the calling thread into one or more of the same namespaces as the thread +/// referred to by `fd`. +/// +/// `fd` must refer to a thread ID. See: `pidfd_open` and `clone`. +/// +/// # References +/// - [`setns`] +/// +/// [`setns`]: https://man7.org/linux/man-pages/man2/setns.2.html +pub fn move_into_thread_name_spaces( + fd: BorrowedFd, + allowed_types: ThreadNameSpaceType, +) -> io::Result<()> { + syscalls::setns(fd, allowed_types.bits() as c_int).map(|_r| ()) +} -- cgit v1.2.3