diff options
Diffstat (limited to 'src/shared/seccomp-util.c')
-rw-r--r-- | src/shared/seccomp-util.c | 2353 |
1 files changed, 2353 insertions, 0 deletions
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c new file mode 100644 index 0000000..77cacb3 --- /dev/null +++ b/src/shared/seccomp-util.c @@ -0,0 +1,2353 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <linux/seccomp.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/shm.h> +#include <sys/stat.h> + +/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */ +#include "missing_syscall_def.h" +#include <seccomp.h> + +#include "af-list.h" +#include "alloc-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "macro.h" +#include "namespace-util.h" +#include "nsflags.h" +#include "nulstr-util.h" +#include "process-util.h" +#include "seccomp-util.h" +#include "set.h" +#include "string-util.h" +#include "strv.h" + +/* This array will be modified at runtime as seccomp_restrict_archs is called. */ +uint32_t seccomp_local_archs[] = { + + /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */ + +#if defined(__x86_64__) && defined(__ILP32__) + SCMP_ARCH_X86, + SCMP_ARCH_X86_64, + SCMP_ARCH_X32, /* native */ +#elif defined(__x86_64__) && !defined(__ILP32__) + SCMP_ARCH_X86, + SCMP_ARCH_X32, + SCMP_ARCH_X86_64, /* native */ +#elif defined(__i386__) + SCMP_ARCH_X86, +#elif defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, /* native */ +#elif defined(__arm__) + SCMP_ARCH_ARM, +#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32 + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32 + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64 + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL64N32, + SCMP_ARCH_MIPS64N32, + 
SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPS64, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64 + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL64N32, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPSEL64, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32 + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPSEL64N32, + SCMP_ARCH_MIPS64N32, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32 + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL64N32, /* native */ +#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64) + SCMP_ARCH_PARISC, + SCMP_ARCH_PARISC64, /* native */ +#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC) + SCMP_ARCH_PARISC, +#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64LE, + SCMP_ARCH_PPC64, /* native */ +#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64, + SCMP_ARCH_PPC64LE, /* native */ +#elif defined(__powerpc__) + SCMP_ARCH_PPC, +#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64) + SCMP_ARCH_RISCV64, +#elif defined(__s390x__) + SCMP_ARCH_S390, + SCMP_ARCH_S390X, /* native */ +#elif defined(__s390__) + SCMP_ARCH_S390, +#endif + SECCOMP_LOCAL_ARCH_END + }; + +const char* seccomp_arch_to_string(uint32_t c) { + /* Maintain order used in <seccomp.h>. + * + * Names used here should be the same as those used for ConditionArchitecture=, + * except for "subarchitectures" like x32. 
*/ + + switch (c) { + case SCMP_ARCH_NATIVE: + return "native"; + case SCMP_ARCH_X86: + return "x86"; + case SCMP_ARCH_X86_64: + return "x86-64"; + case SCMP_ARCH_X32: + return "x32"; + case SCMP_ARCH_ARM: + return "arm"; + case SCMP_ARCH_AARCH64: + return "arm64"; + case SCMP_ARCH_MIPS: + return "mips"; + case SCMP_ARCH_MIPS64: + return "mips64"; + case SCMP_ARCH_MIPS64N32: + return "mips64-n32"; + case SCMP_ARCH_MIPSEL: + return "mips-le"; + case SCMP_ARCH_MIPSEL64: + return "mips64-le"; + case SCMP_ARCH_MIPSEL64N32: + return "mips64-le-n32"; +#ifdef SCMP_ARCH_PARISC + case SCMP_ARCH_PARISC: + return "parisc"; +#endif +#ifdef SCMP_ARCH_PARISC64 + case SCMP_ARCH_PARISC64: + return "parisc64"; +#endif + case SCMP_ARCH_PPC: + return "ppc"; + case SCMP_ARCH_PPC64: + return "ppc64"; + case SCMP_ARCH_PPC64LE: + return "ppc64-le"; +#ifdef SCMP_ARCH_RISCV64 + case SCMP_ARCH_RISCV64: + return "riscv64"; +#endif + case SCMP_ARCH_S390: + return "s390"; + case SCMP_ARCH_S390X: + return "s390x"; + default: + return NULL; + } +} + +int seccomp_arch_from_string(const char *n, uint32_t *ret) { + if (!n) + return -EINVAL; + + assert(ret); + + if (streq(n, "native")) + *ret = SCMP_ARCH_NATIVE; + else if (streq(n, "x86")) + *ret = SCMP_ARCH_X86; + else if (streq(n, "x86-64")) + *ret = SCMP_ARCH_X86_64; + else if (streq(n, "x32")) + *ret = SCMP_ARCH_X32; + else if (streq(n, "arm")) + *ret = SCMP_ARCH_ARM; + else if (streq(n, "arm64")) + *ret = SCMP_ARCH_AARCH64; + else if (streq(n, "mips")) + *ret = SCMP_ARCH_MIPS; + else if (streq(n, "mips64")) + *ret = SCMP_ARCH_MIPS64; + else if (streq(n, "mips64-n32")) + *ret = SCMP_ARCH_MIPS64N32; + else if (streq(n, "mips-le")) + *ret = SCMP_ARCH_MIPSEL; + else if (streq(n, "mips64-le")) + *ret = SCMP_ARCH_MIPSEL64; + else if (streq(n, "mips64-le-n32")) + *ret = SCMP_ARCH_MIPSEL64N32; +#ifdef SCMP_ARCH_PARISC + else if (streq(n, "parisc")) + *ret = SCMP_ARCH_PARISC; +#endif +#ifdef SCMP_ARCH_PARISC64 + else if (streq(n, "parisc64")) + *ret = 
SCMP_ARCH_PARISC64; +#endif + else if (streq(n, "ppc")) + *ret = SCMP_ARCH_PPC; + else if (streq(n, "ppc64")) + *ret = SCMP_ARCH_PPC64; + else if (streq(n, "ppc64-le")) + *ret = SCMP_ARCH_PPC64LE; +#ifdef SCMP_ARCH_RISCV64 + else if (streq(n, "riscv64")) + *ret = SCMP_ARCH_RISCV64; +#endif + else if (streq(n, "s390")) + *ret = SCMP_ARCH_S390; + else if (streq(n, "s390x")) + *ret = SCMP_ARCH_S390X; + else + return -EINVAL; + + return 0; +} + +/* Allocates a libseccomp filter context with the given default action that contains exactly one architecture: 'arch', or the native one when 'arch' is SCMP_ARCH_NATIVE or equals seccomp_arch_native(). On success the context is handed to the caller through *ret and 0 is returned. Returns -ENOMEM if seccomp_init() fails, otherwise the negative value of the first failing libseccomp call; *ret is not written on failure. */ int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int r; + + /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting + * any others. Also, turns off the NNP fiddling. */ + + seccomp = seccomp_init(default_action); + if (!seccomp) + return -ENOMEM; + + if (arch != SCMP_ARCH_NATIVE && + arch != seccomp_arch_native()) { + + /* A fresh context starts out with the native architecture; swap it for the requested one. */ r = seccomp_arch_remove(seccomp, seccomp_arch_native()); + if (r < 0) + return r; + + r = seccomp_arch_add(seccomp, arch); + if (r < 0) + return r; + + assert(seccomp_arch_exist(seccomp, arch) >= 0); + assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST); + assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST); + } else { + assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0); + assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0); + } + + /* The action for system calls made under an architecture that is not part of this filter is set to "allow". */ r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + /* This is the "NNP fiddling" switch mentioned above: the NNP attribute is set to 0. */ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + return r; + +#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4) + if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) { + /* Best effort: a failure to enable logging is only reported at debug level and does not fail the call. */ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1); + if (r < 0) + log_debug_errno(r, "Failed to enable seccomp event logging: %m"); + } +#endif + + *ret = TAKE_PTR(seccomp); + return 0; +} + +static bool is_basic_seccomp_available(void) { + return prctl(PR_GET_SECCOMP, 0, 
0, 0, 0) >= 0; +} + +/* Probes for filter-mode support by requesting SECCOMP_MODE_FILTER with a NULL filter program. The call is expected to fail; only a failure with errno == EFAULT is treated as "supported". NOTE(review): presumably EFAULT means the kernel accepted the mode and only rejected the NULL pointer — confirm against prctl(2). */ static bool is_seccomp_filter_available(void) { + return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 && + errno == EFAULT; +} + +/* Returns whether seccomp filtering can be used. The answer is computed on the first call and kept in a function-local static for later calls. If $SYSTEMD_SECCOMP parses as false the result is false without probing; if it is true, unset or unparsable, the two prctl() probes decide. */ bool is_seccomp_available(void) { + static int cached_enabled = -1; + + if (cached_enabled < 0) { + int b; + + b = getenv_bool_secure("SYSTEMD_SECCOMP"); + if (b != 0) { + if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */ + log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring."); + + cached_enabled = + is_basic_seccomp_available() && + is_seccomp_filter_available(); + } else + cached_enabled = false; + } + + return cached_enabled; +} + +const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_DEFAULT] = { + .name = "@default", + .help = "System calls that are always permitted", + .value = + "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */ + "brk\0" + "cacheflush\0" + "clock_getres\0" + "clock_getres_time64\0" + "clock_gettime\0" + "clock_gettime64\0" + "clock_nanosleep\0" + "clock_nanosleep_time64\0" + "execve\0" + "exit\0" + "exit_group\0" + "futex\0" + "futex_time64\0" + "futex_waitv\0" + "get_robust_list\0" + "get_thread_area\0" + "getegid\0" + "getegid32\0" + "geteuid\0" + "geteuid32\0" + "getgid\0" + "getgid32\0" + "getgroups\0" + "getgroups32\0" + "getpgid\0" + "getpgrp\0" + "getpid\0" + "getppid\0" + "getrandom\0" + "getresgid\0" + "getresgid32\0" + "getresuid\0" + "getresuid32\0" + "getrlimit\0" /* make sure processes can query stack size and such */ + "getsid\0" + "gettid\0" + "gettimeofday\0" + "getuid\0" + "getuid32\0" + "membarrier\0" + "mmap\0" + "mmap2\0" + "mprotect\0" + "munmap\0" + "nanosleep\0" + "pause\0" + "prlimit64\0" + "restart_syscall\0" + "riscv_flush_icache\0" + "riscv_hwprobe\0" + "rseq\0" + "rt_sigreturn\0" + "sched_getaffinity\0" + "sched_yield\0" + "set_robust_list\0" + "set_thread_area\0" + "set_tid_address\0" + "set_tls\0" + "sigreturn\0" + "time\0" + "ugetrlimit\0" + }, + 
[SYSCALL_FILTER_SET_AIO] = { + .name = "@aio", + .help = "Asynchronous IO", + .value = + "io_cancel\0" + "io_destroy\0" + "io_getevents\0" + "io_pgetevents\0" + "io_pgetevents_time64\0" + "io_setup\0" + "io_submit\0" + "io_uring_enter\0" + "io_uring_register\0" + "io_uring_setup\0" + }, + [SYSCALL_FILTER_SET_BASIC_IO] = { + .name = "@basic-io", + .help = "Basic IO", + .value = + "_llseek\0" + "close\0" + "close_range\0" + "dup\0" + "dup2\0" + "dup3\0" + "lseek\0" + "pread64\0" + "preadv\0" + "preadv2\0" + "pwrite64\0" + "pwritev\0" + "pwritev2\0" + "read\0" + "readv\0" + "write\0" + "writev\0" + }, + [SYSCALL_FILTER_SET_CHOWN] = { + .name = "@chown", + .help = "Change ownership of files and directories", + .value = + "chown\0" + "chown32\0" + "fchown\0" + "fchown32\0" + "fchownat\0" + "lchown\0" + "lchown32\0" + }, + [SYSCALL_FILTER_SET_CLOCK] = { + .name = "@clock", + .help = "Change the system time", + .value = + "adjtimex\0" + "clock_adjtime\0" + "clock_adjtime64\0" + "clock_settime\0" + "clock_settime64\0" + "settimeofday\0" + }, + [SYSCALL_FILTER_SET_CPU_EMULATION] = { + .name = "@cpu-emulation", + .help = "System calls for CPU emulation functionality", + .value = + "modify_ldt\0" + "subpage_prot\0" + "switch_endian\0" + "vm86\0" + "vm86old\0" + }, + [SYSCALL_FILTER_SET_DEBUG] = { + .name = "@debug", + .help = "Debugging, performance monitoring and tracing functionality", + .value = + "lookup_dcookie\0" + "perf_event_open\0" + "pidfd_getfd\0" + "ptrace\0" + "rtas\0" + "s390_runtime_instr\0" + "sys_debug_setcontext\0" + }, + [SYSCALL_FILTER_SET_FILE_SYSTEM] = { + .name = "@file-system", + .help = "File system operations", + .value = + "access\0" + "chdir\0" + "chmod\0" + "close\0" + "creat\0" + "faccessat\0" + "faccessat2\0" + "fallocate\0" + "fchdir\0" + "fchmod\0" + "fchmodat\0" + "fcntl\0" + "fcntl64\0" + "fgetxattr\0" + "flistxattr\0" + "fremovexattr\0" + "fsetxattr\0" + "fstat\0" + "fstat64\0" + "fstatat64\0" + "fstatfs\0" + "fstatfs64\0" + "ftruncate\0" + 
"ftruncate64\0" + "futimesat\0" + "getcwd\0" + "getdents\0" + "getdents64\0" + "getxattr\0" + "inotify_add_watch\0" + "inotify_init\0" + "inotify_init1\0" + "inotify_rm_watch\0" + "lgetxattr\0" + "link\0" + "linkat\0" + "listxattr\0" + "llistxattr\0" + "lremovexattr\0" + "lsetxattr\0" + "lstat\0" + "lstat64\0" + "mkdir\0" + "mkdirat\0" + "mknod\0" + "mknodat\0" + "newfstatat\0" + "oldfstat\0" + "oldlstat\0" + "oldstat\0" + "open\0" + "openat\0" + "openat2\0" + "readlink\0" + "readlinkat\0" + "removexattr\0" + "rename\0" + "renameat\0" + "renameat2\0" + "rmdir\0" + "setxattr\0" + "stat\0" + "stat64\0" + "statfs\0" + "statfs64\0" + "statx\0" + "symlink\0" + "symlinkat\0" + "truncate\0" + "truncate64\0" + "unlink\0" + "unlinkat\0" + "utime\0" + "utimensat\0" + "utimensat_time64\0" + "utimes\0" + }, + [SYSCALL_FILTER_SET_IO_EVENT] = { + .name = "@io-event", + .help = "Event loop system calls", + .value = + "_newselect\0" + "epoll_create\0" + "epoll_create1\0" + "epoll_ctl\0" + "epoll_ctl_old\0" + "epoll_pwait\0" + "epoll_pwait2\0" + "epoll_wait\0" + "epoll_wait_old\0" + "eventfd\0" + "eventfd2\0" + "poll\0" + "ppoll\0" + "ppoll_time64\0" + "pselect6\0" + "pselect6_time64\0" + "select\0" + }, + [SYSCALL_FILTER_SET_IPC] = { + .name = "@ipc", + .help = "SysV IPC, POSIX Message Queues or other IPC", + .value = + "ipc\0" + "memfd_create\0" + "mq_getsetattr\0" + "mq_notify\0" + "mq_open\0" + "mq_timedreceive\0" + "mq_timedreceive_time64\0" + "mq_timedsend\0" + "mq_timedsend_time64\0" + "mq_unlink\0" + "msgctl\0" + "msgget\0" + "msgrcv\0" + "msgsnd\0" + "pipe\0" + "pipe2\0" + "process_madvise\0" + "process_vm_readv\0" + "process_vm_writev\0" + "semctl\0" + "semget\0" + "semop\0" + "semtimedop\0" + "semtimedop_time64\0" + "shmat\0" + "shmctl\0" + "shmdt\0" + "shmget\0" + }, + [SYSCALL_FILTER_SET_KEYRING] = { + .name = "@keyring", + .help = "Kernel keyring access", + .value = + "add_key\0" + "keyctl\0" + "request_key\0" + }, + [SYSCALL_FILTER_SET_MEMLOCK] = { + .name = 
"@memlock", + .help = "Memory locking control", + .value = + "mlock\0" + "mlock2\0" + "mlockall\0" + "munlock\0" + "munlockall\0" + }, + [SYSCALL_FILTER_SET_MODULE] = { + .name = "@module", + .help = "Loading and unloading of kernel modules", + .value = + "delete_module\0" + "finit_module\0" + "init_module\0" + }, + [SYSCALL_FILTER_SET_MOUNT] = { + .name = "@mount", + .help = "Mounting and unmounting of file systems", + .value = + "chroot\0" + "fsconfig\0" + "fsmount\0" + "fsopen\0" + "fspick\0" + "mount\0" + "mount_setattr\0" + "move_mount\0" + "open_tree\0" + "pivot_root\0" + "umount\0" + "umount2\0" + }, + [SYSCALL_FILTER_SET_NETWORK_IO] = { + .name = "@network-io", + .help = "Network or Unix socket IO, should not be needed if not network facing", + .value = + "accept\0" + "accept4\0" + "bind\0" + "connect\0" + "getpeername\0" + "getsockname\0" + "getsockopt\0" + "listen\0" + "recv\0" + "recvfrom\0" + "recvmmsg\0" + "recvmmsg_time64\0" + "recvmsg\0" + "send\0" + "sendmmsg\0" + "sendmsg\0" + "sendto\0" + "setsockopt\0" + "shutdown\0" + "socket\0" + "socketcall\0" + "socketpair\0" + }, + [SYSCALL_FILTER_SET_OBSOLETE] = { + /* some unknown even to libseccomp */ + .name = "@obsolete", + .help = "Unusual, obsolete or unimplemented system calls", + .value = + "_sysctl\0" + "afs_syscall\0" + "bdflush\0" + "break\0" + "create_module\0" + "ftime\0" + "get_kernel_syms\0" + "getpmsg\0" + "gtty\0" + "idle\0" + "lock\0" + "mpx\0" + "prof\0" + "profil\0" + "putpmsg\0" + "query_module\0" + "security\0" + "sgetmask\0" + "ssetmask\0" + "stime\0" + "stty\0" + "sysfs\0" + "tuxcall\0" + "ulimit\0" + "uselib\0" + "ustat\0" + "vserver\0" + }, + [SYSCALL_FILTER_SET_PKEY] = { + .name = "@pkey", + .help = "System calls used for memory protection keys", + .value = + "pkey_alloc\0" + "pkey_free\0" + "pkey_mprotect\0" + }, + [SYSCALL_FILTER_SET_PRIVILEGED] = { + .name = "@privileged", + .help = "All system calls which need super-user capabilities", + .value = + "@chown\0" + "@clock\0" + 
"@module\0" + "@raw-io\0" + "@reboot\0" + "@swap\0" + "_sysctl\0" + "acct\0" + "bpf\0" + "capset\0" + "chroot\0" + "fanotify_init\0" + "fanotify_mark\0" + "nfsservctl\0" + "open_by_handle_at\0" + "pivot_root\0" + "quotactl\0" + "quotactl_fd\0" + "setdomainname\0" + "setfsuid\0" + "setfsuid32\0" + "setgroups\0" + "setgroups32\0" + "sethostname\0" + "setresuid\0" + "setresuid32\0" + "setreuid\0" + "setreuid32\0" + "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */ + "setuid32\0" + "vhangup\0" + }, + [SYSCALL_FILTER_SET_PROCESS] = { + .name = "@process", + .help = "Process control, execution, namespacing operations", + .value = + "capget\0" /* Able to query arbitrary processes */ + "clone\0" + /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't + * implement seccomp, so we don't need to list it at all. C.f. + * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */ + "clone3\0" + "execveat\0" + "fork\0" + "getrusage\0" + "kill\0" + "pidfd_open\0" + "pidfd_send_signal\0" + "prctl\0" + "rt_sigqueueinfo\0" + "rt_tgsigqueueinfo\0" + "setns\0" + "swapcontext\0" /* Some archs e.g. 
powerpc32 are using it to do userspace context switches */ + "tgkill\0" + "times\0" + "tkill\0" + "unshare\0" + "vfork\0" + "wait4\0" + "waitid\0" + "waitpid\0" + }, + [SYSCALL_FILTER_SET_RAW_IO] = { + .name = "@raw-io", + .help = "Raw I/O port access", + .value = + "ioperm\0" + "iopl\0" + "pciconfig_iobase\0" + "pciconfig_read\0" + "pciconfig_write\0" + "s390_pci_mmio_read\0" + "s390_pci_mmio_write\0" + }, + [SYSCALL_FILTER_SET_REBOOT] = { + .name = "@reboot", + .help = "Reboot and reboot preparation/kexec", + .value = + "kexec_file_load\0" + "kexec_load\0" + "reboot\0" + }, + [SYSCALL_FILTER_SET_RESOURCES] = { + .name = "@resources", + .help = "Alter resource settings", + .value = + "ioprio_set\0" + "mbind\0" + "migrate_pages\0" + "move_pages\0" + "nice\0" + "sched_setaffinity\0" + "sched_setattr\0" + "sched_setparam\0" + "sched_setscheduler\0" + "set_mempolicy\0" + "set_mempolicy_home_node\0" + "setpriority\0" + "setrlimit\0" + }, + [SYSCALL_FILTER_SET_SETUID] = { + .name = "@setuid", + .help = "Operations for changing user/group credentials", + .value = + "setgid\0" + "setgid32\0" + "setgroups\0" + "setgroups32\0" + "setregid\0" + "setregid32\0" + "setresgid\0" + "setresgid32\0" + "setresuid\0" + "setresuid32\0" + "setreuid\0" + "setreuid32\0" + "setuid\0" + "setuid32\0" + }, + [SYSCALL_FILTER_SET_SIGNAL] = { + .name = "@signal", + .help = "Process signal handling", + .value = + "rt_sigaction\0" + "rt_sigpending\0" + "rt_sigprocmask\0" + "rt_sigsuspend\0" + "rt_sigtimedwait\0" + "rt_sigtimedwait_time64\0" + "sigaction\0" + "sigaltstack\0" + "signal\0" + "signalfd\0" + "signalfd4\0" + "sigpending\0" + "sigprocmask\0" + "sigsuspend\0" + }, + [SYSCALL_FILTER_SET_SWAP] = { + .name = "@swap", + .help = "Enable/disable swap devices", + .value = + "swapoff\0" + "swapon\0" + }, + [SYSCALL_FILTER_SET_SYNC] = { + .name = "@sync", + .help = "Synchronize files and memory to storage", + .value = + "fdatasync\0" + "fsync\0" + "msync\0" + "sync\0" + "sync_file_range\0" + 
"sync_file_range2\0" + "syncfs\0" + }, + [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = { + .name = "@system-service", + .help = "General system service operations", + .value = + "@aio\0" + "@basic-io\0" + "@chown\0" + "@default\0" + "@file-system\0" + "@io-event\0" + "@ipc\0" + "@keyring\0" + "@memlock\0" + "@network-io\0" + "@process\0" + "@resources\0" + "@setuid\0" + "@signal\0" + "@sync\0" + "@timer\0" + "arm_fadvise64_64\0" + "capget\0" + "capset\0" + "copy_file_range\0" + "fadvise64\0" + "fadvise64_64\0" + "flock\0" + "get_mempolicy\0" + "getcpu\0" + "getpriority\0" + "ioctl\0" + "ioprio_get\0" + "kcmp\0" + "madvise\0" + "mremap\0" + "name_to_handle_at\0" + "oldolduname\0" + "olduname\0" + "personality\0" + "readahead\0" + "readdir\0" + "remap_file_pages\0" + "sched_get_priority_max\0" + "sched_get_priority_min\0" + "sched_getattr\0" + "sched_getparam\0" + "sched_getscheduler\0" + "sched_rr_get_interval\0" + "sched_rr_get_interval_time64\0" + "sched_yield\0" + "sendfile\0" + "sendfile64\0" + "setfsgid\0" + "setfsgid32\0" + "setfsuid\0" + "setfsuid32\0" + "setpgid\0" + "setsid\0" + "splice\0" + "sysinfo\0" + "tee\0" + "umask\0" + "uname\0" + "userfaultfd\0" + "vmsplice\0" + }, + [SYSCALL_FILTER_SET_TIMER] = { + .name = "@timer", + .help = "Schedule operations by time", + .value = + "alarm\0" + "getitimer\0" + "setitimer\0" + "timer_create\0" + "timer_delete\0" + "timer_getoverrun\0" + "timer_gettime\0" + "timer_gettime64\0" + "timer_settime\0" + "timer_settime64\0" + "timerfd_create\0" + "timerfd_gettime\0" + "timerfd_gettime64\0" + "timerfd_settime\0" + "timerfd_settime64\0" + "times\0" + }, + [SYSCALL_FILTER_SET_KNOWN] = { + .name = "@known", + .help = "All known syscalls declared in the kernel", + .value = + "@obsolete\0" +#include "syscall-list.h" + }, +}; + +const SyscallFilterSet *syscall_filter_set_find(const char *name) { + if (isempty(name) || name[0] != '@') + return NULL; + + for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) + if 
(streq(syscall_filter_sets[i].name, name)) + return syscall_filter_sets + i; + + return NULL; +} + +/* Forward declaration: seccomp_add_syscall_filter_item() and add_syscall_filter_set() call each other so that '@' groups nested inside other groups get expanded. */ static int add_syscall_filter_set( + scmp_filter_ctx seccomp, + const SyscallFilterSet *set, + uint32_t action, + char **exclude, + bool log_missing, + char ***added); + +/* Adds a rule with 'action' for 'name' to the filter. 'name' is either a single system call name or an '@'-prefixed filter set name, which is looked up and expanded member by member. Names contained in 'exclude' are skipped. Unknown system call names are ignored (and logged at debug level only if 'log_missing' is set); an unknown '@' set yields -EINVAL. If 'added' is non-NULL each handled system call name is appended to it. Returns 0 on success or when the entry was skipped, a negative value otherwise. NOTE(review): 'seccomp' is declared as scmp_filter_ctx* here but is passed on unchanged to callees that take a scmp_filter_ctx — confirm against the header which one is intended. */ int seccomp_add_syscall_filter_item( + scmp_filter_ctx *seccomp, + const char *name, + uint32_t action, + char **exclude, + bool log_missing, + char ***added) { + + assert(seccomp); + assert(name); + + if (strv_contains(exclude, name)) + return 0; + + /* Any syscalls that are handled are added to the *added strv. The pointer + * must be either NULL or point to a valid pre-initialized possibly-empty strv. */ + + if (name[0] == '@') { + const SyscallFilterSet *other; + + other = syscall_filter_set_find(name); + if (!other) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Filter set %s is not known!", + name); + + return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added); + + } else { + int id, r; + + id = seccomp_syscall_resolve_name(name); + if (id == __NR_SCMP_ERROR) { + if (log_missing) + log_debug("System call %s is not known, ignoring.", name); + return 0; + } + + r = seccomp_rule_add_exact(seccomp, action, id, 0); + if (r < 0) { + /* If the system call is not known on this architecture, then that's fine, let's ignore it */ + bool ignore = r == -EDOM; + + if (!ignore || log_missing) + log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m", + name, id, ignore ? ", ignoring" : ""); + if (!ignore) + return r; + } + + /* Reached both when the rule was added and when an -EDOM failure was ignored above, so the name is recorded in either case. */ if (added) { + r = strv_extend(added, name); + if (r < 0) + return r; + } + + return 0; + } +} + +static int add_syscall_filter_set( + scmp_filter_ctx seccomp, + const SyscallFilterSet *set, + uint32_t action, + char **exclude, + bool log_missing, + char ***added) { + + const char *sys; + int r; + + /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. 
*/ + + assert(seccomp); + assert(set); + + NULSTR_FOREACH(sys, set->value) { + r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added); + if (r < 0) + return r; + } + + return 0; +} + +/* Builds and loads one filter per local architecture: each filter uses 'default_action' as its default and applies 'action' to every member of 'set'. Returns 0 once all architectures were processed. A failure to initialize a context or to add the set is returned immediately, as is a load error that ERRNO_IS_SECCOMP_FATAL() classifies as fatal; any other load failure is logged at debug level and that architecture is skipped. */ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) { + uint32_t arch; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for + * each local arch. */ + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, default_action); + if (r < 0) + return r; + + /* No exclusion list and no bookkeeping of the added names is needed here, hence the two NULLs. */ r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to add filter set: %m"); + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) { + uint32_t arch; + int r; + + /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead + * of a SyscallFilterSet* table. 
*/ + + if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW) + return 0; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + void *syscall_id, *val; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, default_action); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(val, syscall_id, filter) { + uint32_t a = action; + int id = PTR_TO_INT(syscall_id) - 1; + int error = PTR_TO_INT(val); + + if (error == SECCOMP_ERROR_NUMBER_KILL) + a = scmp_act_kill_process(); +#ifdef SCMP_ACT_LOG + else if (action == SCMP_ACT_LOG) + a = SCMP_ACT_LOG; +#endif + else if (error >= 0) + a = SCMP_ACT_ERRNO(error); + + r = seccomp_rule_add_exact(seccomp, a, id, 0); + if (r < 0) { + /* If the system call is not known on this architecture, then that's + * fine, let's ignore it */ + _cleanup_free_ char *n = NULL; + bool ignore; + + n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id); + ignore = r == -EDOM; + if (!ignore || log_missing) + log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m", + strna(n), id, ignore ? 
", ignoring" : ""); + if (!ignore) + return r; + } + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_parse_syscall_filter( + const char *name, + int errno_num, + Hashmap *filter, + SeccompParseFlags flags, + const char *unit, + const char *filename, + unsigned line) { + + int r; + + assert(name); + assert(filter); + + if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0) + return -EINVAL; + + if (name[0] == '@') { + const SyscallFilterSet *set; + const char *i; + + set = syscall_filter_set_find(name); + if (!set) { + if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE)) + return -EINVAL; + + log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "Unknown system call group, ignoring: %s", name); + return 0; + } + + NULSTR_FOREACH(i, set->value) { + /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take + * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem, + * not a problem in user configuration data and we shouldn't pretend otherwise by complaining + * about them. */ + r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line); + if (r < 0) + return r; + } + } else { + int id; + + id = seccomp_syscall_resolve_name(name); + if (id == __NR_SCMP_ERROR) { + if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE)) + return -EINVAL; + + log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "Failed to parse system call, ignoring: %s", name); + return 0; + } + + /* If we previously wanted to forbid a syscall and now we want to allow it, then remove + * it from the list. 
The entries in allow-list with non-negative error value will be + * handled with SCMP_ACT_ERRNO() instead of the default action. */ + if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) || + (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) { + r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)); + if (r < 0) + switch (r) { + case -ENOMEM: + return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM; + case -EEXIST: + assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0); + break; + default: + return r; + } + } else + (void) hashmap_remove(filter, INT_TO_PTR(id + 1)); + } + + return 0; +} + +int seccomp_restrict_namespaces(unsigned long retain) { + uint32_t arch; + int r; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *s = NULL; + + (void) namespace_flags_to_string(retain, &s); + log_debug("Restricting namespace to: %s.", strna(s)); + } + + /* NOOP? */ + if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL)) + return 0; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + /* We cannot filter on individual flags to clone3(), and we need to disable the + * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other + * users shall fall back to clone(), as if on an older kernel. + * + * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330, + * https://github.com/moby/moby/issues/42680. 
*/ + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(clone3), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch)); + + if ((retain & NAMESPACE_FLAGS_ALL) == 0) + /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall + * altogether. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 0); + else + /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the + * special invocation with a zero flags argument, right here. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_EQ, 0)); + if (r < 0) { + log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + for (unsigned i = 0; namespace_info[i].proc_name; i++) { + unsigned long f; + + f = namespace_info[i].clone_flag; + if (FLAGS_SET(retain, f)) { + log_debug("Permitting %s.", namespace_info[i].proc_name); + continue; + } + + log_trace("Blocking %s.", namespace_info[i].proc_name); + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(unshare), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + break; + } + + /* On s390/s390x the first two parameters to clone are switched */ + if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X)) + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(clone), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + else + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(clone), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", 
seccomp_arch_to_string(arch)); + break; + } + + if ((retain & NAMESPACE_FLAGS_ALL) != 0) { + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + break; + } + } + } + if (r < 0) + continue; + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_protect_sysctl(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + if (IN_SET(arch, + SCMP_ARCH_AARCH64, +#ifdef SCMP_ARCH_RISCV64 + SCMP_ARCH_RISCV64, +#endif + SCMP_ARCH_X32 + )) + /* No _sysctl syscall */ + continue; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(_sysctl), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_protect_syslog(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(syslog), + 0); + + if (r < 0) { + log_debug_errno(r, 
"Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_restrict_address_families(Set *address_families, bool allow_list) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + bool supported; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + switch (arch) { + + case SCMP_ARCH_X86_64: + case SCMP_ARCH_X32: + case SCMP_ARCH_ARM: + case SCMP_ARCH_AARCH64: + case SCMP_ARCH_MIPSEL64N32: + case SCMP_ARCH_MIPS64N32: + case SCMP_ARCH_MIPSEL64: + case SCMP_ARCH_MIPS64: +#ifdef SCMP_ARCH_RISCV64 + case SCMP_ARCH_RISCV64: +#endif + /* These we know we support (i.e. are the ones that do not use socketcall()) */ + supported = true; + break; + + case SCMP_ARCH_S390: + case SCMP_ARCH_S390X: + case SCMP_ARCH_X86: + case SCMP_ARCH_MIPSEL: + case SCMP_ARCH_MIPS: +#ifdef SCMP_ARCH_PARISC + case SCMP_ARCH_PARISC: +#endif +#ifdef SCMP_ARCH_PARISC64 + case SCMP_ARCH_PARISC64: +#endif + case SCMP_ARCH_PPC: + case SCMP_ARCH_PPC64: + case SCMP_ARCH_PPC64LE: + default: + /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we + * don't know */ + supported = false; + break; + } + + if (!supported) + continue; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + if (allow_list) { + int first = 0, last = 0; + void *afp; + + /* If this is an allow list, we first block the address families that are out of + * range and then everything that is not in the set. First, we find the lowest and + * highest address family in the set. 
*/ + + SET_FOREACH(afp, address_families) { + int af = PTR_TO_INT(afp); + + if (af <= 0 || af >= af_max()) + continue; + + if (first == 0 || af < first) + first = af; + + if (last == 0 || af > last) + last = af; + } + + assert((first == 0) == (last == 0)); + + if (first == 0) { + + /* No entries in the valid range, block everything */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + } else { + + /* Block everything below the first entry */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_LT, first)); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + /* Block everything above the last entry */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_GT, last)); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + /* Block everything between the first and last entry */ + for (int af = 1; af < af_max(); af++) { + + if (set_contains(address_families, INT_TO_PTR(af))) + continue; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_EQ, af)); + if (r < 0) + break; + } + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + } + + } else { + void *af; + + /* If this is a deny list, then generate one rule for each address family that are + * then combined in OR checks. 
*/ + + SET_FOREACH(af, address_families) { + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af))); + if (r < 0) + break; + } + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_restrict_realtime_full(int error_code) { + static const int permitted_policies[] = { + SCHED_OTHER, + SCHED_BATCH, + SCHED_IDLE, + }; + + int r, max_policy = 0; + uint32_t arch; + unsigned i; + + assert(error_code > 0); + + /* Determine the highest policy constant we want to allow */ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] > max_policy) + max_policy = permitted_policies[i]; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int p; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + /* Go through all policies with lower values than that, and block them -- unless they appear in the + * allow list. */ + for (p = 0; p < max_policy; p++) { + bool good = false; + + /* Check if this is in the allow list. 
*/ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] == p) { + good = true; + break; + } + + if (good) + continue; + + /* Deny this policy */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(error_code), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_EQ, p)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + } + + /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons + * are unsigned here, hence no need no check for < 0 values. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(error_code), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_GT, max_policy)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp, + uint32_t arch, + int nr, + unsigned arg_cnt, + const struct scmp_arg_cmp arg) { + int r; + + r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg); + if (r < 0) { + _cleanup_free_ char *n = NULL; + + n = seccomp_syscall_resolve_num_arch(arch, nr); + log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m", + strna(n), + seccomp_arch_to_string(arch)); + } + + return r; +} + +/* For known architectures, check that syscalls are indeed defined or not. 
*/ +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64) +assert_cc(SCMP_SYS(shmget) > 0); +assert_cc(SCMP_SYS(shmat) > 0); +assert_cc(SCMP_SYS(shmdt) > 0); +#endif + +int seccomp_memory_deny_write_execute(void) { + uint32_t arch; + unsigned loaded = 0; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + switch (arch) { + + /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc(). + * We ignore that here, which means there's still a way to get writable/executable + * memory, if an IPC key is mapped like this. That's a pity, but no total loss. + * + * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress + * on that front (kernel work done in 5.18). + */ + + case SCMP_ARCH_X86: + case SCMP_ARCH_S390: + filter_syscall = SCMP_SYS(mmap2); + block_syscall = SCMP_SYS(mmap); + /* shmat multiplexed, see above */ + break; + + case SCMP_ARCH_PPC: + case SCMP_ARCH_PPC64: + case SCMP_ARCH_PPC64LE: + case SCMP_ARCH_S390X: + filter_syscall = SCMP_SYS(mmap); + /* shmat multiplexed, see above */ + break; + + case SCMP_ARCH_ARM: + filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */ + shmat_syscall = SCMP_SYS(shmat); + break; + + case SCMP_ARCH_X86_64: + case SCMP_ARCH_X32: + case SCMP_ARCH_AARCH64: +#ifdef SCMP_ARCH_RISCV64 + case SCMP_ARCH_RISCV64: +#endif + filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */ + shmat_syscall = SCMP_SYS(shmat); + break; + + /* Please add more definitions here, if you port systemd to other architectures! 
*/ + +#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) +#warning "Consider adding the right mmap() syscall definitions here!" +#endif + } + + /* Can't filter mmap() on this arch, then skip it */ + if (filter_syscall == 0) + continue; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall, + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE)); + if (r < 0) + continue; + + if (block_syscall != 0) { + r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} ); + if (r < 0) + continue; + } + + r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); + if (r < 0) + continue; + + r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); + if (r < 0) + continue; + + if (shmat_syscall > 0) { + r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall, + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC)); + if (r < 0) + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + loaded++; + } + + if (loaded == 0) + log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=."); + + return loaded; +} + +int seccomp_restrict_archs(Set *archs) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int r; + bool blocked_new = false; + + /* This installs a filter with no rules, but that restricts the system call architectures to the specified + * 
list. + * + * There are some qualifications. However the most important use is to stop processes from bypassing + * system call restrictions, in case they used a broader (multiplexing) syscall which is only available + * in a non-native architecture. There are no holes in this use case, at least so far. */ + + /* Note libseccomp includes our "native" (current) architecture in the filter by default. + * We do not remove it. For example, our callers expect to be able to call execve() afterwards + * to run a program with the restrictions applied. */ + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) { + uint32_t arch = seccomp_local_archs[i]; + + /* See above comment, our "native" architecture is never blocked. */ + if (arch == seccomp_arch_native()) + continue; + + /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */ + if (arch == SECCOMP_LOCAL_ARCH_BLOCKED) + continue; + + bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1)); + + /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32 + * x32 syscalls should basically match x86-64 for everything except the pointer type. + * The important thing is that you can block the old 32-bit x86 syscalls. + * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */ + if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32) + block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1)); + + if (block) { + seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED; + blocked_new = true; + } else { + r = seccomp_arch_add(seccomp, arch); + if (r < 0 && r != -EEXIST) + return r; + } + } + + /* All architectures that will be blocked by the seccomp program were + * already blocked. 
*/ + if (!blocked_new) + return 0; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + return r; + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m"); + + return 0; +} + +int parse_syscall_archs(char **l, Set **ret_archs) { + _cleanup_set_free_ Set *archs = NULL; + int r; + + assert(l); + assert(ret_archs); + + STRV_FOREACH(s, l) { + uint32_t a; + + r = seccomp_arch_from_string(*s, &a); + if (r < 0) + return -EINVAL; + + r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1)); + if (r < 0) + return -ENOMEM; + } + + *ret_archs = TAKE_PTR(archs); + return 0; +} + +int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) { + const char *i; + int r; + + assert(set); + + NULSTR_FOREACH(i, set->value) { + + if (i[0] == '@') { + const SyscallFilterSet *more; + + more = syscall_filter_set_find(i); + if (!more) + return -ENXIO; + + r = seccomp_filter_set_add(filter, add, more); + if (r < 0) + return r; + } else { + int id; + + id = seccomp_syscall_resolve_name(i); + if (id == __NR_SCMP_ERROR) { + log_debug("Couldn't resolve system call, ignoring: %s", i); + continue; + } + + if (add) { + r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1)); + if (r < 0) + return r; + } else + (void) hashmap_remove(filter, INT_TO_PTR(id + 1)); + } + } + + return 0; +} + +int seccomp_lock_personality(unsigned long personality) { + uint32_t arch; + int r; + + if (personality >= PERSONALITY_INVALID) + return -EINVAL; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(personality), + 1, + SCMP_A0(SCMP_CMP_NE, personality)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, 
skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_protect_hostname(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(sethostname), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setdomainname), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) { + /* Checks the mode_t parameter of the following system calls: + * + * → chmod() + fchmod() + fchmodat() + * → open() + creat() + openat() + * → mkdir() + mkdirat() + * → mknod() + mknodat() + * + * Returns error if *everything* failed, and 0 otherwise. 
+ */ + int r; + bool any = false; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(chmod), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for chmod: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(fchmod), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for fchmod: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(fchmodat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for fchmodat: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mkdir), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mkdir: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mkdirat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mkdirat: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mknod), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mknod: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mknodat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mknodat: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(open), + 2, + SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT), + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for open: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(openat), + 2, + 
SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT), + SCMP_A3(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat: %m"); + else + any = true; + +#if defined(__SNR_openat2) + /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into + * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do + * for now, since openat2() is very new and code generally needs fallback logic anyway to be + * compatible with kernels that are not absolutely recent. We would normally return EPERM for a + * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs + * to call open() or openat() instead. We can properly enforce policy for those functions. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(openat2), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat2: %m"); + else + any = true; +#endif + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(creat), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for creat: %m"); + else + any = true; + + return any ? 
0 : r; +} + +int seccomp_restrict_suid_sgid(void) { + uint32_t arch; + int r, k; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_restrict_sxid(seccomp, S_ISUID); + if (r < 0) + log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch)); + + k = seccomp_restrict_sxid(seccomp, S_ISGID); + if (k < 0) + log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch)); + + if (r < 0 && k < 0) + continue; + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +uint32_t scmp_act_kill_process(void) { + + /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never + * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of + * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least + * for single-threaded apps does the right thing. */ + +#ifdef SCMP_ACT_KILL_PROCESS + if (seccomp_api_get() >= 3) + return SCMP_ACT_KILL_PROCESS; +#endif + + return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */ +} + +int parse_syscall_and_errno(const char *in, char **name, int *error) { + _cleanup_free_ char *n = NULL; + char *p; + int e = -1; + + assert(in); + assert(name); + assert(error); + + /* + * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255". + * If errno is omitted, then error is set to -1. + * Empty syscall name is not allowed. + * Here, we do not check that the syscall name is valid or not. 
+ */ + + p = strchr(in, ':'); + if (p) { + e = seccomp_parse_errno_or_action(p + 1); + if (e < 0) + return e; + + n = strndup(in, p - in); + } else + n = strdup(in); + + if (!n) + return -ENOMEM; + + if (isempty(n)) + return -EINVAL; + + *error = e; + *name = TAKE_PTR(n); + + return 0; +} + +static int block_open_flag(scmp_filter_ctx seccomp, int flag) { + bool any = false; + int r; + + /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return + * EINVAL, in the hope the client code will retry without O_SYNC then. */ + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EINVAL), + SCMP_SYS(open), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for open: %m"); + else + any = true; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EINVAL), + SCMP_SYS(openat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat: %m"); + else + any = true; + +#if defined(__SNR_openat2) + /* The new openat2() system call can't be filtered sensibly, see above. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(openat2), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat2: %m"); + else + any = true; +#endif + + return any ? 
0 : r; +} + +int seccomp_suppress_sync(void) { + uint32_t arch; + int r; + + /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately + * manageable, and also masks O_SYNC/O_DSYNC */ + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + const char *c; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) { + int id; + + id = seccomp_syscall_resolve_name(c); + if (id == __NR_SCMP_ERROR) { + log_debug("System call %s is not known, ignoring.", c); + continue; + } + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */ + id, + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c); + } + + (void) block_open_flag(seccomp, O_SYNC); +#if O_DSYNC != O_SYNC + (void) block_open_flag(seccomp, O_DSYNC); +#endif + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} |