/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include "userns-restrict.h"

#if HAVE_VMLINUX_H

#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "fd-util.h"
#include "fs-util.h"
#include "lsm-util.h"
#include "missing_mount.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "path-util.h"

#define USERNS_MAX (16U*1024U)
#define MOUNTS_MAX 4096U

#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"

struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
        (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */
        return NULL;
}

static int make_inner_hash_map(void) {
        int fd;

        fd = compat_bpf_map_create(
                        BPF_MAP_TYPE_HASH,
                        NULL,
                        sizeof(int),
                        sizeof(uint32_t),
                        MOUNTS_MAX,
                        NULL);
        if (fd < 0)
                return log_debug_errno(errno, "Failed to allocate inner BPF map: %m");

        return fd;
}

int userns_restrict_install(
                bool pin,
                struct userns_restrict_bpf **ret) {

        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
        _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
        int r;

        r = lsm_supported("bpf");
        if (r < 0)
                return r;
        if (r == 0)
                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");

        r = dlopen_bpf();
        if (r < 0)
                return r;

        /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
        if (!sym_bpf_object__next_map)
                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");

        obj = userns_restrict_bpf__open();
        if (!obj)
                return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");
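
        /* Note on persistence (explanatory sketch, inferred from the pin paths below rather than
         * guaranteed): with pin=true the maps and program links end up pinned in bpffs under the two
         * prefixes defined above, roughly
         *
         *     /sys/fs/bpf/systemd/userns-restrict/maps/<map-name>
         *     /sys/fs/bpf/systemd/userns-restrict/programs/<hook-name>
         *
         * so a restarted caller can reopen the existing attachments (see the sym_bpf_link__open()
         * branch further down) instead of attaching the LSM programs a second time. */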
        if (pin) {
                struct bpf_map *map;

                /* libbpf will only create one level of dirs. Let's create the rest. */
                (void) mkdir_p(MAP_LINK_PREFIX, 0755);
                (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);

                map = sym_bpf_object__next_map(obj->obj, NULL);
                while (map) {
                        _cleanup_free_ char *fn = NULL;

                        fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
                        if (!fn)
                                return log_oom();

                        r = sym_bpf_map__set_pin_path(map, fn);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);

                        map = sym_bpf_object__next_map(obj->obj, map);
                }
        }

        r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
        if (r < 0)
                return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");

        r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
        if (r < 0)
                return log_error_errno(r, "Failed to size userns ring buffer: %m");

        /* Dummy map to satisfy the verifier */
        dummy_mnt_id_hash_fd = make_inner_hash_map();
        if (dummy_mnt_id_hash_fd < 0)
                return dummy_mnt_id_hash_fd;

        r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to set inner BPF map: %m");

        r = userns_restrict_bpf__load(obj);
        if (r < 0)
                return log_error_errno(r, "Failed to load BPF object: %m");

        for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
                _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
                struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
                _cleanup_free_ char *fn = NULL;
                bool linked = false;
                const char *e;

                e = startswith(ps->name, "userns_restrict_");
                assert(e);

                if (pin) {
                        fn = path_join(PROGRAM_LINK_PREFIX, e);
                        if (!fn)
                                return log_oom();

                        link = sym_bpf_link__open(fn);
                        r = bpf_get_error_translated(link);
                        if (r < 0) {
                                if (r != -ENOENT)
                                        return log_error_errno(r, "Unable to open pinned program link: %m");
                                link = NULL;
                        } else {
                                linked = true;
                                log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
                        }
                }

                if (!link) {
                        link = sym_bpf_program__attach(*ps->prog);
                        r = bpf_get_error_translated(link);
                        if (r < 0)
                                return log_error_errno(r, "Failed to attach LSM BPF program: %m");

                        log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
                }

                if (pin && !linked) {
                        assert(fn);

                        r = sym_bpf_link__pin(link, fn);
                        if (r < 0)
                                return log_error_errno(r, "Failed to pin LSM attachment: %m");
                }

                *ps->link = TAKE_PTR(link);
        }

        if (pin) {
                r = sym_bpf_object__pin_maps(obj->obj, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to pin BPF maps: %m");
        }

        if (ret)
                *ret = TAKE_PTR(obj);

        return 0;
}
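
/* Usage sketch (hypothetical caller, not part of this file; error handling abbreviated): install the
 * LSM hooks once, then allow-list the mounts a given user namespace may manipulate. The path
 * "/run/hypothetical" merely stands in for whatever mount the caller wants to permit.
 *
 *         _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
 *         _cleanup_close_ int userns_fd = -EBADF, mount_fd = -EBADF;
 *
 *         r = userns_restrict_install(true, &obj);
 *         if (r < 0)
 *                 return r;
 *
 *         userns_fd = open("/proc/self/ns/user", O_RDONLY|O_CLOEXEC);
 *         mount_fd = open("/run/hypothetical", O_PATH|O_CLOEXEC);
 *
 *         r = userns_restrict_put_by_fd(obj, userns_fd, false, &mount_fd, 1);
 */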
int userns_restrict_put_by_inode(
                struct userns_restrict_bpf *obj,
                uint64_t userns_inode,
                bool replace,
                const int mount_fds[],
                size_t n_mount_fds) {

        _cleanup_close_ int inner_map_fd = -EBADF;
        _cleanup_free_ int *mnt_ids = NULL;
        uint64_t ino = userns_inode;
        int r, outer_map_fd;

        assert(obj);
        assert(userns_inode != 0);
        assert(n_mount_fds == 0 || mount_fds);

        /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
         * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
         * inode let's refuse early, we can't support this with the current BPF code for now. */
        if (userns_inode > UINT32_MAX)
                return -EINVAL;

        mnt_ids = new(int, n_mount_fds);
        if (!mnt_ids)
                return -ENOMEM;

        for (size_t i = 0; i < n_mount_fds; i++) {
                r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
                if (r < 0)
                        return log_debug_errno(r, "Failed to get mount ID: %m");
        }

        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
        if (outer_map_fd < 0)
                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");

        if (replace) {
                /* Add if missing, replace if already exists */
                inner_map_fd = make_inner_hash_map();
                if (inner_map_fd < 0)
                        return inner_map_fd;

                r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_ANY);
                if (r < 0)
                        return log_debug_errno(r, "Failed to replace map in inode hash: %m");
        } else {
                /* Let's add an entry for this userns inode if missing. If it exists just extend the
                 * existing map. We might race against each other, hence we try a couple of times. */
                for (size_t n_try = 10;; n_try--) {
                        uint32_t innermap_id;

                        if (n_try == 0)
                                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
                                                       "Still can't create inode entry in BPF map after 10 tries.");

                        r = sym_bpf_map_lookup_elem(outer_map_fd, &ino, &innermap_id);
                        if (r >= 0) {
                                inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
                                if (inner_map_fd < 0)
                                        return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");

                                break;
                        }
                        if (errno != ENOENT)
                                return log_debug_errno(errno, "Failed to look up inode hash entry: %m");

                        /* No entry for this user namespace yet. Let's create one. */
                        inner_map_fd = make_inner_hash_map();
                        if (inner_map_fd < 0)
                                return inner_map_fd;

                        r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_NOEXIST);
                        if (r >= 0)
                                break;
                        if (errno != EEXIST)
                                return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");
                }
        }

        FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
                uint32_t dummy_value = 1;

                r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
                if (r < 0)
                        return log_debug_errno(r, "Failed to add mount ID to map: %m");

                log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, ino);
        }

        return 0;
}

int userns_restrict_put_by_fd(
                struct userns_restrict_bpf *obj,
                int userns_fd,
                bool replace,
                const int mount_fds[],
                size_t n_mount_fds) {

        struct stat st;
        int r;

        assert(obj);
        assert(userns_fd >= 0);
        assert(n_mount_fds == 0 || mount_fds);

        r = fd_is_ns(userns_fd, CLONE_NEWUSER);
        if (r < 0)
                return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
        if (r == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");

        if (fstat(userns_fd, &st) < 0)
                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");

        return userns_restrict_put_by_inode(
                        obj,
                        st.st_ino,
                        replace,
                        mount_fds,
                        n_mount_fds);
}

int userns_restrict_reset_by_inode(
                struct userns_restrict_bpf *obj,
                uint64_t ino) {

        int r, outer_map_fd;
        unsigned u;

        assert(obj);
        assert(ino != 0);

        if (ino > UINT32_MAX) /* inodes larger than 32bit are definitely not included in our map, exit early */
                return 0;

        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
        if (outer_map_fd < 0)
                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");

        u = (uint32_t) ino;

        r = sym_bpf_map_delete_elem(outer_map_fd, &u);
        if (r < 0)
                return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);

        return 0;
}
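
/* Cleanup sketch (hypothetical caller, not part of this file): once a user namespace goes away, its
 * allow-list entry can be dropped again, keyed by the same inode that registered it in
 * userns_restrict_put_by_fd() above:
 *
 *         struct stat st;
 *
 *         if (fstat(userns_fd, &st) >= 0)
 *                 (void) userns_restrict_reset_by_inode(obj, st.st_ino);
 */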
support disabled."); } struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) { return NULL; } int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) { return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); } int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) { return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); } int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) { return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); } #endif