summaryrefslogtreecommitdiffstats
path: root/src/nsresourced/userns-restrict.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/nsresourced/userns-restrict.c346
1 files changed, 346 insertions, 0 deletions
diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c
new file mode 100644
index 0000000..be33f49
--- /dev/null
+++ b/src/nsresourced/userns-restrict.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "userns-restrict.h"
+
+#if HAVE_VMLINUX_H
+
+#include <sched.h>
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "lsm-util.h"
+#include "missing_mount.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+
+#define USERNS_MAX (16U*1024U)
+#define MOUNTS_MAX 4096U
+
+#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
+#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"
+
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+ (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */
+ return NULL;
+}
+
+static int make_inner_hash_map(void) {
+ int fd;
+
+ fd = compat_bpf_map_create(
+ BPF_MAP_TYPE_HASH,
+ NULL,
+ sizeof(int),
+ sizeof(uint32_t),
+ MOUNTS_MAX,
+ NULL);
+ if (fd < 0)
+ return log_debug_errno(errno, "Failed allocate inner BPF map: %m");
+
+ return fd;
+}
+
+int userns_restrict_install(
+ bool pin,
+ struct userns_restrict_bpf **ret) {
+
+ _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
+ _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
+ int r;
+
+ r = lsm_supported("bpf");
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");
+
+ r = dlopen_bpf();
+ if (r < 0)
+ return r;
+
+ /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
+ if (!sym_bpf_object__next_map)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");
+
+ obj = userns_restrict_bpf__open();
+ if (!obj)
+ return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");
+
+ if (pin) {
+ struct bpf_map *map;
+
+ /* libbpf will only create one level of dirs. Let's create the rest */
+ (void) mkdir_p(MAP_LINK_PREFIX, 0755);
+ (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);
+
+ map = sym_bpf_object__next_map(obj->obj, NULL);
+ while (map) {
+ _cleanup_free_ char *fn = NULL;
+
+ fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
+ if (!fn)
+ return log_oom();
+
+ r = sym_bpf_map__set_pin_path(map, fn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);
+
+ map = sym_bpf_object__next_map(obj->obj, map);
+ }
+ }
+
+ r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
+ if (r < 0)
+ return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");
+
+ r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
+ if (r < 0)
+ return log_error_errno(r, "Failed to size userns ring buffer: %m");
+
+ /* Dummy map to satisfy the verifier */
+ dummy_mnt_id_hash_fd = make_inner_hash_map();
+ if (dummy_mnt_id_hash_fd < 0)
+ return dummy_mnt_id_hash_fd;
+
+ r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set inner BPF map: %m");
+
+ r = userns_restrict_bpf__load(obj);
+ if (r < 0)
+ return log_error_errno(r, "Failed to load BPF object: %m");
+
+ for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
+ _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+ struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
+ _cleanup_free_ char *fn = NULL;
+ bool linked = false;
+ const char *e;
+
+ e = startswith(ps->name, "userns_restrict_");
+ assert(e);
+
+ if (pin) {
+ fn = path_join(PROGRAM_LINK_PREFIX, e);
+ if (!fn)
+ return log_oom();
+
+ link = sym_bpf_link__open(fn);
+ r = bpf_get_error_translated(link);
+ if (r < 0) {
+ if (r != -ENOENT)
+ return log_error_errno(r, "Unable to open pinned program link: %m");
+ link = NULL;
+ } else {
+ linked = true;
+ log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
+ }
+ }
+
+ if (!link) {
+ link = sym_bpf_program__attach(*ps->prog);
+ r = bpf_get_error_translated(link);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach LSM BPF program: %m");
+
+ log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
+ }
+
+ if (pin && !linked) {
+ assert(fn);
+
+ r = sym_bpf_link__pin(link, fn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to pin LSM attachment: %m");
+ }
+
+ *ps->link = TAKE_PTR(link);
+ }
+
+ if (pin) {
+ r = sym_bpf_object__pin_maps(obj->obj, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to pin BPF maps: %m");
+ }
+
+ if (ret)
+ *ret = TAKE_PTR(obj);
+
+ return 0;
+}
+
+int userns_restrict_put_by_inode(
+ struct userns_restrict_bpf *obj,
+ uint64_t userns_inode,
+ bool replace,
+ const int mount_fds[],
+ size_t n_mount_fds) {
+
+ _cleanup_close_ int inner_map_fd = -EBADF;
+ _cleanup_free_ int *mnt_ids = NULL;
+ uint64_t ino = userns_inode;
+ int r, outer_map_fd;
+
+ assert(obj);
+ assert(userns_inode != 0);
+ assert(n_mount_fds == 0 || mount_fds);
+
+ /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
+ * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
+ * inode let's refuse early, we can't support this with the current BPF code for now. */
+ if (userns_inode > UINT32_MAX)
+ return -EINVAL;
+
+ mnt_ids = new(int, n_mount_fds);
+ if (!mnt_ids)
+ return -ENOMEM;
+
+ for (size_t i = 0; i < n_mount_fds; i++) {
+ r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get mount ID: %m");
+ }
+
+ outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+ if (outer_map_fd < 0)
+ return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+ if (replace) {
+ /* Add if missing, replace if already exists */
+ inner_map_fd = make_inner_hash_map();
+ if (inner_map_fd < 0)
+ return inner_map_fd;
+
+ r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_ANY);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to replace map in inode hash: %m");
+ } else {
+ /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We
+ * might race against each other, hence we try a couple of times */
+ for (size_t n_try = 10;; n_try--) {
+ uint32_t innermap_id;
+
+ if (n_try == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
+ "Stillcan't create inode entry in BPF map after 10 tries.");
+
+ r = sym_bpf_map_lookup_elem(outer_map_fd, &ino, &innermap_id);
+ if (r >= 0) {
+ inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
+ if (inner_map_fd < 0)
+ return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");
+
+ break;
+ }
+ if (errno != ENOENT)
+ return log_debug_errno(errno, "Failed to look up inode hash entry: %m");
+
+ /* No entry for this user namespace yet. Let's create one */
+ inner_map_fd = make_inner_hash_map();
+ if (inner_map_fd < 0)
+ return inner_map_fd;
+
+ r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_NOEXIST);
+ if (r >= 0)
+ break;
+ if (errno != EEXIST)
+ return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");
+ }
+ }
+
+ FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
+ uint32_t dummy_value = 1;
+
+ r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to add mount ID to map: %m");
+
+ log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, ino);
+ }
+
+ return 0;
+}
+
+int userns_restrict_put_by_fd(
+ struct userns_restrict_bpf *obj,
+ int userns_fd,
+ bool replace,
+ const int mount_fds[],
+ size_t n_mount_fds) {
+
+ struct stat st;
+ int r;
+
+ assert(obj);
+ assert(userns_fd >= 0);
+ assert(n_mount_fds == 0 || mount_fds);
+
+ r = fd_is_ns(userns_fd, CLONE_NEWUSER);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+ if (fstat(userns_fd, &st) < 0)
+ return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+ return userns_restrict_put_by_inode(
+ obj,
+ st.st_ino,
+ replace,
+ mount_fds,
+ n_mount_fds);
+}
+
+int userns_restrict_reset_by_inode(
+ struct userns_restrict_bpf *obj,
+ uint64_t ino) {
+
+ int r, outer_map_fd;
+ unsigned u;
+
+ assert(obj);
+ assert(ino != 0);
+
+ if (ino > UINT32_MAX) /* inodes larger than 32bit are definitely not included in our map, exit early */
+ return 0;
+
+ outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+ if (outer_map_fd < 0)
+ return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+ u = (uint32_t) ino;
+
+ r = sym_bpf_map_delete_elem(outer_map_fd, &u);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);
+
+ return 0;
+}
+
+#else
+int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+ return NULL;
+}
+
+int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+#endif