1 files changed, 346 insertions, 0 deletions
diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c
new file mode 100644
index 0000000..be33f49
--- /dev/null
+++ b/src/nsresourced/userns-restrict.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "userns-restrict.h"
+
+#if HAVE_VMLINUX_H
+
+#include <sched.h>
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "lsm-util.h"
+#include "missing_mount.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+
+/* Maximum number of entries of the outer userns inode → mount ID map; also used to size the userns ring
+ * buffer (USERNS_MAX * sizeof(unsigned int) bytes). */
+#define USERNS_MAX (16U*1024U)
+/* Maximum number of entries of each inner mount ID hash map. */
+#define MOUNTS_MAX 4096U
+
+/* Directories below /sys/fs/bpf/ in which the program links and the maps are pinned when pinning is
+ * requested. */
+#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
+#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"
+
+/* Destroys the BPF skeleton object. Accepts NULL. Always returns NULL. */
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+        (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */
+        return NULL;
+}
+
+/* Allocates a new, empty BPF hash map that can be used as inner map of the userns/mount ID hash: keys are
+ * int-sized (mount IDs), values are uint32_t-sized, and at most MOUNTS_MAX entries fit in.
+ *
+ * Returns the file descriptor of the new map on success (owned by the caller), or a negative errno-style
+ * error on failure. */
+static int make_inner_hash_map(void) {
+        int fd;
+
+        fd = compat_bpf_map_create(
+                        BPF_MAP_TYPE_HASH,
+                        NULL,
+                        sizeof(int),
+                        sizeof(uint32_t),
+                        MOUNTS_MAX,
+                        NULL);
+        if (fd < 0)
+                return log_debug_errno(errno, "Failed to allocate inner BPF map: %m");
+
+        return fd;
+}
+
+/* Opens and loads the userns-restrict BPF-LSM object and attaches all of its programs.
+ *
+ * If 'pin' is true, a pin path below MAP_LINK_PREFIX is configured for every map before loading, program
+ * links already pinned below PROGRAM_LINK_PREFIX are reused if they can be opened, newly created links are
+ * pinned there, and finally the maps are pinned.
+ *
+ * Returns 0 on success; if 'ret' is non-NULL ownership of the loaded object is passed to *ret, otherwise
+ * the object is released by the cleanup handler on return. Fails with an EOPNOTSUPP error if the "bpf" LSM
+ * is not supported or libbpf lacks bpf_object__next_map(); other failures are returned as negative
+ * errno-style errors. */
+int userns_restrict_install(
+                bool pin,
+                struct userns_restrict_bpf **ret) {
+
+        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
+        _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
+        int r;
+
+        r = lsm_supported("bpf");
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");
+
+        r = dlopen_bpf();
+        if (r < 0)
+                return r;
+
+        /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
+        if (!sym_bpf_object__next_map)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");
+
+        obj = userns_restrict_bpf__open();
+        if (!obj)
+                return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");
+
+        if (pin) {
+                struct bpf_map *map;
+
+                /* libbpf will only create one level of dirs. Let's create the rest */
+                (void) mkdir_p(MAP_LINK_PREFIX, 0755);
+                (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);
+
+                /* Configure a pin path named after the map for each map of the object. This happens
+                 * before the object is loaded below. */
+                map = sym_bpf_object__next_map(obj->obj, NULL);
+                while (map) {
+                        _cleanup_free_ char *fn = NULL;
+
+                        fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
+                        if (!fn)
+                                return log_oom();
+
+                        r = sym_bpf_map__set_pin_path(map, fn);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);
+
+                        map = sym_bpf_object__next_map(obj->obj, map);
+                }
+        }
+
+        r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
+        if (r < 0)
+                return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");
+
+        r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
+        if (r < 0)
+                return log_error_errno(r, "Failed to size userns ring buffer: %m");
+
+        /* Dummy map to satisfy the verifier */
+        dummy_mnt_id_hash_fd = make_inner_hash_map();
+        if (dummy_mnt_id_hash_fd < 0)
+                return dummy_mnt_id_hash_fd;
+
+        r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set inner BPF map: %m");
+
+        r = userns_restrict_bpf__load(obj);
+        if (r < 0)
+                return log_error_errno(r, "Failed to load BPF object: %m");
+
+        /* Attach each program of the skeleton, or pick up its already pinned link. */
+        for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
+                _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+                struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
+                _cleanup_free_ char *fn = NULL;
+                bool linked = false;
+                const char *e;
+
+                /* All programs are expected to carry the "userns_restrict_" prefix; the remainder of
+                 * the name is used as file name of the pinned link. */
+                e = startswith(ps->name, "userns_restrict_");
+                assert(e);
+
+                if (pin) {
+                        fn = path_join(PROGRAM_LINK_PREFIX, e);
+                        if (!fn)
+                                return log_oom();
+
+                        /* Try to reuse a previously pinned link. -ENOENT just means there is none yet,
+                         * in which case we attach and pin below. */
+                        link = sym_bpf_link__open(fn);
+                        r = bpf_get_error_translated(link);
+                        if (r < 0) {
+                                if (r != -ENOENT)
+                                        return log_error_errno(r, "Unable to open pinned program link: %m");
+                                link = NULL;
+                        } else {
+                                linked = true;
+                                log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
+                        }
+                }
+
+                /* No pinned link was picked up (or pinning is off): attach the program now. */
+                if (!link) {
+                        link = sym_bpf_program__attach(*ps->prog);
+                        r = bpf_get_error_translated(link);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to attach LSM BPF program: %m");
+
+                        log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
+                }
+
+                /* Only pin links we created ourselves, links opened from a pin already are pinned. */
+                if (pin && !linked) {
+                        assert(fn);
+
+                        r = sym_bpf_link__pin(link, fn);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to pin LSM attachment: %m");
+                }
+
+                /* Store the link in the skeleton's link slot for this program. */
+                *ps->link = TAKE_PTR(link);
+        }
+
+        if (pin) {
+                /* NOTE(review): a NULL path presumably makes libbpf use the per-map pin paths configured
+                 * above — confirm against the libbpf documentation. */
+                r = sym_bpf_object__pin_maps(obj->obj, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to pin BPF maps: %m");
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(obj);
+
+        return 0;
+}
+
+/* Allowlists the mounts referenced by 'mount_fds' for the user namespace with inode number 'userns_inode'.
+ *
+ * The outer map (keyed by the 32bit userns inode number) maps to an inner hash map whose keys are the
+ * allowed mount IDs. If 'replace' is true a fresh inner map is installed, replacing whatever was
+ * registered for the namespace before. If it is false, the existing inner map is extended, or a new one
+ * is created if the namespace has no entry yet.
+ *
+ * Returns 0 on success, -EINVAL if the inode number does not fit into 32bit, -ENOMEM on allocation
+ * failure, or another negative errno-style error. */
+int userns_restrict_put_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t userns_inode,
+                bool replace,
+                const int mount_fds[],
+                size_t n_mount_fds) {
+
+        _cleanup_close_ int inner_map_fd = -EBADF;
+        _cleanup_free_ int *mnt_ids = NULL;
+        int r, outer_map_fd;
+        uint32_t key;
+
+        assert(obj);
+        assert(userns_inode != 0);
+        assert(n_mount_fds == 0 || mount_fds);
+
+        /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
+         * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
+         * inode let's refuse early, we can't support this with the current BPF code for now. */
+        if (userns_inode > UINT32_MAX)
+                return -EINVAL;
+
+        /* Hand the kernel a pointer to a genuinely 32bit object. If we'd pass a pointer to the uint64_t
+         * instead, only its first four bytes would be used as key, which on big-endian systems is the
+         * (all zero) upper half of the inode number. */
+        key = (uint32_t) userns_inode;
+
+        mnt_ids = new(int, n_mount_fds);
+        if (!mnt_ids)
+                return -ENOMEM;
+
+        /* Resolve all mount IDs first, so that we don't touch the maps if any of the fds is bad. */
+        for (size_t i = 0; i < n_mount_fds; i++) {
+                r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to get mount ID: %m");
+        }
+
+        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+        if (outer_map_fd < 0)
+                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+        if (replace) {
+                /* Add if missing, replace if already exists */
+                inner_map_fd = make_inner_hash_map();
+                if (inner_map_fd < 0)
+                        return inner_map_fd;
+
+                r = sym_bpf_map_update_elem(outer_map_fd, &key, &inner_map_fd, BPF_ANY);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to replace map in inode hash: %m");
+        } else {
+                /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We
+                 * might race against each other, hence we try a couple of times */
+                for (size_t n_try = 10;; n_try--) {
+                        uint32_t innermap_id;
+
+                        if (n_try == 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
+                                                       "Still can't create inode entry in BPF map after 10 tries.");
+
+                        r = sym_bpf_map_lookup_elem(outer_map_fd, &key, &innermap_id);
+                        if (r >= 0) {
+                                inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
+                                if (inner_map_fd < 0)
+                                        return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");
+
+                                break;
+                        }
+                        if (errno != ENOENT)
+                                return log_debug_errno(errno, "Failed to look up inode hash entry: %m");
+
+                        /* No entry for this user namespace yet. Let's create one */
+                        inner_map_fd = make_inner_hash_map();
+                        if (inner_map_fd < 0)
+                                return inner_map_fd;
+
+                        r = sym_bpf_map_update_elem(outer_map_fd, &key, &inner_map_fd, BPF_NOEXIST);
+                        if (r >= 0)
+                                break;
+                        if (errno != EEXIST)
+                                return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");
+
+                        /* Somebody else registered an entry in the meantime. Release the map we just
+                         * allocated before retrying, so that the next iteration doesn't overwrite (and
+                         * thus leak) the fd. */
+                        inner_map_fd = safe_close(inner_map_fd);
+                }
+        }
+
+        /* Now mark each mount ID as allowed in the inner map. The value carries no meaning, only the
+         * presence of the key matters. */
+        FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
+                uint32_t dummy_value = 1;
+
+                r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to add mount ID to map: %m");
+
+                log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, userns_inode);
+        }
+
+        return 0;
+}
+
+/* Same as userns_restrict_put_by_inode(), but identifies the user namespace by a file descriptor
+ * referring to it. The fd is verified to be a user namespace fd, then its inode number is used. */
+int userns_restrict_put_by_fd(
+                struct userns_restrict_bpf *obj,
+                int userns_fd,
+                bool replace,
+                const int mount_fds[],
+                size_t n_mount_fds) {
+
+        struct stat ns_st;
+        int is_userns;
+
+        assert(obj);
+        assert(userns_fd >= 0);
+        assert(n_mount_fds == 0 || mount_fds);
+
+        /* Refuse anything that isn't a user namespace */
+        is_userns = fd_is_ns(userns_fd, CLONE_NEWUSER);
+        if (is_userns < 0)
+                return log_debug_errno(is_userns, "Failed to determine if file descriptor is user namespace: %m");
+        if (is_userns == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+        /* The inode number of the namespace fd is what the maps are keyed by */
+        if (fstat(userns_fd, &ns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+        return userns_restrict_put_by_inode(obj, ns_st.st_ino, replace, mount_fds, n_mount_fds);
+}
+
+/* Drops the outer map entry of the user namespace with the specified inode number.
+ *
+ * Returns 0 on success (or if the inode number is beyond the 32bit range, in which case there cannot be
+ * an entry), and a negative errno-style error if the map fd cannot be acquired or the deletion fails. */
+int userns_restrict_reset_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t ino) {
+
+        int map_fd, q;
+        unsigned key;
+
+        assert(obj);
+        assert(ino != 0);
+
+        /* inodes larger than 32bit are definitely not included in our map, exit early */
+        if (ino > UINT32_MAX)
+                return 0;
+
+        key = (uint32_t) ino;
+
+        map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+        if (map_fd < 0)
+                return log_debug_errno(map_fd, "Failed to get outer BPF map fd: %m");
+
+        q = sym_bpf_map_delete_elem(map_fd, &key);
+        if (q < 0)
+                return log_debug_errno(q, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);
+
+        return 0;
+}
+
+#else
+/* Stub for builds where HAVE_VMLINUX_H is not set: always fails with an EOPNOTSUPP error. */
+int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+/* Stub for builds where HAVE_VMLINUX_H is not set: does nothing and returns NULL. */
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+        return NULL;
+}
+
+/* Stub for builds where HAVE_VMLINUX_H is not set: always fails with an EOPNOTSUPP error. */
+int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+/* Stub for builds where HAVE_VMLINUX_H is not set: always fails with an EOPNOTSUPP error. */
+int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+/* Stub for builds where HAVE_VMLINUX_H is not set: always fails with an EOPNOTSUPP error. */
+int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+#endif