summaryrefslogtreecommitdiffstats
path: root/src/nsresourced
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/nsresourced/bpf/userns_restrict/meson.build25
-rw-r--r--src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h17
-rw-r--r--src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c179
-rw-r--r--src/nsresourced/meson.build48
-rw-r--r--src/nsresourced/nsresourced-manager.c647
-rw-r--r--src/nsresourced/nsresourced-manager.h40
-rw-r--r--src/nsresourced/nsresourced.c46
-rw-r--r--src/nsresourced/nsresourcework.c1782
-rw-r--r--src/nsresourced/test-userns-restrict.c182
-rw-r--r--src/nsresourced/userns-registry.c646
-rw-r--r--src/nsresourced/userns-registry.h42
-rw-r--r--src/nsresourced/userns-restrict.c346
-rw-r--r--src/nsresourced/userns-restrict.h22
13 files changed, 4022 insertions, 0 deletions
diff --git a/src/nsresourced/bpf/userns_restrict/meson.build b/src/nsresourced/bpf/userns_restrict/meson.build
new file mode 100644
index 0000000..d773c75
--- /dev/null
+++ b/src/nsresourced/bpf/userns_restrict/meson.build
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Nothing in this directory is built unless HAVE_VMLINUX_H is set to 1.
+if conf.get('HAVE_VMLINUX_H') != 1
+ subdir_done()
+endif
+
+# Step 1: turn userns-restrict.bpf.c into an unstripped object, using the command defined elsewhere as
+# bpf_o_unstripped_cmd.
+userns_restrict_bpf_o_unstripped = custom_target(
+ 'userns-restrict.bpf.unstripped.o',
+ input : 'userns-restrict.bpf.c',
+ output : 'userns-restrict.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd,
+ depends : vmlinux_h_dependency)
+
+# Step 2: derive the final object from the unstripped one via bpf_o_cmd.
+userns_restrict_bpf_o = custom_target(
+ 'userns-restrict.bpf.o',
+ input : userns_restrict_bpf_o_unstripped,
+ output : 'userns-restrict.bpf.o',
+ command : bpf_o_cmd)
+
+# Step 3: generate userns-restrict.skel.h from the object; the header is whatever skel_h_cmd writes to
+# stdout (capture : true).
+userns_restrict_skel_h = custom_target(
+ 'userns-restrict.skel.h',
+ input : userns_restrict_bpf_o,
+ output : 'userns-restrict.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h
new file mode 100644
index 0000000..271caf4
--- /dev/null
+++ b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "bpf-dlopen.h"
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__attach_skeleton sym_bpf_object__attach_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+
+#include "bpf/userns_restrict/userns-restrict.skel.h"
diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c
new file mode 100644
index 0000000..126422b
--- /dev/null
+++ b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* If offsetof() is implemented via __builtin_offset() then it doesn't work on current compilers, since the
+ * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to define
+ * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0
+ * it does not.) */
+#undef offsetof
+#undef container_of
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <errno.h>
+
+#ifndef bpf_core_cast
+/* bpf_rdonly_cast() was introduced in libbpf commit 688879f together with
+ * the definition of a bpf_core_cast macro. So use that one to avoid
+ * defining a prototype for bpf_rdonly_cast */
+void *bpf_rdonly_cast(void *, __u32) __ksym;
+#endif
+
+/* BPF module that implements an allowlist of mounts (identified by mount ID) for user namespaces (identified
+ * by their inode number in nsfs) that restricts creation of inodes (which would inherit the caller's UID/GID)
+ * or changing of ownership (similar).
+ *
+ * This hooks into the various path-based LSM entrypoints that control inode creation as well as chown(), and
+ * then looks up the calling process' user namespace in a global map of namespaces, which points us to
+ * another map that is simply a list of allowed mnt_ids. */
+
+// FIXME: ACL adjustments are currently not blocked. There's no path-based LSM hook available in the kernel
+// for setting xattrs or ACLs, hence we cannot easily block them, even though we want that. We can get away
+// with ignoring this for now, as ACLs never define ownership, but purely access: i.e. ACLs never allow
+// taking possession of an object, but only control access to it. Thus, things like suid access modes should
+// not be reachable through it. It still sucks though that a user can persistently add an ACL entry to a file
+// with their transient UIDs/GIDs.
+
+/* kernel currently enforces a maximum usernamespace nesting depth of 32, see create_user_ns() in the kernel sources */
+#define USER_NAMESPACE_DEPTH_MAX 32U
+
+/* Type of the inner maps stored in userns_mnt_id_hash: a hash map keyed by mount ID. The BPF code in this
+ * file only tests whether a key is present, it never reads the value. */
+struct mnt_id_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */
+ __type(key, int);
+ __type(value, int);
+};
+
+/* Outer map: maps a user namespace inode number to an inner mnt_id_map holding the mount IDs allowlisted
+ * for that namespace. Consulted by validate_inode_on_mount() and by the free_user_ns kprobe. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */
+ __type(key, unsigned); /* userns inode */
+ __array(values, struct mnt_id_map);
+} userns_mnt_id_hash SEC(".maps");
+
+/* Ring buffer used by the free_user_ns kprobe to send the inode numbers of released user namespaces to
+ * userspace. Each record written is a single 'unsigned' inode number. */
+struct {
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 4096);
+} userns_ringbuf SEC(".maps");
+
+/* Maps a vfsmount pointer back to the struct mount it is embedded in (as member 'mnt'). */
+static inline struct mount *real_mount(struct vfsmount *mnt) {
+        struct mount *outer = container_of(mnt, struct mount, mnt);
+
+        return outer;
+}
+
+/* Decides whether the current task may create or re-own inodes on the mount 'v'.
+ *
+ * Returns 0 (allow) if
+ *   - the user namespace owning the mount's mount namespace is the task's user namespace or a descendant
+ *     of it (checked by walking ->parent up to USER_NAMESPACE_DEPTH_MAX levels), or
+ *   - no allowlist is installed in userns_mnt_id_hash for the task's user namespace, or
+ *   - the mount's ID is a key in the allowlist installed for the task's user namespace.
+ *
+ * Returns -EPERM otherwise, including when the parent chain is longer than USER_NAMESPACE_DEPTH_MAX.
+ *
+ * Note that the 'inode' parameter is not referenced in the body. */
+static int validate_inode_on_mount(struct inode *inode, struct vfsmount *v) {
+ struct user_namespace *mount_userns, *task_userns, *p;
+ unsigned task_userns_inode;
+ struct task_struct *task;
+ void *mnt_id_map;
+ struct mount *m;
+ int mnt_id;
+
+ /* Get user namespace from vfsmount */
+ m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
+ mount_userns = m->mnt_ns->user_ns;
+
+ /* Get user namespace from task */
+ task = (struct task_struct*) bpf_get_current_task_btf();
+ task_userns = task->cred->user_ns;
+
+ /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+ * yes immediately. */
+ p = mount_userns;
+ for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
+ if (p == task_userns)
+ return 0; /* our task's user namespace (or a child thereof) owns this superblock: allow! */
+
+ p = p->parent;
+ if (!p)
+ break;
+ }
+
+ /* Hmm, something is fishy if there's more than 32 levels of namespaces involved. Let's better be
+ * safe than sorry, and refuse. */
+ if (p)
+ return -EPERM;
+
+ /* This is a mount foreign to our task's user namespace, let's consult our allow list */
+ task_userns_inode = task_userns->ns.inum;
+
+ mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode);
+ if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
+ return 0;
+
+ mnt_id = m->mnt_id;
+
+ /* Otherwise, say yes if the mount ID is allowlisted */
+ if (bpf_map_lookup_elem(mnt_id_map, &mnt_id))
+ return 0;
+
+ return -EPERM;
+}
+
+/* Common body of the LSM hooks below: keeps a non-zero verdict from an earlier hook unchanged, otherwise
+ * runs the allowlist check on the path's inode and mount. */
+static int validate_path(const struct path *path, int ret) {
+        /* An earlier hook already returned an error, pass it through untouched */
+        if (ret)
+                return ret;
+
+        return validate_inode_on_mount(path->dentry->d_inode, path->mnt);
+}
+
+/* LSM path_chown hook: applies the mount allowlist check to the path whose ownership is being changed.
+ * 'ret' is the verdict accumulated so far and is propagated if non-zero. */
+SEC("lsm/path_chown")
+int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) {
+ return validate_path(path, ret);
+}
+
+/* LSM path_mkdir hook: applies the mount allowlist check to the parent directory 'dir'. 'ret' is the
+ * verdict accumulated so far and is propagated if non-zero. */
+SEC("lsm/path_mkdir")
+int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) {
+ return validate_path(dir, ret);
+}
+
+/* LSM path_mknod hook: applies the mount allowlist check to the parent directory 'dir'. 'ret' is the
+ * verdict accumulated so far and is propagated if non-zero. */
+SEC("lsm/path_mknod")
+int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev, int ret) {
+ return validate_path(dir, ret);
+}
+
+/* LSM path_symlink hook: applies the mount allowlist check to the parent directory 'dir'. 'ret' is the
+ * verdict accumulated so far and is propagated if non-zero. */
+SEC("lsm/path_symlink")
+int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) {
+ return validate_path(dir, ret);
+}
+
+/* LSM path_link hook: applies the mount allowlist check to 'new_dir', the directory the new link is
+ * created in. 'ret' is the verdict accumulated so far and is propagated if non-zero. */
+SEC("lsm/path_link")
+int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) {
+ return validate_path(new_dir, ret);
+}
+
+/* kprobe on free_user_ns(): recovers the user_namespace from its embedded 'work' member and, if an
+ * allowlist is installed for that namespace's inode number, writes the inode number to userns_ringbuf.
+ * The return value of bpf_ringbuf_output() is not checked, i.e. a failed write is silently dropped. */
+SEC("kprobe/free_user_ns")
+void BPF_KPROBE(userns_restrict_free_user_ns, struct work_struct *work) {
+ struct user_namespace *userns;
+ unsigned inode;
+ void *mnt_id_map;
+
+ /* Inform userspace that a user namespace just went away. I wish there was a nicer way to hook into
+ * user namespaces being deleted than using kprobes, but couldn't find any. */
+
+ userns = bpf_rdonly_cast(container_of(work, struct user_namespace, work),
+ bpf_core_type_id_kernel(struct user_namespace));
+
+ inode = userns->ns.inum;
+
+ mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode);
+ if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */
+ return;
+
+ bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0);
+}
+
+static const char _license[] SEC("license") = "GPL";
diff --git a/src/nsresourced/meson.build b/src/nsresourced/meson.build
new file mode 100644
index 0000000..cb131f0
--- /dev/null
+++ b/src/nsresourced/meson.build
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Descend into the BPF subdirectory first, so that userns_restrict_skel_h is defined when it is
+# referenced below (only the case if HAVE_VMLINUX_H is 1).
+subdir('bpf/userns_restrict')
+
+# Sources of the worker binary
+systemd_nsresourcework_sources = files(
+ 'nsresourcework.c',
+ 'userns-restrict.c',
+ 'userns-registry.c',
+)
+
+# Sources of the manager daemon
+systemd_nsresourced_sources = files(
+ 'nsresourced-manager.c',
+ 'nsresourced.c',
+ 'userns-restrict.c',
+ 'userns-registry.c',
+)
+
+userns_restrict_include = include_directories('.')
+
+# With BPF support, both binaries additionally depend on the generated skeleton header, and the
+# test-userns-restrict test is built.
+if conf.get('HAVE_VMLINUX_H') == 1
+ systemd_nsresourcework_sources += userns_restrict_skel_h
+ systemd_nsresourced_sources += userns_restrict_skel_h
+
+ executables += [
+ test_template + {
+ 'sources' : files('test-userns-restrict.c', 'userns-restrict.c') + userns_restrict_skel_h,
+ 'conditions' : ['ENABLE_NSRESOURCED', 'HAVE_VMLINUX_H'],
+ 'include_directories' : [ includes, userns_restrict_include ],
+ },
+ ]
+endif
+
+# The two binaries themselves, both conditional on ENABLE_NSRESOURCED
+executables += [
+ libexec_template + {
+ 'name' : 'systemd-nsresourcework',
+ 'conditions' : ['ENABLE_NSRESOURCED'],
+ 'sources' : systemd_nsresourcework_sources,
+ 'dependencies' : threads,
+ 'include_directories' : [ includes, userns_restrict_include ],
+ },
+ libexec_template + {
+ 'name' : 'systemd-nsresourced',
+ 'conditions' : ['ENABLE_NSRESOURCED'],
+ 'sources' : systemd_nsresourced_sources,
+ 'dependencies' : threads,
+ 'include_directories' : [ includes, userns_restrict_include ],
+ },
+]
diff --git a/src/nsresourced/nsresourced-manager.c b/src/nsresourced/nsresourced-manager.c
new file mode 100644
index 0000000..d87da58
--- /dev/null
+++ b/src/nsresourced/nsresourced-manager.c
@@ -0,0 +1,647 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <sys/wait.h>
+
+#include "sd-daemon.h"
+
+#include "bpf-dlopen.h"
+#include "build-path.h"
+#include "common-signal.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "mkdir.h"
+#include "nsresourced-manager.h"
+#include "parse-util.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "set.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "umask-util.h"
+#include "unaligned.h"
+#include "user-util.h"
+#include "userns-registry.h"
+#include "userns-restrict.h"
+
+#define LISTEN_TIMEOUT_USEC (25 * USEC_PER_SEC)
+
+static int start_workers(Manager *m, bool explicit_request);
+
+/* Child event source callback, invoked when a worker process terminates. Drops the worker's event source
+ * from whichever worker set tracks it, logs how the worker ended, and then tries to refill the pool. */
+static int on_worker_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        Manager *manager = ASSERT_PTR(userdata);
+        bool was_dynamic, was_fixed;
+
+        assert(s);
+
+        /* The source must have been tracked in exactly one of the two sets */
+        was_dynamic = !!set_remove(manager->workers_dynamic, s);
+        was_fixed = !!set_remove(manager->workers_fixed, s);
+        assert_se(was_dynamic != was_fixed);
+
+        sd_event_source_disable_unref(s);
+
+        switch (si->si_code) {
+
+        case CLD_EXITED:
+                if (si->si_status == EXIT_SUCCESS)
+                        log_debug("Worker " PID_FMT " exited successfully.", si->si_pid);
+                else
+                        log_warning("Worker " PID_FMT " died with a failure exit status %i, ignoring.", si->si_pid, si->si_status);
+                break;
+
+        case CLD_KILLED:
+                log_warning("Worker " PID_FMT " was killed by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status));
+                break;
+
+        case CLD_DUMPED:
+                log_warning("Worker " PID_FMT " dumped core by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status));
+                break;
+
+        default:
+                log_warning("Got unexpected exit code via SIGCHLD, ignoring.");
+        }
+
+        /* Fill up workers again if we fell below the low watermark */
+        (void) start_workers(manager, /* explicit_request= */ false);
+        return 0;
+}
+
+/* SIGUSR2 handler: workers use this signal to tell us there's more work than they can handle. */
+static int on_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        Manager *manager = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        /* Add one more worker, as long as we are below the high watermark. Failures are not propagated
+         * to the event loop. */
+        (void) start_workers(manager, /* explicit_request=*/ true);
+
+        return 0;
+}
+
+/* Timer callback armed by start_workers() when the worker rate limit was hit: releases the one-shot
+ * timer source and retries filling up the worker pool. */
+static int on_deferred_start_worker(sd_event_source *s, uint64_t usec, void *userdata) {
+        Manager *manager = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        /* Drop our reference first, so that start_workers() may install a fresh timer if needed */
+        manager->deferred_start_worker_event_source =
+                sd_event_source_unref(manager->deferred_start_worker_event_source);
+
+        (void) start_workers(manager, /* explicit_request=*/ false);
+
+        return 0;
+}
+
+/* Hash ops for the worker sets, whose elements are sd_event_source pointers (see start_one_worker()).
+ * sd_event_source_disable_unref() is registered as key destructor. The hash/compare functions are the
+ * "trivial" ones, presumably operating on the pointer value itself — confirm against hash-funcs. */
+DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+ event_source_hash_ops,
+ sd_event_source,
+ (void (*)(const sd_event_source*, struct siphash*)) trivial_hash_func,
+ (int (*)(const sd_event_source*, const sd_event_source*)) trivial_compare_func,
+ sd_event_source_disable_unref);
+
+/* Allocates a new Manager object and sets up its event loop.
+ *
+ * On success stores the new object in *ret and returns 0. On failure returns a negative errno-style
+ * error, and the partially set up object is released again via manager_freep(). Failure to set up the
+ * memory pressure event source or the watchdog is logged at debug level and otherwise ignored. */
+int manager_new(Manager **ret) {
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+
+        assert(ret);
+
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        *m = (Manager) {
+                .listen_fd = -EBADF,
+                .worker_ratelimit = {
+                        .interval = 2 * USEC_PER_SEC,
+                        .burst = 250,
+                },
+                .registry_fd = -EBADF,
+        };
+
+        r = sd_event_new(&m->event);
+        if (r < 0)
+                return r;
+
+        r = sd_event_set_signal_exit(m->event, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_signal(m->event, NULL, (SIGRTMIN+18)|SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to allocate memory pressure event source, ignoring: %m");
+
+        r = sd_event_set_watchdog(m->event, true);
+        if (r < 0)
+                log_debug_errno(r, "Failed to enable watchdog handling, ignoring: %m");
+
+        /* Workers send us SIGUSR2 when they'd like more workers to be started */
+        r = sd_event_add_signal(m->event, NULL, SIGUSR2|SD_EVENT_SIGNAL_PROCMASK, on_sigusr2, m);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Releases a Manager object and everything it owns: the worker sets, the deferred start timer, the
+ * listening socket, the BPF ring buffer and BPF object (if compiled in), the registry fd and the event
+ * loop. Accepts NULL. Always returns NULL, so that callers can reset their pointer in one statement. */
+Manager* manager_free(Manager *m) {
+ if (!m)
+ return NULL;
+
+ set_free(m->workers_fixed);
+ set_free(m->workers_dynamic);
+
+ m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source);
+
+ safe_close(m->listen_fd);
+
+#if HAVE_VMLINUX_H
+ /* Drop the event source before the ring buffer whose poll fd it watches */
+ sd_event_source_disable_unref(m->userns_restrict_bpf_ring_buffer_event_source);
+ if (m->userns_restrict_bpf_ring_buffer)
+ sym_ring_buffer__free(m->userns_restrict_bpf_ring_buffer);
+ userns_restrict_bpf_free(m->userns_restrict_bpf);
+#endif
+
+ safe_close(m->registry_fd);
+
+ sd_event_unref(m->event);
+
+ return mfree(m);
+}
+
+/* Returns the total number of workers currently tracked, fixed and dynamic ones combined. */
+static size_t manager_current_workers(Manager *m) {
+        assert(m);
+
+        /* The two kinds of workers are kept in separate sets, hence sum up both */
+        return set_size(m->workers_dynamic) + set_size(m->workers_fixed);
+}
+
+/* Forks off a single worker process and starts watching it.
+ *
+ * The child gets the listening socket passed as fd 3 following the $LISTEN_PID/$LISTEN_FDS protocol,
+ * plus $NSRESOURCE_FIXED_WORKER (whether it counts towards the fixed pool) and $NSRESOURCE_API (whether
+ * the BPF based userns support is available), and then executes the systemd-nsresourcework binary.
+ *
+ * In the parent a child event source is allocated for the new process and added to the fixed or the
+ * dynamic worker set. Returns 0 on success, a negative errno-style error (already logged) otherwise. */
+static int start_one_worker(Manager *m) {
+        _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL;
+        bool fixed;
+        pid_t pid;
+        int r;
+
+        assert(m);
+
+        /* As long as the fixed pool isn't full, new workers are accounted to it */
+        fixed = set_size(m->workers_fixed) < NSRESOURCE_WORKERS_MIN;
+
+        r = safe_fork_full(
+                        "(sd-worker)",
+                        /* stdio_fds= */ NULL,
+                        &m->listen_fd, 1,
+                        FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_CLOSE_ALL_FDS,
+                        &pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fork new worker child: %m");
+        if (r == 0) {
+                char pids[DECIMAL_STR_MAX(pid_t)];
+                /* Child */
+
+                if (m->listen_fd == 3) {
+                        r = fd_cloexec(3, false);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to turn off O_CLOEXEC for fd 3: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+                } else {
+                        if (dup2(m->listen_fd, 3) < 0) { /* dup2() creates with O_CLOEXEC off */
+                                log_error_errno(errno, "Failed to move listen fd to 3: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        safe_close(m->listen_fd);
+                }
+
+                /* NOTE(review): this relies on safe_fork_full() storing the child's own PID in 'pid'
+                 * when returning in the child — confirm against process-util. */
+                xsprintf(pids, PID_FMT, pid);
+                if (setenv("LISTEN_PID", pids, 1) < 0) {
+                        log_error_errno(errno, "Failed to set $LISTEN_PID: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("LISTEN_FDS", "1", 1) < 0) {
+                        log_error_errno(errno, "Failed to set $LISTEN_FDS: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("NSRESOURCE_FIXED_WORKER", one_zero(fixed), 1) < 0) {
+                        log_error_errno(errno, "Failed to set $NSRESOURCE_FIXED_WORKER: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+#if HAVE_VMLINUX_H
+                bool supported = m->userns_restrict_bpf;
+#else
+                bool supported = false;
+#endif
+
+                /* Tell the workers whether to enable the userns API */
+                if (setenv("NSRESOURCE_API", one_zero(supported), 1) < 0) {
+                        log_error_errno(errno, "Failed to set $NSRESOURCE_API: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = setenv_systemd_log_level();
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set $SYSTEMD_LOG_LEVEL: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = invoke_callout_binary(SYSTEMD_NSRESOURCEWORK_PATH, STRV_MAKE("systemd-nsresourcework", "xxxxxxxxxxxxxxxx")); /* With some extra space rename_process() can make use of */
+                log_error_errno(r, "Failed to start worker process: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Parent */
+        r = sd_event_add_child(m->event, &source, pid, WEXITED, on_worker_exit, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to watch child " PID_FMT ": %m", pid);
+
+        r = set_ensure_put(
+                        fixed ? &m->workers_fixed : &m->workers_dynamic,
+                        &event_source_hash_ops,
+                        source);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add child process to set: %m");
+
+        /* The set owns the event source now */
+        TAKE_PTR(source);
+
+        return 0;
+}
+
+/* Starts workers until the pool is sufficiently filled.
+ *
+ * Workers are started until at least NSRESOURCE_WORKERS_MIN exist. If 'explicit_request' is true one
+ * additional worker is started beyond that, unless NSRESOURCE_WORKERS_MAX is already reached.
+ *
+ * If the worker rate limit is hit while no worker exists at all, the event loop is asked to exit with
+ * failure and -EUCLEAN is returned. If it is hit while workers exist, a timer is installed (unless one
+ * is already pending) that retries once the rate limit interval is over, and 0 is returned.
+ *
+ * Returns 0 on success, a negative errno-style error otherwise. */
+static int start_workers(Manager *m, bool explicit_request) {
+ int r;
+
+ assert(m);
+
+ for (;;) {
+ size_t n;
+
+ n = manager_current_workers(m);
+ if (n >= NSRESOURCE_WORKERS_MIN && (!explicit_request || n >= NSRESOURCE_WORKERS_MAX))
+ break;
+
+ if (!ratelimit_below(&m->worker_ratelimit)) {
+
+ /* If we keep starting workers too often but none sticks, let's fail the whole
+ * daemon, something is wrong */
+ if (n == 0) {
+ sd_event_exit(m->event, EXIT_FAILURE);
+ return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "Worker threads requested too frequently, but worker count is zero, something is wrong.");
+ }
+
+ /* Otherwise, let's stop spawning more for a while. */
+ log_warning("Worker threads requested too frequently, not starting new ones for a while.");
+
+ if (!m->deferred_start_worker_event_source) {
+ r = sd_event_add_time(
+ m->event,
+ &m->deferred_start_worker_event_source,
+ CLOCK_MONOTONIC,
+ ratelimit_end(&m->worker_ratelimit),
+ /* accuracy_usec= */ 0,
+ on_deferred_start_worker,
+ m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate deferred start worker event source: %m");
+ }
+
+ break;
+ }
+
+ r = start_one_worker(m);
+ if (r < 0)
+ return r;
+
+ /* An explicit request is satisfied by a single additional worker */
+ explicit_request = false;
+ }
+
+ return 0;
+}
+
+/* Removes the BPF allowlist installed for the user namespace with the specified inode number. A zero
+ * inode is silently ignored, failures are logged as warnings. Does nothing at all if BPF support is
+ * compiled out. */
+static void manager_release_userns_bpf(Manager *m, uint64_t inode) {
+#if HAVE_VMLINUX_H
+        assert(m);
+
+        if (inode != 0) {
+                int r;
+
+                assert(m->userns_restrict_bpf);
+
+                r = userns_restrict_reset_by_inode(m->userns_restrict_bpf, inode);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to remove namespace inode from BPF map, ignoring: %m");
+        }
+#endif
+}
+
+/* Asks the service manager to drop the fd store entries named "userns-<inode>". Failure to send the
+ * notification is logged as a warning and otherwise ignored. */
+static void manager_release_userns_fds(Manager *m, uint64_t inode) {
+        int k;
+
+        assert(m);
+        assert(inode != 0);
+
+        k = sd_notifyf(
+                        /* unset_environment= */ false,
+                        "FDSTOREREMOVE=1\n"
+                        "FDNAME=userns-%" PRIu64 "\n",
+                        inode);
+        if (k >= 0)
+                return;
+
+        log_warning_errno(k, "Failed to send fd store removal message, ignoring: %m");
+}
+
+/* Releases all resources associated with the user namespace identified by the specified inode number:
+ * the BPF allowlist, the fd store entries, and — if a registry record exists for it — its cgroups and
+ * the registry record itself.
+ *
+ * The registry is locked for the duration of the operation; if locking fails an error is logged and
+ * nothing is released. All other failures are logged and ignored. */
+static void manager_release_userns_by_inode(Manager *m, uint64_t inode) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        _cleanup_close_ int lock_fd = -EBADF;
+        int r;
+
+        assert(m);
+        assert(inode != 0);
+
+        lock_fd = userns_registry_lock(m->registry_fd);
+        if (lock_fd < 0)
+                return (void) log_error_errno(lock_fd, "Failed to lock registry: %m");
+
+        /* A missing registry record is not fatal, we still clean up the BPF and fd store side below */
+        r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info);
+        if (r < 0)
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode);
+
+        if (userns_info && uid_is_valid(userns_info->start))
+                log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", inode, userns_info->start);
+        else
+                log_debug("Removing user namespace mapping %" PRIu64 ".", inode);
+
+        /* Remove the BPF rules */
+        manager_release_userns_bpf(m, inode);
+
+        /* Remove the resources from the fdstore */
+        manager_release_userns_fds(m, inode);
+
+        /* And finally remove the resources file from disk */
+        if (userns_info) {
+                /* Remove the cgroups of this userns */
+                r = userns_info_remove_cgroups(userns_info);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to remove cgroups of user namespace, ignoring: %m");
+
+                r = userns_registry_remove(m->registry_fd, userns_info);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to remove user namespace '%s', ignoring: %m", userns_info->name);
+        }
+}
+
+/* Enumerates the registry directory and collects the inode numbers of all user namespaces recorded in
+ * it, i.e. of all entries named "i<decimal inode>.userns".
+ *
+ * The inode numbers are added to *registry_inodes (allocated on demand), encoded via UINT32_TO_PTR().
+ * Entries whose name doesn't match, doesn't parse, or whose number exceeds 32 bits are skipped.
+ *
+ * Returns 0 on success, a negative errno-style error (already logged) otherwise. */
+static int manager_scan_registry(Manager *m, Set **registry_inodes) {
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        int r;
+
+        assert(m);
+        assert(registry_inodes);
+        assert(m->registry_fd >= 0);
+
+        r = readdir_all(m->registry_fd, RECURSE_DIR_IGNORE_DOT, &de);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enumerate registry: %m");
+
+        for (size_t i = 0; i < de->n_entries; i++) {
+                struct dirent *dentry = de->entries[i];
+                _cleanup_free_ char *u = NULL;
+                const char *e, *p;
+                uint64_t inode;
+
+                p = startswith(dentry->d_name, "i");
+                if (!p)
+                        continue;
+
+                e = endswith(p, ".userns");
+                if (!e)
+                        continue;
+
+                /* Extract the part between the "i" prefix and the ".userns" suffix */
+                u = strndup(p, e - p);
+                if (!u)
+                        return log_oom();
+
+                r = safe_atou64(u, &inode);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse userns inode number from '%s', skipping: %m", dentry->d_name);
+                        continue;
+                }
+
+                if (inode > UINT32_MAX) { /* namespace inode numbers are 32bit only right now */
+                        log_warning("userns inode number outside of 32bit range, skipping.");
+                        continue;
+                }
+
+                if (set_ensure_put(registry_inodes, NULL, UINT32_TO_PTR(inode)) < 0)
+                        return log_oom();
+
+                log_debug("Found user namespace %" PRIu64 " in registry directory", inode);
+        }
+
+        return 0;
+}
+
+/* Creates the AF_UNIX listening socket at /run/systemd/io.systemd.NamespaceResource, unless a listening
+ * socket is already set in the manager. Also makes the socket reachable via a symlink in
+ * /run/systemd/userdb/.
+ *
+ * Returns 0 if a socket was already set, 1 if a new one was created, and a negative errno-style error
+ * (already logged) on failure. */
+static int manager_make_listen_socket(Manager *m) {
+        static const union sockaddr_union sockaddr = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/io.systemd.NamespaceResource",
+        };
+        int r;
+
+        assert(m);
+
+        if (m->listen_fd >= 0)
+                return 0;
+
+        m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
+        if (m->listen_fd < 0)
+                return log_error_errno(errno, "Failed to allocate socket: %m");
+
+        /* Get rid of any stale socket inode left over from a previous run */
+        (void) sockaddr_un_unlink(&sockaddr.un);
+
+        /* Bind with a zero umask, so that the socket inode's access mode is not restricted by it */
+        WITH_UMASK(0000)
+                if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0)
+                        return log_error_errno(errno, "Failed to bind socket: %m");
+
+        r = mkdir_p("/run/systemd/userdb", 0755);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /run/systemd/userdb: %m");
+
+        r = symlink_idempotent("../io.systemd.NamespaceResource", "/run/systemd/userdb/io.systemd.NamespaceResource", /* make_relative= */ false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to symlink userdb socket: %m");
+
+        if (listen(m->listen_fd, SOMAXCONN) < 0)
+                return log_error_errno(errno, "Failed to listen on socket: %m");
+
+        return 1;
+}
+
+/* Processes the file descriptors passed in by the service manager.
+ *
+ * Descriptors named "userns-<inode>" are closed again, but their inode numbers are collected in
+ * *fdstore_inodes (allocated on demand, encoded via UINT32_TO_PTR()). A single AF_UNIX/SOCK_STREAM
+ * listening socket is taken over as m->listen_fd. Everything else is closed with a warning.
+ *
+ * Returns 0 on success, a negative errno-style error (already logged) otherwise, in particular -ENOTUNIQ
+ * if more than one suitable listening socket was passed. */
+static int manager_scan_listen_fds(Manager *m, Set **fdstore_inodes) {
+        _cleanup_strv_free_ char **names = NULL;
+        int n, r;
+
+        assert(m);
+        assert(fdstore_inodes);
+
+        n = sd_listen_fds_with_names(/* unset_environment= */ true, &names);
+        if (n < 0)
+                return log_error_errno(n, "Failed to determine number of passed file descriptors: %m");
+
+        for (int i = 0; i < n; i++) {
+                _cleanup_close_ int fd = SD_LISTEN_FDS_START + i; /* Take possession */
+                const char *e;
+
+                /* If this is a BPF allowlist related fd, just close it, but remember which userns inode
+                 * it belongs to */
+                e = startswith(names[i], "userns-");
+                if (e) {
+                        uint64_t inode;
+
+                        r = safe_atou64(e, &inode);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse userns inode number from fd name '%s', ignoring: %m", e);
+                                continue;
+                        }
+
+                        if (inode > UINT32_MAX) {
+                                log_warning("Inode number outside of 32bit range, ignoring");
+                                continue;
+                        }
+
+                        if (set_ensure_put(fdstore_inodes, NULL, UINT32_TO_PTR(inode)) < 0)
+                                return log_oom();
+
+                        continue;
+                }
+
+                /* We don't check the name for the stream socket, for compatibility with older versions */
+                r = sd_is_socket(fd, AF_UNIX, SOCK_STREAM, 1);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to detect if passed file descriptor is a socket: %m");
+                if (r > 0) {
+                        if (m->listen_fd >= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Passed more than one AF_UNIX/SOCK_STREAM socket, refusing.");
+
+                        m->listen_fd = TAKE_FD(fd);
+                        continue;
+                }
+
+                log_warning("Closing passed file descriptor %i (%s) we don't recognize.", fd, names[i]);
+        }
+
+        return 0;
+}
+
+#if HAVE_VMLINUX_H
+/* Ring buffer sample callback: the payload is a sequence of native-endian 32bit user namespace inode
+ * numbers, each of which identifies a user namespace that went away and whose resources shall be
+ * released. Returns -EIO if the payload size is not a multiple of sizeof(unsigned int), 0 otherwise. */
+static int ringbuf_event(void *userdata, void *data, size_t size) {
+        Manager *manager = ASSERT_PTR(userdata);
+        const uint8_t *cursor = data;
+
+        if (size % sizeof(unsigned int) != 0) /* Not multiples of "unsigned int"? */
+                return -EIO;
+
+        for (size_t left = size; left > 0; left -= sizeof(unsigned int), cursor += sizeof(unsigned int)) {
+                uint64_t inode = unaligned_read_ne32(cursor);
+
+                log_debug("Got BPF ring buffer notification that user namespace %" PRIu64 " is now dead.", inode);
+                manager_release_userns_by_inode(manager, inode);
+        }
+
+        return 0;
+}
+
+/* IO event callback for the ring buffer's poll fd: consumes pending records without blocking (zero
+ * timeout), which in turn invokes ringbuf_event() for them. A failure is logged and returned. */
+static int on_ringbuf_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Manager *manager = ASSERT_PTR(userdata);
+        int k;
+
+        k = sym_ring_buffer__poll(manager->userns_restrict_bpf_ring_buffer, 0);
+        if (k >= 0)
+                return 0;
+
+        return log_error_errno(k, "Got failure reading from BPF ring buffer: %m");
+}
+
+/* Installs the userns-restrict BPF programs and hooks their ring buffer into the event loop.
+ *
+ * If installing the BPF programs fails this is logged at notice level and 0 is returned, i.e. the daemon
+ * continues with m->userns_restrict_bpf unset. Failures to set up the ring buffer or its event source
+ * after a successful install are returned as negative errno-style errors; whatever was set up by then
+ * stays referenced by the Manager object and is released by manager_free(). */
+static int manager_setup_bpf(Manager *m) {
+ int rb_fd = -EBADF, poll_fd = -EBADF, r;
+
+ assert(m);
+ assert(!m->userns_restrict_bpf);
+ assert(!m->userns_restrict_bpf_ring_buffer);
+ assert(!m->userns_restrict_bpf_ring_buffer_event_source);
+
+ r = userns_restrict_install(/* pin= */ true, &m->userns_restrict_bpf);
+ if (r < 0) {
+ log_notice_errno(r, "Proceeding with user namespace interfaces disabled.");
+ return 0;
+ }
+
+ /* Neither rb_fd nor poll_fd is closed in this function */
+ rb_fd = sym_bpf_map__fd(m->userns_restrict_bpf->maps.userns_ringbuf);
+ if (rb_fd < 0)
+ return log_error_errno(rb_fd, "Failed to get fd of ring buffer: %m");
+
+ m->userns_restrict_bpf_ring_buffer = sym_ring_buffer__new(rb_fd, ringbuf_event, m, NULL);
+ if (!m->userns_restrict_bpf_ring_buffer)
+ return log_error_errno(errno, "Failed to allocate BPF ring buffer object: %m");
+
+ poll_fd = sym_ring_buffer__epoll_fd(m->userns_restrict_bpf_ring_buffer);
+ if (poll_fd < 0)
+ return log_error_errno(poll_fd, "Failed to get poll fd of ring buffer: %m");
+
+ r = sd_event_add_io(
+ m->event,
+ &m->userns_restrict_bpf_ring_buffer_event_source,
+ poll_fd,
+ EPOLLIN,
+ on_ringbuf_io,
+ m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate event source for BPF ring buffer: %m");
+
+ return 0;
+}
+#else
+/* Stub used when BPF support is compiled out: only logs a notice and always returns 0. */
+static int manager_setup_bpf(Manager *m) {
+ log_notice("Not setting up BPF subsystem, as functionality has been disabled at compile time.");
+ return 0;
+}
+#endif
+
+/* Brings the manager into operation: opens the registry, sets up BPF, takes over the passed file
+ * descriptors, releases resources of user namespaces that are in the fd store but not in the registry,
+ * makes sure there's a listening socket, and starts the initial set of workers.
+ *
+ * Must be called on a freshly allocated Manager object, i.e. with neither registry nor listening fd
+ * set. Returns 0 on success, a negative errno-style error (already logged) otherwise. */
+int manager_startup(Manager *m) {
+        _cleanup_(set_freep) Set *fdstore_inodes = NULL, *registry_inodes = NULL;
+        void *p;
+        int r;
+
+        assert(m);
+        assert(m->registry_fd < 0);
+        assert(m->listen_fd < 0);
+
+        m->registry_fd = userns_registry_open_fd();
+        if (m->registry_fd < 0)
+                return log_error_errno(m->registry_fd, "Failed to open registry directory: %m");
+
+        r = manager_setup_bpf(m);
+        if (r < 0)
+                return r;
+
+        r = manager_scan_listen_fds(m, &fdstore_inodes);
+        if (r < 0)
+                return r;
+
+        r = manager_scan_registry(m, &registry_inodes);
+        if (r < 0)
+                return r;
+
+        /* If there are resources tied to user namespaces not found in the registry, then release them */
+        SET_FOREACH(p, fdstore_inodes) {
+                uint64_t inode;
+
+                if (set_contains(registry_inodes, p))
+                        continue;
+
+                inode = PTR_TO_UINT32(p);
+
+                log_debug("Found stale fd store entry for user namespace %" PRIu64 ", removing.", inode);
+                manager_release_userns_by_inode(m, inode);
+        }
+
+        r = manager_make_listen_socket(m);
+        if (r < 0)
+                return r;
+
+        /* Let's make sure every accept() call on this socket times out after 25s. This allows workers to be
+         * GC'ed on idle */
+        if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0)
+                return log_error_errno(errno, "Failed to set SO_RCVTIMEO: %m");
+
+        r = start_workers(m, /* explicit_request= */ false);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
diff --git a/src/nsresourced/nsresourced-manager.h b/src/nsresourced/nsresourced-manager.h
new file mode 100644
index 0000000..5ecf378
--- /dev/null
+++ b/src/nsresourced/nsresourced-manager.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-event.h"
+
+typedef struct Manager Manager;
+
+#include "hashmap.h"
+#include "ratelimit.h"
+
+#define NSRESOURCE_WORKERS_MIN 5
+#define NSRESOURCE_WORKERS_MAX 4096
+
+/* State of the nsresourced manager process */
+struct Manager {
+ sd_event *event;
+
+ /* Both sets contain the child event sources watching the worker processes */
+ Set *workers_fixed; /* Workers 0…NSRESOURCE_WORKERS_MIN */
+ Set *workers_dynamic; /* Workers NSRESOURCE_WORKERS_MIN+1…NSRESOURCE_WORKERS_MAX */
+
+ int listen_fd; /* Listening socket handed to the workers; -EBADF while unset */
+
+ RateLimit worker_ratelimit; /* Limits how quickly new workers are forked off */
+
+ sd_event_source *deferred_start_worker_event_source; /* Timer to retry starting workers after the rate limit was hit */
+
+#if HAVE_VMLINUX_H
+ struct userns_restrict_bpf *userns_restrict_bpf; /* NULL if installing the BPF programs failed */
+ struct ring_buffer *userns_restrict_bpf_ring_buffer;
+ sd_event_source *userns_restrict_bpf_ring_buffer_event_source;
+#endif
+
+ int registry_fd; /* Registry directory fd; -EBADF while unset */
+};
+
+int manager_new(Manager **ret);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_startup(Manager *m);
diff --git a/src/nsresourced/nsresourced.c b/src/nsresourced/nsresourced.c
new file mode 100644
index 0000000..7056897
--- /dev/null
+++ b/src/nsresourced/nsresourced.c
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "daemon-util.h"
+#include "nsresourced-manager.h"
+#include "log.h"
+#include "main-func.h"
+#include "signal-util.h"
+
+/* Entry point of the daemon: sets up logging, umask and environment, blocks SIGCHLD, creates and starts
+ * the manager, and then runs the event loop until it exits. Takes no command line arguments. Returns 0
+ * on success, a negative errno-style error (already logged) otherwise. */
+static int run(int argc, char *argv[]) {
+ _cleanup_(manager_freep) Manager *m = NULL;
+ int r;
+
+ log_setup();
+
+ umask(0022);
+
+ if (argc != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");
+
+ /* NOTE(review): presumably keeps userdb lookups done by this process from being routed to our
+ * own service — confirm against the userdb client code. */
+ if (setenv("SYSTEMD_BYPASS_USERDB", "io.systemd.NamespaceResource", 1) < 0)
+ return log_error_errno(errno, "Failed to set $SYSTEMD_BYPASS_USERDB: %m");
+
+ /* SIGCHLD is blocked before the manager is created; worker exits are watched via child event
+ * sources added to the event loop */
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD) >= 0);
+
+ r = manager_new(&m);
+ if (r < 0)
+ return log_error_errno(r, "Could not create manager: %m");
+
+ r = manager_startup(m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to start up daemon: %m");
+
+ /* Declared after 'm', hence cleaned up before the manager is freed */
+ _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL;
+ notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+
+ r = sd_event_loop(m->event);
+ if (r < 0)
+ return log_error_errno(r, "Event loop failed: %m");
+
+ return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c
new file mode 100644
index 0000000..6bd2fed
--- /dev/null
+++ b/src/nsresourced/nsresourcework.c
@@ -0,0 +1,1782 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <linux/nsfs.h>
+#include <linux/veth.h>
+#include <sys/eventfd.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#include "sd-daemon.h"
+#include "sd-netlink.h"
+
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "group-record.h"
+#include "io-util.h"
+#include "lock-util.h"
+#include "main-func.h"
+#include "missing_magic.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "netlink-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "time-util.h"
+#include "uid-classification.h"
+#include "uid-range.h"
+#include "user-record-nss.h"
+#include "user-record.h"
+#include "user-util.h"
+#include "userdb.h"
+#include "userns-registry.h"
+#include "userns-restrict.h"
+#include "varlink-io.systemd.NamespaceResource.h"
+#include "varlink-io.systemd.UserDatabase.h"
+#include "varlink.h"
+
+#define ITERATIONS_MAX 64U /* NOTE(review): the user of this limit is outside this part of the file */
+#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE) /* 5 minutes, in microseconds */
+#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC) /* 50 milliseconds, in microseconds */
+#define CONNECTION_IDLE_USEC (15 * USEC_PER_SEC) /* 15 seconds, in microseconds */
+#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC) /* 90 seconds, in microseconds */
+#define USERNS_PER_UID 256 /* Maximum number of registered user namespaces per owner UID, enforced in allocate_now() */
+
+typedef struct LookupParameters { /* Parsed parameters of the io.systemd.UserDatabase lookup methods below */
+ const char *user_name; /* "userName" field, NULL if not specified */
+ const char *group_name; /* "groupName" field, NULL if not specified */
+ union { /* A single lookup is either by UID or by GID, hence both share storage */
+ uid_t uid; /* "uid" field */
+ gid_t gid; /* "gid" field */
+ };
+ const char *service; /* "service" field; the handlers insist on "io.systemd.NamespaceResource" */
+} LookupParameters;
+
+/* Synthesizes the JSON user record for the UID at the given offset into the UID range of the specified
+ * user namespace allocation and returns it in *ret. The user name has the form "ns-<name>-<offset>".
+ * Allocations covering more than one UID are reported with the "container" disposition, single UID
+ * allocations with the "dynamic" one. Returns -ENOMEM if a string cannot be allocated, otherwise
+ * whatever json_build() returns. */
+static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, JsonVariant **ret) {
+        _cleanup_free_ char *user_name = NULL, *gecos = NULL;
+        UserDisposition disposition;
+        bool is_range;
+
+        assert(userns_info);
+        assert(offset < userns_info->size);
+
+        if (asprintf(&user_name, "ns-%s-" UID_FMT, userns_info->name, offset) < 0)
+                return -ENOMEM;
+
+        is_range = userns_info->size > 1;
+        disposition = is_range ? USER_CONTAINER : USER_DYNAMIC;
+
+        if (is_range) {
+                if (asprintf(&gecos, "User " UID_FMT " of Allocated Namespace %s", offset, userns_info->name) < 0)
+                        return -ENOMEM;
+        } else {
+                if (asprintf(&gecos, "Allocated Namespace %s", userns_info->name) < 0)
+                        return -ENOMEM;
+        }
+
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                          JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)),
+                                          JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(userns_info->start + offset)),
+                                          JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)),
+                                          JSON_BUILD_PAIR("realName", JSON_BUILD_STRING(gecos)),
+                                          JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")),
+                                          JSON_BUILD_PAIR("shell", JSON_BUILD_STRING(NOLOGIN)),
+                                          JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)),
+                                          JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")),
+                                          JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition)))));
+}
+
+static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+ /* Varlink handler for user record lookups. Accepts "uid" and/or "userName" plus "service", and
+ * replies with a record synthesized by build_user_json() if the request matches a UID inside a
+ * registered user namespace allocation. Replies with NoRecordFound if nothing matches, with
+ * ConflictingRecordFound if name and UID disagree, and with EnumerationNotSupported if neither
+ * a name nor a UID was specified. */
+ static const JsonDispatch dispatch_table[] = {
+ { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 },
+ { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ LookupParameters p = {
+ .uid = UID_INVALID,
+ };
+ uid_t offset;
+ int r;
+
+ assert(parameters);
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ if (!streq_ptr(p.service, "io.systemd.NamespaceResource"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+ /* Lookup by name: our names have the form "ns-<namespace name>-<offset>"; the offset is whatever
+ * follows the last dash, the namespace name everything between the prefix and that dash. */
+ if (p.user_name) {
+ _cleanup_free_ char *n = NULL;
+ const char *e, *f;
+
+ e = startswith(p.user_name, "ns-");
+ if (!e)
+ goto not_found;
+
+ f = strrchr(e, '-');
+ if (!f)
+ goto not_found;
+
+ if (parse_uid(f+1, &offset) < 0)
+ goto not_found;
+
+ n = strndup(e, f - e);
+ if (!n)
+ return log_oom();
+
+ r = userns_registry_load_by_name(
+ /* registry_fd= */ -EBADF,
+ n,
+ &userns_info);
+ if (r == -ENOENT)
+ goto not_found;
+ if (r < 0)
+ return r;
+
+ if (offset >= userns_info->size) /* Outside of range? */
+ goto not_found;
+ /* If the client also specified a UID it has to agree with what the name resolves to */
+ if (uid_is_valid(p.uid) && p.uid != userns_info->start + offset)
+ return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+ } else if (uid_is_valid(p.uid)) {
+ uid_t start, uidmask;
+ /* Lookup by UID: for container UIDs the start of the range is the UID with the lower 16 bits
+ * masked off, for dynamic UIDs the UID itself is the start of the (single UID) range. */
+ if (uid_is_container(p.uid))
+ uidmask = (uid_t) UINT32_C(0xFFFF0000);
+ else if (uid_is_dynamic(p.uid))
+ uidmask = (uid_t) UINT32_C(0xFFFFFFFF);
+ else
+ goto not_found;
+
+ start = p.uid & uidmask;
+ offset = p.uid - start;
+
+ r = userns_registry_load_by_start_uid(
+ /* registry_fd= */ -EBADF,
+ start,
+ &userns_info);
+ if (r == -ENOENT)
+ goto not_found;
+ if (r < 0)
+ return r;
+
+ if (offset >= userns_info->size) /* Outside of range? */
+ goto not_found;
+ } else
+ return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL);
+
+ r = build_user_json(userns_info, offset, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v))));
+
+not_found:
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+/* Synthesizes the JSON group record for the GID at the given offset into the ID range of the specified
+ * user namespace allocation and returns it in *ret. The group name has the form "ns-<name>-<offset>".
+ * Allocations covering more than one ID are reported with the "container" disposition, single ID
+ * allocations with the "dynamic" one. Returns -ENOMEM if a string cannot be allocated, otherwise
+ * whatever json_build() returns. */
+static int build_group_json(UserNamespaceInfo *userns_info, gid_t offset, JsonVariant **ret) {
+        _cleanup_free_ char *group_name = NULL, *text = NULL;
+        UserDisposition disposition;
+        bool is_range;
+
+        assert(userns_info);
+        assert(offset < userns_info->size);
+
+        if (asprintf(&group_name, "ns-%s-" GID_FMT, userns_info->name, offset) < 0)
+                return -ENOMEM;
+
+        is_range = userns_info->size > 1;
+        disposition = is_range ? USER_CONTAINER : USER_DYNAMIC;
+
+        if (is_range) {
+                if (asprintf(&text, "Group " GID_FMT " of Allocated Namespace %s", offset, userns_info->name) < 0)
+                        return -ENOMEM;
+        } else {
+                if (asprintf(&text, "Allocated Namespace %s", userns_info->name) < 0)
+                        return -ENOMEM;
+        }
+
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                          JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)),
+                                          JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(userns_info->start + offset)),
+                                          JSON_BUILD_PAIR("description", JSON_BUILD_STRING(text)),
+                                          JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")),
+                                          JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition)))));
+}
+
+/* Varlink handler for group record lookups. Accepts "gid" and/or "groupName" plus "service", and replies
+ * with a record synthesized by build_group_json() if the request matches a GID inside a registered user
+ * namespace allocation. Replies with NoRecordFound if nothing matches, with ConflictingRecordFound if
+ * name and GID disagree, and with EnumerationNotSupported if neither a name nor a GID was specified.
+ * Other failures are returned as negative errno-style values. */
+static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "gid",       JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, gid),        0 },
+                { "groupName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
+                { "service",   JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),    0 },
+                {}
+        };
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .gid = GID_INVALID,
+        };
+        gid_t offset;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.service, "io.systemd.NamespaceResource"))
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        if (p.group_name) {
+                _cleanup_free_ char *n = NULL;
+                const char *e, *f;
+
+                /* Our names have the form "ns-<namespace name>-<offset>": the offset is whatever follows
+                 * the last dash, the namespace name everything between the prefix and that dash. */
+
+                e = startswith(p.group_name, "ns-");
+                if (!e)
+                        goto not_found;
+
+                f = strrchr(e, '-');
+                if (!f)
+                        goto not_found;
+
+                if (parse_gid(f+1, &offset) < 0)
+                        goto not_found;
+
+                n = strndup(e, f - e);
+                if (!n)
+                        return log_oom();
+
+                r = userns_registry_load_by_name(
+                                /* registry_fd= */ -EBADF,
+                                n,
+                                &userns_info);
+                if (r == -ENOENT)
+                        goto not_found;
+                if (r < 0)
+                        return r;
+
+                if (offset >= userns_info->size) /* Outside of range? */
+                        goto not_found;
+
+                /* If the client also specified a GID it has to agree with what the name resolves to.
+                 * Compare the .gid member here, since this is the group lookup. */
+                if (gid_is_valid(p.gid) && p.gid != userns_info->start + offset)
+                        return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        } else if (gid_is_valid(p.gid)) {
+                gid_t start, gidmask;
+
+                /* For container GIDs the start of the range is the GID with the lower 16 bits masked
+                 * off, for dynamic GIDs the GID itself is the start of the (single ID) range. */
+
+                if (gid_is_container(p.gid))
+                        gidmask = (gid_t) UINT32_C(0xFFFF0000);
+                else if (gid_is_dynamic(p.gid))
+                        gidmask = (gid_t) UINT32_C(0xFFFFFFFF);
+                else
+                        goto not_found;
+
+                start = p.gid & gidmask;
+                offset = p.gid - start;
+
+                /* The registry is indexed by start UID; the same numeric range is used for GIDs */
+                r = userns_registry_load_by_start_uid(
+                                /* registry_fd= */ -EBADF,
+                                (uid_t) start,
+                                &userns_info);
+                if (r == -ENOENT)
+                        goto not_found;
+                if (r < 0)
+                        return r;
+
+                if (offset >= userns_info->size) /* Outside of range? */
+                        goto not_found;
+        } else
+                return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL);
+
+        r = build_group_json(userns_info, offset, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v))));
+
+not_found:
+        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+/* Varlink handler for membership lookups. The parameters are parsed and the service name is verified,
+ * but since no memberships are ever reported the reply is always an error: BadService if the request
+ * is not addressed to us, NoRecordFound otherwise. */
+static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch membership_fields[] = {
+                { "userName",  JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name),  0 },
+                { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
+                { "service",   JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service),    0 },
+                {}
+        };
+
+        LookupParameters lookup = {};
+        int k;
+
+        assert(parameters);
+
+        k = varlink_dispatch(link, parameters, membership_fields, &lookup);
+        if (k != 0)
+                return k;
+
+        /* We don't support auxiliary groups for namespace allocations */
+        if (streq_ptr(lookup.service, "io.systemd.NamespaceResource"))
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+        return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+}
+
+/* Checks whether the specified UID is unused: it must neither be recorded in our registry nor be known
+ * as user or as group to the user database. Returns > 0 if the UID is available, 0 if it is taken, and
+ * a negative errno-style value if one of the checks fails. */
+static int uid_is_available(
+                int registry_dir_fd,
+                uid_t candidate) {
+
+        int k;
+
+        assert(registry_dir_fd >= 0);
+
+        log_debug("Checking if UID " UID_FMT " is available.", candidate);
+
+        /* Known to our own registry? */
+        k = userns_registry_uid_exists(registry_dir_fd, candidate);
+        if (k != 0)
+                return k < 0 ? k : false;
+
+        /* Known as user? Only -ESRCH lets us continue. */
+        k = userdb_by_uid(candidate, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (k != -ESRCH)
+                return k < 0 ? k : false;
+
+        /* Known as group? Same logic as above. */
+        k = groupdb_by_gid(candidate, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (k != -ESRCH)
+                return k < 0 ? k : false;
+
+        log_debug("UID " UID_FMT " is available.", candidate);
+
+        return true;
+}
+
+/* Checks whether the specified user namespace name is unused: it must not be recorded in our registry,
+ * and the user/group name derived from it for offset 0 ("ns-<name>-0") must not be known to the user
+ * database. Returns > 0 if the name is available, 0 if it is taken, and a negative errno-style value
+ * if one of the checks fails. */
+static int name_is_available(
+                int registry_dir_fd,
+                const char *name) {
+
+        _cleanup_free_ char *derived = NULL;
+        int k;
+
+        assert(registry_dir_fd >= 0);
+        assert(name);
+
+        /* Known to our own registry? */
+        k = userns_registry_name_exists(registry_dir_fd, name);
+        if (k != 0)
+                return k < 0 ? k : false;
+
+        derived = strjoin("ns-", name, "-0");
+        if (!derived)
+                return -ENOMEM;
+
+        /* Known as user? Only -ESRCH lets us continue. */
+        k = userdb_by_name(derived, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (k != -ESRCH)
+                return k < 0 ? k : false;
+
+        /* Known as group? Same logic as above. */
+        k = groupdb_by_name(derived, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (k != -ESRCH)
+                return k < 0 ? k : false;
+
+        log_debug("Namespace name '%s' is available.", name);
+
+        return true;
+}
+
+static int allocate_now(
+ int registry_dir_fd,
+ UserNamespaceInfo *info,
+ int *ret_lock_fd) {
+
+ static const uint8_t hash_key[16] = {
+ 0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd,
+ 0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee,
+ };
+
+ _cleanup_(uid_range_freep) UIDRange *valid_range = NULL;
+ uid_t candidate, uidmin, uidmax, uidmask;
+ unsigned n_tries = 100;
+ int r;
+
+ /* Picks a free UID range of info->size UIDs for the user namespace described by 'info' and stores
+ * its first UID in info->start. The registry lock is taken before the registry is consulted; on
+ * success it is passed to the caller via *ret_lock_fd (if that is non-NULL), in all other cases
+ * it is released again when we return. info->size must be 1 or 0x10000.
+ *
+ * Returns the following error codes:
+ *
+ * EBUSY → all UID candidates we checked are already taken
+ * EEXIST → the name for the userns already exists
+ * EDEADLK → the userns is already registered in the registry
+ * EHOSTDOWN → the UID range we'd allocate from is not delegated to us
+ * EUSERS → the owner already has USERNS_PER_UID user namespaces registered
+ */
+
+ assert(registry_dir_fd >= 0);
+ assert(info);
+
+ switch (info->size) {
+
+ case 0x10000U:
+ uidmin = CONTAINER_UID_BASE_MIN;
+ uidmax = CONTAINER_UID_BASE_MAX;
+ uidmask = (uid_t) UINT32_C(0xFFFF0000);
+ break;
+
+ case 1U:
+ uidmin = DYNAMIC_UID_MIN;
+ uidmax = DYNAMIC_UID_MAX;
+ uidmask = (uid_t) UINT32_C(0xFFFFFFFF);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ r = uid_range_load_userns(/* path= */ NULL, UID_RANGE_USERNS_INSIDE, &valid_range);
+ if (r < 0)
+ return r;
+
+ /* Check early whether we have any chance at all given our own uid range */
+ if (!uid_range_overlaps(valid_range, uidmin, uidmax))
+ return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate.");
+
+ _cleanup_close_ int lock_fd = -EBADF;
+ lock_fd = userns_registry_lock(registry_dir_fd);
+ if (lock_fd < 0)
+ return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+ /* Enforce limit on user namespaces per UID */
+ r = userns_registry_per_uid(registry_dir_fd, info->owner);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine number of currently registered user namespaces per UID " UID_FMT ": %m", info->owner);
+ if (r >= USERNS_PER_UID)
+ return log_debug_errno(SYNTHETIC_ERRNO(EUSERS), "User already registered %i user namespaces, refusing.", r);
+
+ r = userns_registry_inode_exists(registry_dir_fd, info->userns_inode);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return -EDEADLK;
+
+ r = name_is_available(registry_dir_fd, info->name);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EEXIST;
+
+ for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */
+ candidate = random_u32()) { /* Use random values afterwards */
+
+ if (--n_tries <= 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available.");
+ /* Fold the value into the range starting at uidmin, then mask it down to the start of an allocation */
+ candidate = (candidate % (uidmax - uidmin)) + uidmin;
+ candidate &= uidmask;
+
+ if (!uid_range_covers(valid_range, candidate, info->size))
+ continue;
+
+ /* We only check the base UID for each range (!) */
+ r = uid_is_available(registry_dir_fd, candidate);
+ if (r < 0)
+ return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate);
+ if (r > 0) {
+ info->start = candidate;
+
+ log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1);
+
+ if (ret_lock_fd)
+ *ret_lock_fd = TAKE_FD(lock_fd);
+
+ return 0;
+ }
+
+ log_debug("UID range " UID_FMT " already taken.", candidate);
+ }
+}
+
+/* Writes the UID and GID maps of the specified user namespace, mapping userns_info->size IDs starting at
+ * userns_info->target inside the namespace to the IDs starting at userns_info->start outside of it.
+ *
+ * To get at the map files a short-lived child process is forked off that joins the user namespace and
+ * then freezes; the maps are then written via /proc/<pid>/uid_map and /proc/<pid>/gid_map of that
+ * child. The child is killed and reaped when we return.
+ *
+ * The child reports via an eventfd whether joining the namespace worked: it writes 1 on success and 2
+ * on failure. Reporting the failure matters: otherwise the parent would wait on the eventfd forever if
+ * setns() fails in the child. NOTE(review): a child that gets killed before it wrote anything is still
+ * not detected by this scheme.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) {
+        enum {
+                CHILD_JOINED = 1, /* child is inside the user namespace now */
+                CHILD_FAILED = 2, /* child could not join, and is about to exit */
+        };
+
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        _cleanup_close_ int efd = -EBADF;
+        uint64_t u;
+        int r;
+
+        assert(usernsfd >= 0);
+        assert(userns_info);
+        assert(uid_is_valid(userns_info->target));
+        assert(uid_is_valid(userns_info->start));
+        assert(userns_info->size > 0);
+        assert(userns_info->size <= UINT32_MAX - userns_info->start);
+
+        efd = eventfd(0, EFD_CLOEXEC);
+        if (efd < 0)
+                return log_error_errno(errno, "Failed to allocate eventfd(): %m");
+
+        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* child */
+
+                if (setns(usernsfd, CLONE_NEWUSER) < 0) {
+                        log_error_errno(errno, "Failed to join user namespace: %m");
+                        goto child_fail;
+                }
+
+                if (eventfd_write(efd, CHILD_JOINED) < 0) {
+                        log_error_errno(errno, "Failed to ping event fd: %m");
+                        goto child_fail;
+                }
+
+                freeze();
+
+        child_fail:
+                /* Best effort: wake up the parent so that it doesn't wait for us forever */
+                (void) eventfd_write(efd, CHILD_FAILED);
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Wait until child joined the user namespace */
+        if (eventfd_read(efd, &u) < 0)
+                return log_error_errno(errno, "Failed to wait for event fd: %m");
+        if (u != CHILD_JOINED)
+                return log_error_errno(SYNTHETIC_ERRNO(EPROTO), "Child process failed to join user namespace.");
+
+        /* Now write mapping */
+
+        _cleanup_free_ char *pmap = NULL;
+
+        if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pid) < 0)
+                return log_oom();
+
+        r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " " UID_FMT "\n", userns_info->target, userns_info->start, userns_info->size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m");
+
+        pmap = mfree(pmap);
+        if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pid) < 0)
+                return log_oom();
+
+        r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " " GID_FMT "\n", (gid_t) userns_info->target, (gid_t) userns_info->start, (gid_t) userns_info->size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m");
+
+        /* We are done! */
+
+        log_debug("Successfully configured user namespace.");
+        return 0;
+}
+
+static int test_userns_api_support(Varlink *link) {
+ int r;
+
+ assert(link);
+
+ /* We only expose the userns API if our manager daemon told us this is OK to do. It will set this
+ * boolean only if it managed to set up BPF correctly for itself (i.e. watches for userns going away
+ * via BPF APIs). This should make very sure we don't accidentally allow any of the userns stuff to
+ * go through without the BPF LSM in effect.
+ *
+ * Returns 0 if the API may be used. If $NSRESOURCE_API is false the client is sent the
+ * UserNamespaceInterfaceNotSupported error; if the variable cannot be evaluated a negative error is
+ * returned. Callers stop processing on any non-zero return value. */
+
+ r = getenv_bool("NSRESOURCE_API");
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse $NSRESOURCE_API: %m");
+ if (r == 0)
+ return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceInterfaceNotSupported", NULL);
+
+ return 0;
+}
+
+static int validate_name(Varlink *link, const char *name, char **ret) {
+ _cleanup_free_ char *un = NULL;
+ int r;
+ /* Validates the user namespace name requested by the client and returns the name to actually
+ * register in *ret (newly allocated). Returns 0 on success; invalid names are answered with an
+ * invalid parameter error for "name". Callers stop processing on any non-zero return value. */
+ assert(link);
+ assert(name);
+ assert(ret);
+
+ uid_t peer_uid;
+ r = varlink_get_peer_uid(link, &peer_uid);
+ if (r < 0)
+ return r;
+
+ if (peer_uid == 0) {
+ if (!userns_name_is_valid(name))
+ return varlink_error_invalid_parameter_name(link, "name");
+
+ un = strdup(name);
+ if (!un)
+ return -ENOMEM;
+ } else {
+ /* If the client is not root then prefix the name with the UID of the peer, so that they
+ * live in separate namespaces and cannot steal each other's names. */
+
+ if (asprintf(&un, UID_FMT "-%s", peer_uid, name) < 0)
+ return -ENOMEM;
+
+ if (!userns_name_is_valid(un))
+ return varlink_error_invalid_parameter_name(link, "name");
+ }
+
+ *ret = TAKE_PTR(un);
+ return 0;
+}
+
+/* Checks the "size" and "target" parameters of an allocation request: the size must be either 1 or
+ * 0x10000, and the target must be a valid UID that leaves room for 'size' IDs below UINT32_MAX.
+ * Returns 0 if both are fine, otherwise the result of sending an invalid parameter error naming the
+ * offending field. */
+static int validate_target_and_size(Varlink *link, unsigned target, unsigned size) {
+        bool size_ok, target_ok;
+
+        assert(link);
+
+        size_ok = size == 1U || size == 0x10000;
+        if (!size_ok)
+                return varlink_error_invalid_parameter_name(link, "size");
+
+        target_ok = uid_is_valid(target) && target <= UINT32_MAX - size;
+        if (!target_ok)
+                return varlink_error_invalid_parameter_name(link, "target");
+
+        return 0;
+}
+
+static int validate_userns(Varlink *link, int userns_fd) {
+ int r;
+ /* Generic checks on a user namespace fd received from a client: the fd must have safe flags, must
+ * refer to a user namespace other than our own, and — unless the client is root — the namespace
+ * must be owned by the client's UID. Returns 0 if the fd is acceptable; callers stop processing
+ * on any non-zero return value. */
+ assert(link);
+ assert(userns_fd >= 0);
+
+ r = fd_verify_safe_flags(userns_fd);
+ if (r < 0)
+ return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
+
+ /* Validate this is actually a valid user namespace fd */
+ r = fd_is_ns(userns_fd, CLONE_NEWUSER);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m");
+ if (r == 0)
+ return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+ /* And refuse the thing if it is our own */
+ r = is_our_namespace(userns_fd, NAMESPACE_USER);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to check if user namespace fd refers to our own user namespace: %m");
+ if (r > 0)
+ return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+ uid_t peer_uid;
+ r = varlink_get_peer_uid(link, &peer_uid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to acquire peer UID: %m");
+
+ if (peer_uid != 0) {
+ /* Refuse if the userns is not actually owned by our client. */
+ uid_t owner_uid;
+ if (ioctl(userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0)
+ return log_debug_errno(errno, "Failed to get owner UID of user namespace: %m");
+
+ if (owner_uid != peer_uid)
+ return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+ }
+
+ return 0;
+}
+
+/* Verifies that the specified user namespace has neither a UID nor a GID mapping configured yet.
+ * Returns 0 if both maps are empty, a negative errno-style value if a map cannot be read, and
+ * otherwise the result of sending an invalid parameter error for the fd. */
+static int validate_userns_is_empty(Varlink *link, int userns_fd) {
+        _cleanup_(uid_range_freep) UIDRange *uid_map = NULL, *gid_map = NULL;
+        int k;
+
+        assert(link);
+        assert(userns_fd >= 0);
+
+        /* First the UID map ... */
+        k = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &uid_map);
+        if (k < 0)
+                return log_debug_errno(k, "Failed to read userns UID range: %m");
+        if (!uid_range_is_empty(uid_map))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* ... then the GID map */
+        k = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &gid_map);
+        if (k < 0)
+                return log_debug_errno(k, "Failed to read userns GID range: %m");
+        if (!uid_range_is_empty(gid_map))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        return 0;
+}
+
+typedef struct AllocateParameters { /* Parsed parameters of vl_method_allocate_user_range() */
+ const char *name; /* "name": requested user namespace name (mandatory) */
+ unsigned size; /* "size": number of UIDs/GIDs to allocate (mandatory); only 1 and 0x10000 pass validation */
+ unsigned target; /* "target": optional; written as first field of the uid_map/gid_map lines by write_userns() */
+ unsigned userns_fd_idx; /* "userNamespaceFileDescriptor": index handed to varlink_take_fd() (mandatory) */
+} AllocateParameters;
+
+static int vl_method_allocate_user_range(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+ /* Varlink handler that allocates a UID/GID range for a user namespace passed in by the client:
+ * validates name, size, target and the (still unmapped) namespace fd, picks a free range, stores
+ * the allocation in the registry, registers the namespace in the BPF map, writes the ID maps and
+ * finally passes the namespace fd on via sd_pid_notifyf_with_fds(). 'userdata' points to the
+ * worker's 'struct userns_restrict_bpf *', which is set up here on first use. */
+ static const JsonDispatch dispatch_table[] = {
+ { "name", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AllocateParameters, name), JSON_MANDATORY },
+ { "size", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, size), JSON_MANDATORY },
+ { "target", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, target), 0 },
+ { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), JSON_MANDATORY },
+ {}
+ };
+
+ struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+ _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF, lock_fd = -EBADF;
+ _cleanup_free_ char *userns_name = NULL;
+ uid_t peer_uid;
+ struct stat userns_st;
+ AllocateParameters p = {
+ .size = UINT_MAX,
+ .userns_fd_idx = UINT_MAX,
+ };
+ int r;
+
+ assert(link);
+ assert(parameters);
+
+ r = test_userns_api_support(link);
+ if (r != 0)
+ return r;
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ r = validate_name(link, p.name, &userns_name);
+ if (r != 0)
+ return r;
+
+ r = validate_target_and_size(link, p.target, p.size);
+ if (r != 0)
+ return r;
+
+ userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+ if (userns_fd < 0)
+ return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
+
+ r = validate_userns(link, userns_fd);
+ if (r != 0)
+ return r;
+
+ r = validate_userns_is_empty(link, userns_fd);
+ if (r != 0)
+ return r;
+
+ if (fstat(userns_fd, &userns_st) < 0)
+ return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+
+ r = varlink_get_peer_uid(link, &peer_uid);
+ if (r < 0)
+ return r;
+ /* Set up the userns-restrict BPF object if this worker has not done so yet */
+ if (!*bpf) {
+ r = userns_restrict_install(/* pin= */ true, bpf);
+ if (r < 0)
+ return r;
+ }
+
+ registry_dir_fd = userns_registry_open_fd();
+ if (registry_dir_fd < 0)
+ return registry_dir_fd;
+
+ _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new();
+ if (!userns_info)
+ return -ENOMEM;
+
+ userns_info->name = TAKE_PTR(userns_name);
+ if (!userns_info->name)
+ return -ENOMEM;
+
+ userns_info->owner = peer_uid;
+ userns_info->userns_inode = userns_st.st_ino;
+ userns_info->size = p.size;
+ userns_info->target = p.target;
+ /* Pick a free range; on success userns_info->start is set and lock_fd holds the registry lock */
+ r = allocate_now(registry_dir_fd, userns_info, &lock_fd);
+ if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */
+ return varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL);
+ if (r == -EBUSY) /* All used up */
+ return varlink_error(link, "io.systemd.NamespaceResource.NoDynamicRange", NULL);
+ if (r == -EDEADLK)
+ return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL);
+ if (r == -EEXIST)
+ return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL);
+ if (r < 0)
+ return r;
+
+ r = userns_registry_store(registry_dir_fd, userns_info);
+ if (r < 0)
+ return r;
+
+ /* Register the userns in the BPF map with an empty allowlist */
+ r = userns_restrict_put_by_fd(
+ *bpf,
+ userns_fd,
+ /* replace= */ true,
+ /* mount_fds= */ NULL,
+ /* n_mount_fds= */ 0);
+ if (r < 0)
+ goto fail;
+
+ r = write_userns(userns_fd, userns_info);
+ if (r < 0)
+ goto fail;
+ /* Registry entry and ID maps are in place, release the registry lock */
+ lock_fd = safe_close(lock_fd);
+
+ /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */
+ r = sd_pid_notifyf_with_fds(
+ /* pid= */ 0,
+ /* unset_environment= */ false,
+ &userns_fd, 1,
+ "FDSTORE=1\n"
+ "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode);
+ if (r < 0)
+ goto fail;
+
+ /* Note, we'll not return UID values from the host, since the child might not run in the same
+ * user namespace as us. If they want to know the ranges they should read them off the userns fd, so
+ * that they are translated into their PoV */
+ return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+
+fail:
+ /* Note: we don't have to clean-up the BPF maps in the error path: the bpf map type used will
+ * automatically do that once the userns inode goes away */
+ userns_registry_remove(registry_dir_fd, userns_info);
+ return r;
+}
+
+/* Verifies that a user namespace that already has ID mappings is acceptable for registration:
+ *
+ *   - the outside UID range must not be empty,
+ *   - the outside GID range must equal the outside UID range,
+ *   - the inside GID range must equal the inside UID range,
+ *   - the first outside UID/GID must be the client's own UID/GID,
+ *   - if more than one UID is mapped, the first inside UID must be 0.
+ *
+ * Returns 0 if the namespace passes. Violations are answered with an invalid parameter error for the
+ * fd; failures to read a map or to query the peer are returned as negative errno-style values. Callers
+ * stop processing on any non-zero return value. */
+static int validate_userns_is_safe(Varlink *link, int userns_fd) {
+        int r;
+
+        assert(link);
+        assert(userns_fd >= 0);
+
+        /* Read the outside UID range and verify it isn't empty */
+        _cleanup_(uid_range_freep) UIDRange *outside_range = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &outside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns UID range: %m");
+        if (uid_range_is_empty(outside_range))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* Read the outside GID range and check it is the same as the UID range */
+        _cleanup_(uid_range_freep) UIDRange *outside_range_gid = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &outside_range_gid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns GID range: %m");
+        if (!uid_range_equal(outside_range, outside_range_gid))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* Read the inside UID range, and verify it matches the size of the outside UID range */
+        _cleanup_(uid_range_freep) UIDRange *inside_range = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &inside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns contents: %m");
+        if (uid_range_size(outside_range) != uid_range_size(inside_range))
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Uh, inside and outside UID range sizes don't match.");
+
+        /* Read the inside GID range, and verify it matches the inside UID range */
+        _cleanup_(uid_range_freep) UIDRange *inside_range_gid = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &inside_range_gid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns contents: %m");
+        if (!uid_range_equal(inside_range, inside_range_gid))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        uid_t peer_uid;
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+
+        /* This is a group ID, hence use the matching type */
+        gid_t peer_gid;
+        r = varlink_get_peer_gid(link, &peer_gid);
+        if (r < 0)
+                return r;
+
+        /* Insist that the first UID/GID in the range matches the client's UID/GID. The ranges are known
+         * to be non-empty at this point, given the checks above. */
+        if (outside_range->entries[0].start != peer_uid ||
+            outside_range_gid->entries[0].start != peer_gid)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* If there are more than one UID in the range, then also insist that the first UID maps to root inside the userns */
+        if (uid_range_size(outside_range) > 1 && inside_range->entries[0].start != 0)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        return 0;
+}
+
+typedef struct RegisterParameters { /* Parsed parameters of vl_method_register_user_namespace() */
+ const char *name; /* "name": requested user namespace name (mandatory) */
+ unsigned userns_fd_idx; /* "userNamespaceFileDescriptor": index handed to varlink_take_fd() (mandatory) */
+} RegisterParameters;
+
+static int vl_method_register_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+ /* Varlink handler that registers a user namespace which already has its ID maps set up: validates
+ * the name and the namespace fd (see validate_userns_is_safe()), stores an entry in the registry
+ * under the registry lock, registers the namespace in the BPF map and passes the namespace fd on
+ * via sd_pid_notifyf_with_fds(). No UID range is allocated here. 'userdata' points to the
+ * worker's 'struct userns_restrict_bpf *', which is set up here on first use. */
+ static const JsonDispatch dispatch_table[] = {
+ { "name", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(RegisterParameters, name), JSON_MANDATORY },
+ { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(RegisterParameters, userns_fd_idx), JSON_MANDATORY },
+ {}
+ };
+
+ struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+ _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF;
+ _cleanup_free_ char *userns_name = NULL;
+ uid_t peer_uid;
+ struct stat userns_st;
+ RegisterParameters p = {
+ .userns_fd_idx = UINT_MAX,
+ };
+ int r;
+
+ assert(link);
+ assert(parameters);
+
+ r = test_userns_api_support(link);
+ if (r != 0)
+ return r;
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ r = validate_name(link, p.name, &userns_name);
+ if (r != 0)
+ return r;
+
+ userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+ if (userns_fd < 0)
+ return userns_fd;
+
+ r = validate_userns(link, userns_fd);
+ if (r != 0)
+ return r;
+
+ r = validate_userns_is_safe(link, userns_fd);
+ if (r != 0)
+ return r;
+
+ if (fstat(userns_fd, &userns_st) < 0)
+ return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+
+ r = varlink_get_peer_uid(link, &peer_uid);
+ if (r < 0)
+ return r;
+ /* Set up the userns-restrict BPF object if this worker has not done so yet */
+ if (!*bpf) {
+ r = userns_restrict_install(/* pin= */ true, bpf);
+ if (r < 0)
+ return r;
+ }
+
+ registry_dir_fd = userns_registry_open_fd();
+ if (registry_dir_fd < 0)
+ return registry_dir_fd;
+
+ _cleanup_close_ int lock_fd = -EBADF;
+ lock_fd = userns_registry_lock(registry_dir_fd);
+ if (lock_fd < 0)
+ return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+ /* With the lock held, refuse namespaces and names that are already registered */
+ r = userns_registry_inode_exists(registry_dir_fd, userns_st.st_ino);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL);
+
+ r = name_is_available(registry_dir_fd, userns_name);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL);
+
+ _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new();
+ if (!userns_info)
+ return -ENOMEM;
+
+ userns_info->name = TAKE_PTR(userns_name);
+ if (!userns_info->name)
+ return -ENOMEM;
+
+ userns_info->owner = peer_uid;
+ userns_info->userns_inode = userns_st.st_ino;
+
+ r = userns_registry_store(registry_dir_fd, userns_info);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to update userns registry: %m");
+
+ /* Register the userns in the BPF map with an empty allowlist */
+ r = userns_restrict_put_by_fd(
+ *bpf,
+ userns_fd,
+ /* replace= */ true,
+ /* mount_fds= */ NULL,
+ /* n_mount_fds= */ 0);
+ if (r < 0)
+ goto fail;
+
+ /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */
+ r = sd_pid_notifyf_with_fds(
+ /* pid= */ 0,
+ /* unset_environment= */ false,
+ &userns_fd, 1,
+ "FDSTORE=1\n"
+ "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode);
+ if (r < 0)
+ goto fail;
+
+ return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+
+fail:
+ userns_registry_remove(registry_dir_fd, userns_info);
+ return r;
+}
+
+typedef struct AddMountParameters { /* Parsed parameters of vl_method_add_mount_to_user_namespace() */
+ unsigned userns_fd_idx; /* "userNamespaceFileDescriptor": index handed to varlink_take_fd() (mandatory) */
+ unsigned mount_fd_idx; /* "mountFileDescriptor": index handed to varlink_take_fd() (mandatory) */
+} AddMountParameters;
+
+/* Varlink handler bound to io.systemd.NamespaceResource.AddMountToUserNamespace.
+ *
+ * Takes a user namespace fd and a mount fd from the connection, checks that the caller is root and that the
+ * user namespace is present in the registry, and then adds the mount to that namespace's entry in the BPF
+ * allowlist map. 'userdata' points to the worker's userns_restrict_bpf pointer, which is filled in on demand.
+ *
+ * Returns whatever the Varlink reply/error call returns, or a negative errno-style value. */
+static int vl_method_add_mount_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch parameter_dispatch_table[] = {
+ { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, userns_fd_idx), JSON_MANDATORY },
+ { "mountFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, mount_fd_idx), JSON_MANDATORY },
+ {}
+ };
+
+ _cleanup_close_ int userns_fd = -EBADF, mount_fd = -EBADF, registry_dir_fd = -EBADF;
+ struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+ AddMountParameters p = {
+ .userns_fd_idx = UINT_MAX,
+ .mount_fd_idx = UINT_MAX,
+ };
+ int r, mnt_id = 0;
+ struct stat userns_st;
+ uid_t peer_uid;
+
+ assert(link);
+ assert(parameters);
+
+ /* Any non-zero result of the support check ends the call right here and is passed back unchanged */
+ r = test_userns_api_support(link);
+ if (r != 0)
+ return r;
+
+ /* Allowlisting arbitrary mounts is a privileged operation */
+ r = varlink_get_peer_uid(link, &peer_uid);
+ if (r < 0)
+ return r;
+ if (peer_uid != 0)
+ return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+ r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+ if (userns_fd < 0)
+ return userns_fd;
+
+ r = validate_userns(link, userns_fd);
+ if (r != 0)
+ return r;
+
+ /* The inode number of the userns fd is the key used for the registry lookup below */
+ if (fstat(userns_fd, &userns_st) < 0)
+ return -errno;
+
+ mount_fd = varlink_take_fd(link, p.mount_fd_idx);
+ if (mount_fd < 0)
+ return mount_fd;
+
+ r = fd_verify_safe_flags_full(mount_fd, O_PATH|O_DIRECTORY);
+ if (r < 0)
+ return log_debug_errno(r, "Mount file descriptor has unsafe flags set: %m");
+
+ r = fd_verify_directory(mount_fd);
+ if (r < 0)
+ return r;
+
+ /* The mount ID is only used for the debug messages at the end of this function */
+ r = path_get_mnt_id_at(mount_fd, NULL, &mnt_id);
+ if (r < 0)
+ return r;
+
+ registry_dir_fd = userns_registry_open_fd();
+ if (registry_dir_fd < 0)
+ return registry_dir_fd;
+
+ /* Take the registry lock for the remainder of the call; lock_fd is closed automatically on return */
+ _cleanup_close_ int lock_fd = -EBADF;
+ lock_fd = userns_registry_lock(registry_dir_fd);
+ if (lock_fd < 0)
+ return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+ /* Only user namespaces that have been registered before may get mounts allowlisted */
+ _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+ r = userns_registry_load_by_userns_inode(
+ registry_dir_fd,
+ userns_st.st_ino,
+ &userns_info);
+ if (r == -ENOENT)
+ return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+ if (r < 0)
+ return r;
+
+ /* Install the BPF program the first time this worker needs it */
+ if (!*bpf) {
+ r = userns_restrict_install(/* pin= */ true, bpf);
+ if (r < 0)
+ return r;
+ }
+
+ /* Pin the mount fd (sent as an FDSTORE=1 notification, named after the user namespace inode) */
+ r = sd_pid_notifyf_with_fds(
+ /* pid= */ 0,
+ /* unset_environment= */ false,
+ &mount_fd, 1,
+ "FDSTORE=1\n"
+ "FDNAME=userns-" INO_FMT "\n", userns_st.st_ino);
+ if (r < 0)
+ return r;
+
+ /* Add this mount to the user namespace's BPF map allowlist entry. */
+ r = userns_restrict_put_by_fd(
+ *bpf,
+ userns_fd,
+ /* replace= */ false,
+ &mount_fd,
+ 1);
+ if (r < 0)
+ return r;
+
+ /* The UID is only included in the message if the namespace has a UID range assigned (size > 0) */
+ if (userns_info->size > 0)
+ log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s' @ UID " UID_FMT ")",
+ mnt_id, userns_st.st_ino, userns_info->name, userns_info->start);
+ else
+ log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s')",
+ mnt_id, userns_st.st_ino, userns_info->name);
+
+ return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+}
+
+/* Verifies that 'fd' is an acceptable cgroup directory fd and stores its cgroup ID in *ret_cgroup_id.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. If the fd does not live on a cgroup2
+ * file system the result of varlink_error_invalid_parameter_name() for "controlGroupFileDescriptor" is
+ * returned instead. */
+static int validate_cgroup(Varlink *link, int fd, uint64_t *ret_cgroup_id) {
+        int r;
+
+        assert(link);
+        assert(fd >= 0);
+        assert(ret_cgroup_id);
+
+        /* Step 1: the open flags of the fd must be harmless (O_DIRECTORY is tolerated) */
+        r = fd_verify_safe_flags_full(fd, O_DIRECTORY);
+        if (r < 0)
+                return log_debug_errno(r, "Control group file descriptor has unsafe flags set: %m");
+
+        /* Step 2: it must refer to a directory */
+        r = fd_verify_directory(fd);
+        if (r < 0)
+                return log_debug_errno(r, "Verification that cgroup fd refers to directory failed: %m");
+
+        /* Step 3: that directory must be located on cgroupfs */
+        r = fd_is_fs_type(fd, CGROUP2_SUPER_MAGIC);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check if cgroup fd actually refers to cgroupfs: %m");
+        else if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "controlGroupFileDescriptor");
+
+        /* Step 4: finally, look up the ID of the cgroup */
+        r = cg_fd_get_cgroupid(fd, ret_cgroup_id);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read cgroup ID from cgroupfs: %m");
+
+        return 0;
+}
+
+/* Decoded parameters of the AddControlGroupToUserNamespace call. Both members are indexes into the set of
+ * file descriptors that arrived with the Varlink message; the handler resolves them via varlink_take_fd(). */
+typedef struct AddCGroupParameters {
+ unsigned userns_fd_idx;
+ unsigned cgroup_fd_idx;
+} AddCGroupParameters;
+
+/* Varlink handler bound to io.systemd.NamespaceResource.AddControlGroupToUserNamespace.
+ *
+ * Takes a user namespace fd and a cgroup directory fd from the connection, records the cgroup in the
+ * registry entry of the user namespace and then changes ownership of the cgroup directory (and of a few of
+ * its attribute files) to the UID/GID stored in userns_info->start. The caller must either be root, or be
+ * the owner of the user namespace and pass a cgroup that is owned by that same UID.
+ *
+ * Returns whatever the Varlink reply/error call returns, or a negative errno-style value. */
+static int vl_method_add_cgroup_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch parameter_dispatch_table[] = {
+                { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, userns_fd_idx), JSON_MANDATORY },
+                { "controlGroupFileDescriptor",  _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, cgroup_fd_idx), JSON_MANDATORY },
+                {}
+        };
+
+        _cleanup_close_ int userns_fd = -EBADF, cgroup_fd = -EBADF, registry_dir_fd = -EBADF;
+        AddCGroupParameters p = {
+                .userns_fd_idx = UINT_MAX,
+                .cgroup_fd_idx = UINT_MAX,
+        };
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        struct stat userns_st, cgroup_st;
+        uid_t peer_uid;
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        /* Any non-zero result of the support check ends the call right here and is passed back unchanged */
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* The inode number of the userns fd is the key used for the registry lookup below */
+        if (fstat(userns_fd, &userns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+
+        cgroup_fd = varlink_take_fd(link, p.cgroup_fd_idx);
+        if (cgroup_fd < 0)
+                return log_debug_errno(cgroup_fd, "Failed to take cgroup fd from Varlink connection: %m");
+
+        uint64_t cgroup_id;
+        r = validate_cgroup(link, cgroup_fd, &cgroup_id);
+        if (r != 0)
+                return r;
+
+        /* Needed for the ownership check applied to unprivileged callers further down */
+        if (fstat(cgroup_fd, &cgroup_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() cgroup fd: %m");
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        /* Take the registry lock for the remainder of the call; lock_fd is closed automatically on return */
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return lock_fd;
+
+        r = userns_registry_load_by_userns_inode(
+                        registry_dir_fd,
+                        userns_st.st_ino,
+                        &userns_info);
+        if (r == -ENOENT)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+        if (r < 0)
+                return r;
+
+        /* The user namespace must have a user assigned */
+        if (userns_info->size == 0)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceWithoutUserRange", NULL);
+        if (userns_info_has_cgroup(userns_info, cgroup_id))
+                return varlink_error(link, "io.systemd.NamespaceResource.ControlGroupAlreadyAdded", NULL);
+        /* We are about to add one more entry, hence refuse as soon as the limit has been reached (">=", not
+         * ">"), otherwise the entry could grow to USER_NAMESPACE_CGROUPS_DELEGATE_MAX + 1 cgroups. */
+        if (userns_info->n_cgroups >= USER_NAMESPACE_CGROUPS_DELEGATE_MAX)
+                return varlink_error(link, "io.systemd.NamespaceResource.TooManyControlGroups", NULL);
+
+        /* Registering a cgroup for this client is only allowed for the root or the owner of a userns */
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get connection peer: %m");
+        if (peer_uid != 0) {
+                if (peer_uid != userns_info->owner)
+                        return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+                /* The cgroup must be owned by the owner of the userns */
+                if (cgroup_st.st_uid != userns_info->owner)
+                        return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+        }
+
+        /* Update the registry first, and only then touch the cgroup itself */
+        r = userns_info_add_cgroup(userns_info, cgroup_id);
+        if (r < 0)
+                return r;
+
+        r = userns_registry_store(registry_dir_fd, userns_info);
+        if (r < 0)
+                return r;
+
+        if (fchown(cgroup_fd, userns_info->start, userns_info->start) < 0)
+                return log_debug_errno(errno, "Failed to change ownership of cgroup: %m");
+
+        if (fchmod(cgroup_fd, 0755) < 0)
+                return log_debug_errno(errno, "Failed to change access mode of cgroup: %m");
+
+        /* Best effort only: failures to adjust the individual attribute files are deliberately ignored */
+        FOREACH_STRING(attr, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") {
+                (void) fchmodat(cgroup_fd, attr, 0644, AT_SYMLINK_NOFOLLOW);
+                (void) fchownat(cgroup_fd, attr, userns_info->start, userns_info->start, AT_SYMLINK_NOFOLLOW);
+        }
+
+        log_debug("Granting ownership to cgroup %" PRIu64 " to userns " INO_FMT " ('%s' @ UID " UID_FMT ")",
+                  cgroup_id, userns_st.st_ino, userns_info->name, userns_info->start);
+
+        return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+}
+
+/* Derives a 64-bit identifier from the name of the user namespace and the interface name, using a keyed
+ * siphash24 with a fixed key. A NULL 'ifname' is hashed like the empty string. */
+static uint64_t hash_ifname_id(UserNamespaceInfo *userns_info, const char *ifname) {
+        static const uint8_t hash_key[] = {
+                0xc4, 0x6c, 0x96, 0xe8, 0xad, 0x37, 0x4d, 0x5f,
+                0xa1, 0xae, 0xfe, 0x70, 0x40, 0xed, 0x41, 0x5f,
+        };
+        struct siphash hasher;
+
+        assert(userns_info);
+
+        siphash24_init(&hasher, hash_key);
+
+        /* Input is: <userns name> NUL <ifname>; the NUL byte keeps the two strings apart */
+        siphash24_compress_string(userns_info->name, &hasher);
+        siphash24_compress_byte(0, &hasher);
+        siphash24_compress_string(strempty(ifname), &hasher);
+
+        return siphash24_finalize(&hasher);
+}
+
+/* Fills *ret with a MAC address derived from the user namespace name, the interface name and the index
+ * 'n', using a keyed siphash24 with a fixed key. A NULL 'ifname' is hashed like the empty string. The
+ * result is passed through ether_addr_mark_random() before returning. */
+static void hash_ether_addr(UserNamespaceInfo *userns_info, const char *ifname, uint64_t n, struct ether_addr *ret) {
+        static const uint8_t hash_key[] = {
+                0x36, 0xaa, 0xd1, 0x69, 0xc7, 0xe5, 0x4c, 0xaa,
+                0x1e, 0xb2, 0x9e, 0xb3, 0x3a, 0x6b, 0xd4, 0x71,
+        };
+        struct siphash hasher;
+        uint64_t index_le, digest;
+
+        assert(userns_info);
+        assert(ret);
+
+        siphash24_init(&hasher, hash_key);
+
+        /* Input is: <userns name> NUL <ifname> NUL <index>; the NUL bytes keep the fields apart */
+        siphash24_compress_string(userns_info->name, &hasher);
+        siphash24_compress_byte(0, &hasher);
+        siphash24_compress_string(strempty(ifname), &hasher);
+        siphash24_compress_byte(0, &hasher);
+
+        /* The index is fed in as little endian, so that the result does not depend on host endianness */
+        index_le = htole64(n);
+        siphash24_compress(&index_le, sizeof(index_le), &hasher);
+
+        digest = htole64(siphash24_finalize(&hasher));
+
+        /* The leading bytes of the digest become the address, hence it must be at least as large */
+        assert(sizeof(digest) >= sizeof_field(struct ether_addr, ether_addr_octet));
+
+        memcpy(ret->ether_addr_octet, &digest, sizeof_field(struct ether_addr, ether_addr_octet));
+        ether_addr_mark_random(ret);
+}
+
+/* Creates a veth pair via rtnetlink: one end is named 'ifname_host' with address 'mac_host', the peer end is
+ * named 'ifname_namespace' with address 'mac_namespace' and is placed into the network namespace referenced
+ * by 'netns_fd'. Afterwards 'altifname_host' (which may be NULL) is set as alternative name of the host
+ * side; a failure to do so only results in a warning.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int create_veth(
+                int netns_fd,
+                const char *ifname_host,
+                const char *altifname_host,
+                struct ether_addr *mac_host,
+                const char *ifname_namespace,
+                struct ether_addr *mac_namespace) {
+
+        int r;
+
+        assert(netns_fd >= 0);
+        assert(ifname_host);
+        assert(mac_host);
+        assert(ifname_namespace);
+        assert(mac_namespace);
+
+        log_debug("Creating veth link on host %s (%s) with address %s to container as %s with address %s",
+                  ifname_host, strna(altifname_host), ETHER_ADDR_TO_STR(mac_host),
+                  ifname_namespace, ETHER_ADDR_TO_STR(mac_namespace));
+
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        r = sd_netlink_open(&rtnl);
+        if (r < 0)
+                /* Log here too, like every other failure path of this function does */
+                return log_error_errno(r, "Failed to connect to netlink: %m");
+
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate netlink message: %m");
+
+        /* Attributes of the host side of the pair */
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_host);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink interface name: %m");
+
+        r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_host);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink MAC address: %m");
+
+        /* Three nested containers: IFLA_LINKINFO → IFLA_INFO_DATA ("veth") → VETH_INFO_PEER */
+        r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open netlink container: %m");
+
+        r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
+        if (r < 0)
+                return log_error_errno(r, "Failed to open netlink container: %m");
+
+        r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open netlink container: %m");
+
+        /* Attributes of the peer side, including the network namespace it shall be created in */
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_namespace);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink interface name: %m");
+
+        r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_namespace);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink MAC address: %m");
+
+        r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink namespace field: %m");
+
+        /* Close the three containers opened above again */
+        r = sd_netlink_message_close_container(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close netlink container: %m");
+
+        r = sd_netlink_message_close_container(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close netlink container: %m");
+
+        r = sd_netlink_message_close_container(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close netlink container: %m");
+
+        r = sd_netlink_call(rtnl, m, 0, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add new veth interfaces (%s:%s): %m", ifname_host, ifname_namespace);
+
+        /* The alternative name is a nicety only, hence failures are not fatal. altifname_host is optional,
+         * thus never pass it to %s unguarded. */
+        r = rtnl_set_link_alternative_names_by_ifname(&rtnl, ifname_host, STRV_MAKE(altifname_host));
+        if (r < 0)
+                log_warning_errno(r, "Failed to set alternative interface name to '%s', ignoring: %m", strna(altifname_host));
+
+        return 0;
+}
+
+/* Verifies that 'netns_fd' is an acceptable network namespace for a call concerning 'userns_fd': it must be
+ * a network namespace fd, must not be our own network namespace, must be owned by the user namespace
+ * 'userns_fd' refers to, and — unless the client is root — that user namespace must be owned by the
+ * client's UID.
+ *
+ * Returns 0 if everything checks out, a negative errno-style value on failure, or the result of
+ * varlink_error_invalid_parameter_name() for "networkNamespaceFileDescriptor" if a check is not passed. */
+static int validate_netns(Varlink *link, int userns_fd, int netns_fd) {
+        uid_t peer_uid, owner_uid;
+        int r;
+
+        assert(link);
+        assert(userns_fd >= 0);
+        assert(netns_fd >= 0);
+
+        r = fd_verify_safe_flags(netns_fd);
+        if (r < 0)
+                return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m");
+
+        /* Is this a network namespace fd at all? */
+        r = fd_is_ns(netns_fd, CLONE_NEWNET);
+        if (r < 0)
+                return r;
+        else if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        /* We won't operate on the network namespace we are running in ourselves */
+        r = is_our_namespace(netns_fd, NAMESPACE_NET);
+        if (r < 0)
+                return r;
+        else if (r > 0)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        /* Ask the kernel for the user namespace owning the netns, and compare it with the one we got passed */
+        _cleanup_close_ int owner_userns_fd = ioctl(netns_fd, NS_GET_USERNS);
+        if (owner_userns_fd < 0)
+                return -errno;
+
+        r = inode_same_at(owner_userns_fd, /* path_a= */ NULL, userns_fd, /* path_b= */ NULL, AT_EMPTY_PATH);
+        if (r < 0)
+                return r;
+        else if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+
+        /* Root may operate on any namespace, no further checks needed */
+        if (peer_uid == 0)
+                return 0;
+
+        /* For everybody else, the owning user namespace must belong to the client */
+        if (ioctl(owner_userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0)
+                return -errno;
+
+        if (owner_uid != peer_uid)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        return 0;
+}
+
+/* Decoded parameters of the AddNetworkToUserNamespace call. */
+typedef struct AddNetworkParameters {
+ /* Indexes into the set of file descriptors that arrived with the Varlink message */
+ unsigned userns_fd_idx;
+ unsigned netns_fd_idx;
+ /* Optional name for the interface inside the namespace ("namespaceInterfaceName"); NULL if not given */
+ const char *ifname;
+ /* Mandatory "mode" field; the handler only accepts "veth" */
+ const char *mode;
+} AddNetworkParameters;
+
+/* Varlink handler bound to io.systemd.NamespaceResource.AddNetworkToUserNamespace.
+ *
+ * Takes a user namespace fd and a network namespace fd from the connection and creates a veth pair between
+ * the host and that network namespace. The host side gets a hashed interface name plus a descriptive
+ * alternative name, the namespace side is called as requested by the client (or "host0" if no name was
+ * specified). MAC addresses of both sides are derived by hashing. The caller must be root or the owner of
+ * the registered user namespace.
+ *
+ * On success the reply carries "hostInterfaceName" and "namespaceInterfaceName". Returns whatever the
+ * Varlink reply/error call returns, or a negative errno-style value. */
+static int vl_method_add_netif_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch parameter_dispatch_table[] = {
+                { "userNamespaceFileDescriptor",    _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AddNetworkParameters, userns_fd_idx), JSON_MANDATORY },
+                { "networkNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AddNetworkParameters, netns_fd_idx),  JSON_MANDATORY },
+                { "namespaceInterfaceName",         JSON_VARIANT_STRING,        json_dispatch_const_string, offsetof(AddNetworkParameters, ifname),        0              },
+                { "mode",                           JSON_VARIANT_STRING,        json_dispatch_const_string, offsetof(AddNetworkParameters, mode),          JSON_MANDATORY },
+                {}
+        };
+
+        _cleanup_close_ int userns_fd = -EBADF, netns_fd = -EBADF, registry_dir_fd = -EBADF;
+        AddNetworkParameters p = {
+                .userns_fd_idx = UINT_MAX,
+                .netns_fd_idx = UINT_MAX, /* initialize both indexes, like the sibling handlers do */
+        };
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        struct stat userns_st;
+        uid_t peer_uid;
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        /* Any non-zero result of the support check ends the call right here and is passed back unchanged */
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return userns_fd;
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* The inode number of the userns fd is the key used for the registry lookup below */
+        if (fstat(userns_fd, &userns_st) < 0)
+                return -errno;
+
+        netns_fd = varlink_take_fd(link, p.netns_fd_idx);
+        if (netns_fd < 0)
+                return netns_fd;
+
+        r = validate_netns(link, userns_fd, netns_fd);
+        if (r != 0)
+                return r;
+
+        /* "veth" is the only mode implemented */
+        if (!streq_ptr(p.mode, "veth"))
+                return varlink_error_invalid_parameter_name(link, "mode");
+
+        /* Report the parameter under the name it actually carries in the dispatch table above */
+        if (p.ifname && !ifname_valid(p.ifname))
+                return varlink_error_invalid_parameter_name(link, "namespaceInterfaceName");
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        /* Take the registry lock for the remainder of the call; lock_fd is closed automatically on return */
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+        r = userns_registry_load_by_userns_inode(
+                        registry_dir_fd,
+                        userns_st.st_ino,
+                        &userns_info);
+        if (r == -ENOENT)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+        if (r < 0)
+                return r;
+
+        /* Registering a network interface for this client is only allowed for the root or the owner of a userns */
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+        if (peer_uid != 0 && peer_uid != userns_info->owner)
+                return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+        _cleanup_free_ char *ifname_host = NULL, *altifname_host = NULL;
+        const char *ifname_namespace = p.ifname ?: "host0";
+
+        /* The short ifname is just too short to generate readable and unique names where unprivileged users
+         * can't take each others names. Hence just hash it. The alternative name however contains more useful
+         * information. */
+        if (asprintf(&ifname_host, "ns-%08" PRIx64, hash_ifname_id(userns_info, p.ifname)) < 0)
+                return -ENOMEM;
+        strshorten(ifname_host, IFNAMSIZ-1); /* the formatted hash may be longer than an ifname may be */
+
+        if (p.ifname)
+                r = asprintf(&altifname_host, "ns-" UID_FMT "-%s-%s", userns_info->owner, userns_info->name, p.ifname);
+        else
+                r = asprintf(&altifname_host, "ns-" UID_FMT "-%s", userns_info->owner, userns_info->name);
+        if (r < 0)
+                return -ENOMEM;
+
+        struct ether_addr ether_addr_host, ether_addr_namespace;
+
+        /* Same hash input for both ends, told apart by the index: 0 → host side, 1 → namespace side */
+        hash_ether_addr(userns_info, p.ifname, 0, &ether_addr_host);
+        hash_ether_addr(userns_info, p.ifname, 1, &ether_addr_namespace);
+
+        r = create_veth(netns_fd,
+                        ifname_host, altifname_host, &ether_addr_host,
+                        ifname_namespace, &ether_addr_namespace);
+        if (r < 0)
+                return r;
+
+        log_debug("Adding veth tunnel %s from host to userns " INO_FMT " ('%s' @ UID " UID_FMT ", interface %s).",
+                  ifname_host, userns_st.st_ino, userns_info->name, userns_info->start, ifname_namespace);
+
+        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hostInterfaceName", JSON_BUILD_STRING(ifname_host)),
+                                                      JSON_BUILD_PAIR("namespaceInterfaceName", JSON_BUILD_STRING(ifname_namespace))));
+}
+
+/* Serves a single client connection on '_fd' until the peer disconnects or the connection has been idle
+ * for CONNECTION_IDLE_USEC. The fd is always consumed, also on failure.
+ *
+ * Returns 0 when the connection ended regularly, a negative errno-style value otherwise. */
+static int process_connection(VarlinkServer *server, int _fd) {
+        _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
+        _cleanup_(varlink_close_unrefp) Varlink *vl = NULL;
+        int r;
+
+        r = varlink_server_add_connection(server, fd, &vl);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add connection: %m");
+
+        /* The fd was handed over to the connection object, we must not close it ourselves anymore. And
+         * acquire a reference of our own for the object, which our cleanup handler drops again. */
+        TAKE_FD(fd);
+        vl = varlink_ref(vl);
+
+        /* File descriptors may travel in both directions on this connection */
+        r = varlink_set_allow_fd_passing_input(vl, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable fd passing for read: %m");
+
+        r = varlink_set_allow_fd_passing_output(vl, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable fd passing for write: %m");
+
+        while (true) {
+                r = varlink_process(vl);
+                if (r == -ENOTCONN) {
+                        log_debug("Connection terminated.");
+                        return 0;
+                }
+                if (r < 0)
+                        return log_error_errno(r, "Failed to process connection: %m");
+
+                if (r == 0) {
+                        /* Nothing was processed in this round, hence wait until something happens */
+                        r = varlink_wait(vl, CONNECTION_IDLE_USEC);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to wait for connection events: %m");
+                        if (r == 0)
+                                return 0; /* wait returned 0: give up on this connection */
+                }
+        }
+}
+
+/* Main function of the worker process: sets up a Varlink server with the NamespaceResource and UserDatabase
+ * methods and then accepts and serves connections from the single listening socket passed in, one at a
+ * time, until the iteration, runtime or idle limit is hit. Returns 0 on regular exit, a negative
+ * errno-style value on failure. */
+static int run(int argc, char *argv[]) {
+ _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *bpf = NULL;
+ usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
+ _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
+ _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
+ unsigned n_iterations = 0;
+ int m, listen_fd, r;
+
+ log_setup();
+
+ /* We expect exactly one listening socket to be passed to us */
+ m = sd_listen_fds(false);
+ if (m < 0)
+ return log_error_errno(m, "Failed to determine number of listening fds: %m");
+ if (m == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received.");
+ if (m > 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time.");
+
+ listen_fd = SD_LISTEN_FDS_START;
+
+ /* accept4() below is supposed to block, hence turn off non-blocking mode */
+ r = fd_nonblock(listen_fd, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m");
+
+ r = varlink_server_new(&server, VARLINK_SERVER_INHERIT_USERDATA);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate server: %m");
+
+ r = varlink_server_add_interface_many(
+ server,
+ &vl_interface_io_systemd_NamespaceResource,
+ &vl_interface_io_systemd_UserDatabase);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add UserDatabase and NamespaceResource interface to varlink server: %m");
+
+ r = varlink_server_bind_method_many(
+ server,
+ "io.systemd.NamespaceResource.AllocateUserRange", vl_method_allocate_user_range,
+ "io.systemd.NamespaceResource.RegisterUserNamespace", vl_method_register_user_namespace,
+ "io.systemd.NamespaceResource.AddMountToUserNamespace", vl_method_add_mount_to_user_namespace,
+ "io.systemd.NamespaceResource.AddControlGroupToUserNamespace", vl_method_add_cgroup_to_user_namespace,
+ "io.systemd.NamespaceResource.AddNetworkToUserNamespace", vl_method_add_netif_to_user_namespace,
+ "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record,
+ "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+ "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind methods: %m");
+
+ /* The userdata is a pointer to our 'bpf' pointer, so that method handlers can fill it in on demand */
+ varlink_server_set_userdata(server, &bpf);
+
+ /* A true value disables the idle timeout; any negative result is treated as a fatal error */
+ r = getenv_bool("NSRESOURCE_FIXED_WORKER");
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse NSRESOURCE_FIXED_WORKER: %m");
+ listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC;
+
+ /* Keep a reference to our parent, so that we can signal it when more workers are needed */
+ r = pidref_set_parent(&parent);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire pidfd of parent process: %m");
+
+ start_time = now(CLOCK_MONOTONIC);
+
+ for (;;) {
+ _cleanup_close_ int fd = -EBADF;
+ usec_t n;
+
+ /* Exit the worker in regular intervals, to flush out all memory use */
+ if (n_iterations++ > ITERATIONS_MAX) {
+ log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations);
+ break;
+ }
+
+ n = now(CLOCK_MONOTONIC);
+ if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) {
+ log_debug("Exiting worker, ran for %s, that's enough.",
+ FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0));
+ break;
+ }
+
+ /* last_busy_usec is USEC_INFINITY right after a connection was served (and initially): start
+ * measuring idle time from now in that case, otherwise check whether we idled for too long. */
+ if (last_busy_usec == USEC_INFINITY)
+ last_busy_usec = n;
+ else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) {
+ log_debug("Exiting worker, been idle for %s.",
+ FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0));
+ break;
+ }
+
+ /* The process name reflects whether we are blocked in accept4() or busy with a connection */
+ (void) rename_process("systemd-nsresourcework: waiting...");
+ fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC));
+ (void) rename_process("systemd-nsresourcework: processing...");
+
+ if (fd == -EAGAIN)
+ continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected
+ * after a while, let's check if it's time to exit though. */
+ if (fd == -EINTR)
+ continue; /* Might be that somebody attached via strace, let's just continue in that
+ * case */
+ if (fd < 0)
+ return log_error_errno(fd, "Failed to accept() from listening socket: %m");
+
+ if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) {
+ /* We only slept a very short time? If so, let's see if there are more sockets
+ * pending, and if so, let's ask our parent for more workers */
+
+ r = fd_wait_for_event(listen_fd, POLLIN, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
+
+ if (FLAGS_SET(r, POLLIN)) {
+ r = pidref_kill(&parent, SIGUSR2);
+ if (r == -ESRCH)
+ return log_error_errno(r, "Parent already died?");
+ if (r < 0)
+ return log_error_errno(r, "Failed to send SIGUSR2 signal to parent. %m");
+ }
+ }
+
+ /* Failures on an individual connection do not terminate the worker */
+ (void) process_connection(server, TAKE_FD(fd));
+ last_busy_usec = USEC_INFINITY;
+ }
+
+ return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/nsresourced/test-userns-restrict.c b/src/nsresourced/test-userns-restrict.c
new file mode 100644
index 0000000..f509321
--- /dev/null
+++ b/src/nsresourced/test-userns-restrict.c
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/eventfd.h>
+
+#include "fd-util.h"
+#include "main-func.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "namespace-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "tmpfile-util.h"
+#include "userns-restrict.h"
+
+/* Creates a new, detached tmpfs instance and returns a mount fd for it. Aborts the test on failure. */
+static int make_tmpfs_fsmount(void) {
+        /* Step 1: a file system context for tmpfs, and have the superblock created */
+        _cleanup_close_ int context_fd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+        assert_se(context_fd >= 0);
+        assert_se(fsconfig(context_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) >= 0);
+
+        /* Step 2: turn it into a mount object, which is what we hand to the caller */
+        _cleanup_close_ int mount_fd = fsmount(context_fd, FSMOUNT_CLOEXEC, 0);
+        assert_se(mount_fd >= 0);
+
+        return TAKE_FD(mount_fd);
+}
+
+/* Asserts that creating/opening the regular file 'fname' below 'parent_fd' succeeds. */
+static void test_works_reg(int parent_fd, const char *fname) {
+        _cleanup_close_ int file_fd = openat(parent_fd, fname, O_RDWR|O_CREAT|O_CLOEXEC, 0666);
+
+        assert_se(file_fd >= 0);
+}
+
+/* Asserts that creating/opening the regular file 'fname' below 'parent_fd' is refused with EPERM. */
+static void test_fails_reg(int parent_fd, const char *fname) {
+        int file_fd;
+
+        errno = 0;
+        file_fd = openat(parent_fd, fname, O_RDWR|O_CREAT|O_CLOEXEC, 0666);
+
+        assert_se(file_fd < 0);
+        assert_se(errno == EPERM);
+}
+
+/* Asserts that creating the directory 'fname' below 'parent_fd' succeeds. */
+static void test_works_dir(int parent_fd, const char *fname) {
+        int k = mkdirat(parent_fd, fname, 0666);
+
+        assert_se(k >= 0);
+}
+
+/* Asserts that creating the directory 'fname' below 'parent_fd' is refused with EPERM. */
+static void test_fails_dir(int parent_fd, const char *fname) {
+        int k;
+
+        errno = 0;
+        k = mkdirat(parent_fd, fname, 0666);
+
+        assert_se(k < 0);
+        assert_se(errno == EPERM);
+}
+
+/* Test for the userns-restrict BPF logic.
+ *
+ * The parent installs the BPF program, creates a user namespace and registers it with an empty mount
+ * allowlist. A forked child joins that user namespace and checks which mounts it may create files and
+ * directories on, while the parent extends the allowlist in two steps (first the host tmpfs, then the
+ * temporary directory). Parent and child synchronize via two eventfds: the child signals 'afd' when it is
+ * done with a stage, the parent signals 'bfd' once the allowlist has been extended.
+ *
+ * Returns EXIT_TEST_SKIP if the BPF program cannot be installed for lack of support or privileges. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
+        _cleanup_close_ int userns_fd = -EBADF, host_fd1 = -EBADF, host_tmpfs = -EBADF, afd = -EBADF, bfd = -EBADF;
+        _cleanup_(rm_rf_physical_and_freep) char *t = NULL;
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        int r;
+
+        log_set_max_level(LOG_DEBUG);
+        log_setup();
+
+        r = userns_restrict_install(/* pin= */ false, &obj);
+        if (ERRNO_IS_NOT_SUPPORTED(r)) {
+                log_notice("Skipping test, LSM-BPF logic not supported.");
+                return EXIT_TEST_SKIP;
+        }
+        if (ERRNO_IS_PRIVILEGE(r)) {
+                log_notice("Skipping test, lacking privileges.");
+                return EXIT_TEST_SKIP;
+        }
+        if (r < 0)
+                return r;
+
+        /* Two host-side locations to test with: a temporary directory, and a detached tmpfs */
+        assert_se(mkdtemp_malloc(NULL, &t) >= 0);
+
+        host_fd1 = open(t, O_DIRECTORY|O_CLOEXEC);
+        assert_se(host_fd1 >= 0);
+
+        host_tmpfs = make_tmpfs_fsmount();
+        assert_se(host_tmpfs >= 0);
+
+        userns_fd = userns_acquire("0 0 1", "0 0 1");
+        if (userns_fd < 0)
+                return log_error_errno(userns_fd, "Failed to make user namespace: %m");
+
+        /* Start out with an empty allowlist */
+        r = userns_restrict_put_by_fd(
+                        obj,
+                        userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to restrict user namespace: %m");
+
+        afd = eventfd(0, EFD_CLOEXEC);
+        bfd = eventfd(0, EFD_CLOEXEC);
+
+        assert_se(afd >= 0 && bfd >= 0);
+
+        r = safe_fork("(test)", FORK_DEATHSIG_SIGKILL, &pid);
+        assert_se(r >= 0);
+        if (r == 0) {
+                _cleanup_close_ int private_tmpfs = -EBADF;
+
+                assert_se(setns(userns_fd, CLONE_NEWUSER) >= 0);
+                assert_se(unshare(CLONE_NEWNS) >= 0);
+
+                /* Allocate tmpfs locally */
+                private_tmpfs = make_tmpfs_fsmount();
+
+                /* These two host mounts should be inaccessible */
+                test_fails_reg(host_fd1, "test");
+                test_fails_reg(host_tmpfs, "xxx");
+                test_fails_dir(host_fd1, "test2");
+                test_fails_dir(host_tmpfs, "xxx2");
+
+                /* But this mount created locally should be fine */
+                test_works_reg(private_tmpfs, "yyy");
+                test_works_dir(private_tmpfs, "yyy2");
+
+                /* Let's sync with the parent, so that it allowlists more stuff for us */
+                assert_se(eventfd_write(afd, 1) >= 0);
+                uint64_t x;
+                assert_se(eventfd_read(bfd, &x) >= 0);
+
+                /* And now we should also have access to the host tmpfs */
+                test_works_reg(host_tmpfs, "zzz");
+                test_works_reg(private_tmpfs, "aaa");
+                test_works_dir(host_tmpfs, "zzz2");
+                test_works_dir(private_tmpfs, "aaa2");
+
+                /* But this one should still fail */
+                test_fails_reg(host_fd1, "bbb");
+                test_fails_dir(host_fd1, "bbb2");
+
+                /* Sync again, to get more stuff allowlisted */
+                assert_se(eventfd_write(afd, 1) >= 0);
+                assert_se(eventfd_read(bfd, &x) >= 0);
+
+                /* Everything should now be allowed. As in the stages above, the names ending in "2" exercise
+                 * directory creation, also on the mount that was allowlisted last. */
+                test_works_reg(host_tmpfs, "ccc");
+                test_works_reg(host_fd1, "ddd");
+                test_works_reg(private_tmpfs, "eee");
+                test_works_dir(host_tmpfs, "ccc2");
+                test_works_dir(host_fd1, "ddd2");
+                test_works_dir(private_tmpfs, "eee2");
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Stage 1 done in the child: additionally allowlist the host tmpfs */
+        uint64_t x;
+        assert_se(eventfd_read(afd, &x) >= 0);
+
+        r = userns_restrict_put_by_fd(
+                        obj,
+                        userns_fd,
+                        /* replace= */ false,
+                        &host_tmpfs,
+                        1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to loosen user namespace: %m");
+
+        assert_se(eventfd_write(bfd, 1) >= 0);
+
+        /* Stage 2 done in the child: additionally allowlist the mount the temporary directory is on */
+        assert_se(eventfd_read(afd, &x) >= 0);
+
+        r = userns_restrict_put_by_fd(
+                        obj,
+                        userns_fd,
+                        /* replace= */ false,
+                        &host_fd1,
+                        1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to loosen user namespace: %m");
+
+        assert_se(eventfd_write(bfd, 1) >= 0);
+
+        assert_se(wait_for_terminate_and_check("(test)", pid, WAIT_LOG) >= 0);
+
+        /* The child has been waited for at this point. Clear 'pid' so that the sigkill_waitp cleanup
+         * handler does not act on a stale (and possibly reused) PID when we return. */
+        pid = 0;
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c
new file mode 100644
index 0000000..2cc1b1f
--- /dev/null
+++ b/src/nsresourced/userns-registry.c
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "chase.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "json.h"
+#include "missing_magic.h"
+#include "path-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "user-util.h"
+#include "userns-registry.h"
+
+/* Opens the nsresource registry directory via chase_and_open(), passing CHASE_MKDIR_0755 and
+ * O_CLOEXEC|O_DIRECTORY|O_CREAT. Returns the directory fd on success; on failure the error is logged at
+ * debug level and the result of log_debug_errno() is returned. */
+int userns_registry_open_fd(void) {
+        int registry_fd = chase_and_open(
+                        "/run/systemd/nsresource/registry",
+                        /* root= */ NULL,
+                        CHASE_MKDIR_0755,
+                        O_CLOEXEC|O_DIRECTORY|O_CREAT,
+                        /* ret_path= */ NULL);
+
+        if (registry_fd >= 0)
+                return registry_fd;
+
+        return log_debug_errno(registry_fd, "Failed to open registry dir: %m");
+}
+
+/* Opens the file "lock" inside the registry directory through xopenat_lock_full(), requesting
+ * LOCK_BSD/LOCK_EX. If dir_fd is negative the registry directory is opened just for this call.
+ * Returns the lock file fd (ownership passes to the caller) or a negative error. */
+int userns_registry_lock(int dir_fd) {
+        _cleanup_close_ int opened_fd = -EBADF, fd = -EBADF;
+
+        if (dir_fd < 0) {
+                /* No directory supplied by the caller, open the registry ourselves */
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        fd = xopenat_lock_full(dir_fd, "lock", O_CREAT|O_RDWR|O_CLOEXEC, /* xopen_flags= */ 0, 0600, LOCK_BSD, LOCK_EX);
+        if (fd >= 0)
+                return TAKE_FD(fd);
+
+        return log_debug_errno(fd, "Failed to open nsresource registry lock file: %m");
+}
+
+/* Allocates a UserNamespaceInfo whose owner, start and target are set to UID_INVALID and whose
+ * remaining fields are zero-initialized. Returns NULL if the allocation fails. */
+UserNamespaceInfo* userns_info_new(void) {
+        UserNamespaceInfo *fresh = new(UserNamespaceInfo, 1);
+
+        if (fresh)
+                *fresh = (UserNamespaceInfo) {
+                        .owner = UID_INVALID,
+                        .start = UID_INVALID,
+                        .target = UID_INVALID,
+                };
+
+        return fresh;
+}
+
+/* Releases a UserNamespaceInfo together with its name string and cgroup ID array. Accepts NULL.
+ * Always returns NULL, so the result can be assigned back to the caller's pointer. */
+UserNamespaceInfo *userns_info_free(UserNamespaceInfo *userns) {
+        if (userns) {
+                free(userns->cgroups);
+                free(userns->name);
+                free(userns);
+        }
+
+        return NULL;
+}
+
+/* JSON dispatch callback that fills in the cgroups/n_cgroups fields of the UserNamespaceInfo passed as
+ * userdata. A JSON null clears the list; anything other than an array of unsigned numbers is rejected
+ * with EINVAL. Duplicate IDs in the array are stored only once. Returns 0 on success, negative on
+ * failure. */
+static int dispatch_cgroups_array(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+ UserNamespaceInfo *info = ASSERT_PTR(userdata);
+ _cleanup_free_ uint64_t *cgroups = NULL;
+ size_t n_cgroups = 0;
+
+ /* Explicit null: drop whatever list was there before */
+ if (json_variant_is_null(variant)) {
+ info->cgroups = mfree(info->cgroups);
+ info->n_cgroups = 0;
+ return 0;
+ }
+
+ if (!json_variant_is_array(variant))
+ return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+ /* Size the buffer for the worst case, i.e. no duplicates at all */
+ cgroups = new(uint64_t, json_variant_elements(variant));
+ if (!cgroups)
+ return json_log_oom(variant, flags);
+
+ JsonVariant *e;
+ JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+ bool found = false;
+
+ if (!json_variant_is_unsigned(e))
+ return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a number.");
+
+ /* Linear scan over what we collected so far to detect repeated IDs */
+ FOREACH_ARRAY(cg, cgroups, n_cgroups)
+ if (*cg == json_variant_unsigned(e)) {
+ found = true;
+ break;
+ }
+ if (found) /* suppress duplicate */
+ continue;
+
+ cgroups[n_cgroups++] = json_variant_unsigned(e);
+ }
+
+ assert(n_cgroups <= json_variant_elements(variant));
+
+ /* Hand the new array over to the info object, releasing the previous one */
+ free_and_replace(info->cgroups, cgroups);
+ info->n_cgroups = n_cgroups;
+
+ return 0;
+}
+
+/* Parses the JSON registry file 'fn' (relative to dir_fd, or to a freshly opened registry directory if
+ * dir_fd is negative) into a newly allocated UserNamespaceInfo and validates it. On success returns 0
+ * and, if 'ret' is non-NULL, passes ownership of the object to the caller. Returns -EBADMSG if the
+ * record is inconsistent, or another negative error from opening, parsing or dispatching. */
+static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **ret) {
+
+ /* "owner", "name" and "userns" are mandatory, the UID range and cgroup list are optional */
+ static const JsonDispatch dispatch_table[] = {
+ { "owner", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserNamespaceInfo, owner), JSON_MANDATORY },
+ { "name", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserNamespaceInfo, name), JSON_MANDATORY },
+ { "userns", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(UserNamespaceInfo, userns_inode), JSON_MANDATORY },
+ { "start", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserNamespaceInfo, start), 0 },
+ { "size", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(UserNamespaceInfo, size), 0 },
+ { "target", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserNamespaceInfo, target), 0 },
+ { "cgroups", JSON_VARIANT_ARRAY, dispatch_cgroups_array, 0, 0 },
+ {}
+ };
+
+ _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ _cleanup_close_ int registry_fd = -EBADF;
+ int r;
+
+ if (dir_fd < 0) {
+ registry_fd = userns_registry_open_fd();
+ if (registry_fd < 0)
+ return registry_fd;
+
+ dir_fd = registry_fd;
+ }
+
+ r = json_parse_file_at(NULL, dir_fd, fn, 0, &v, NULL, NULL);
+ if (r < 0)
+ return r;
+
+ userns_info = userns_info_new();
+ if (!userns_info)
+ return -ENOMEM;
+
+ r = json_dispatch(v, dispatch_table, 0, userns_info);
+ if (r < 0)
+ return r;
+
+ /* Sanity checks: inode 0 and a range starting at UID 0 are never valid records */
+ if (userns_info->userns_inode == 0)
+ return -EBADMSG;
+ if (userns_info->start == 0)
+ return -EBADMSG;
+ if (userns_info->size == 0) {
+ /* No range: then neither start nor target may be set */
+ if (uid_is_valid(userns_info->start) || uid_is_valid(userns_info->target))
+ return -EBADMSG;
+ } else {
+ /* A range: both ends must be set ... */
+ if (!uid_is_valid(userns_info->start) || !uid_is_valid(userns_info->target))
+ return -EBADMSG;
+
+ /* ... and start+size as well as target+size must not exceed UINT32_MAX */
+ if (userns_info->size > UINT32_MAX - userns_info->start ||
+ userns_info->size > UINT32_MAX - userns_info->target)
+ return -EBADMSG;
+ }
+
+ if (ret)
+ *ret = TAKE_PTR(userns_info);
+ return 0;
+}
+
+/* Checks whether the registry directory contains an entry "u<start>.userns". Returns true or false,
+ * -ENOENT for an invalid UID, -ENOMEM on allocation failure, or -errno if faccessat() fails for a
+ * reason other than ENOENT. UID 0 is always reported as existing. */
+int userns_registry_uid_exists(int dir_fd, uid_t start) {
+        _cleanup_free_ char *entry = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+        if (start == 0)
+                return true;
+
+        if (asprintf(&entry, "u" UID_FMT ".userns", start) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, entry, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+        if (errno == ENOENT)
+                return false;
+
+        return -errno;
+}
+
+/* Checks whether the registry directory contains an entry "n<name>.userns". Returns true or false,
+ * -EINVAL if the name fails userns_name_is_valid(), -ENOMEM on allocation failure, or -errno if
+ * faccessat() fails for a reason other than ENOENT. */
+int userns_registry_name_exists(int dir_fd, const char *name) {
+        _cleanup_free_ char *entry = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (!userns_name_is_valid(name))
+                return -EINVAL;
+
+        entry = strjoin("n", name, ".userns");
+        if (!entry)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, entry, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+        if (errno == ENOENT)
+                return false;
+
+        return -errno;
+}
+
+/* Checks whether the registry directory contains an entry "i<inode>.userns". Returns true or false,
+ * -EINVAL for inode 0, -ENOMEM on allocation failure, or -errno if faccessat() fails for a reason
+ * other than ENOENT. */
+int userns_registry_inode_exists(int dir_fd, uint64_t inode) {
+        _cleanup_free_ char *entry = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (inode == 0) /* the parameter is unsigned, so zero is the only non-positive value */
+                return -EINVAL;
+
+        if (asprintf(&entry, "i%" PRIu64 ".userns", inode) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, entry, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+        if (errno == ENOENT)
+                return false;
+
+        return -errno;
+}
+
+/* Loads the registry record stored as "u<start>.userns". If dir_fd is negative the registry directory
+ * is opened for the duration of the call. Returns 0 on success (handing the record to *ret if ret is
+ * non-NULL), -ENOENT for an invalid UID, -EBADMSG if the record's start UID differs from the one asked
+ * for, or another negative error. */
+int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *loaded = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *entry = NULL;
+        int r;
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        if (asprintf(&entry, "u" UID_FMT ".userns", start) < 0)
+                return -ENOMEM;
+
+        r = userns_registry_load(dir_fd, entry, &loaded);
+        if (r < 0)
+                return r;
+
+        /* The record has to agree with the key it was found under */
+        if (loaded->start != start)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Loads the registry record stored as "i<inode>.userns". If dir_fd is negative the registry directory
+ * is opened for the duration of the call. Returns 0 on success (handing the record to *ret if ret is
+ * non-NULL), -ENOENT for inode 0, -EBADMSG if the record's inode differs from the one asked for, or
+ * another negative error. */
+int userns_registry_load_by_userns_inode(int dir_fd, uint64_t inode, UserNamespaceInfo **ret) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *loaded = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *entry = NULL;
+        int r;
+
+        if (inode == 0)
+                return -ENOENT;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        if (asprintf(&entry, "i%" PRIu64 ".userns", inode) < 0)
+                return -ENOMEM;
+
+        r = userns_registry_load(dir_fd, entry, &loaded);
+        if (r < 0)
+                return r;
+
+        /* The record has to agree with the key it was found under */
+        if (loaded->userns_inode != inode)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Loads the registry record stored as "n<name>.userns". If dir_fd is negative the registry directory
+ * is opened for the duration of the call. Returns 0 on success (handing the record to *ret if ret is
+ * non-NULL), -ENOENT if the name is not a valid user namespace name, -EBADMSG if the record carries a
+ * different name, or another negative error. */
+int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *loaded = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *entry = NULL;
+        int r;
+
+        assert(name);
+
+        /* A name we would never accept cannot have an entry either */
+        if (!userns_name_is_valid(name))
+                return -ENOENT;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        entry = strjoin("n", name, ".userns");
+        if (!entry)
+                return -ENOMEM;
+
+        r = userns_registry_load(dir_fd, entry, &loaded);
+        if (r < 0)
+                return r;
+
+        /* The record has to agree with the key it was found under */
+        if (!streq_ptr(loaded->name, name))
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Writes 'info' into the registry. The record is serialized as JSON into "i<inode>.userns" and then
+ * linked under "n<name>.userns", under "u<start>.userns" (only if a start UID is set) and under
+ * "o<owner>.owns/i<inode>.userns". If dir_fd is negative the registry directory is opened for the
+ * duration of the call. Returns 0 on success, -EINVAL if owner, name or inode are unset, or another
+ * negative error; on failure after the first file was written, the entries named so far are unlinked
+ * again on a best-effort basis. */
+int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
+ _cleanup_close_ int registry_fd = -EBADF;
+ int r;
+
+ assert(info);
+
+ if (!uid_is_valid(info->owner) ||
+ !info->name ||
+ info->userns_inode == 0)
+ return -EINVAL;
+
+ if (dir_fd < 0) {
+ registry_fd = userns_registry_open_fd();
+ if (registry_fd < 0)
+ return registry_fd;
+
+ dir_fd = registry_fd;
+ }
+
+ /* Build the "cgroups" array; it stays NULL (and the field is omitted below) if there are none */
+ _cleanup_(json_variant_unrefp) JsonVariant *cgroup_array = NULL;
+ FOREACH_ARRAY(cg, info->cgroups, info->n_cgroups) {
+ r = json_variant_append_arrayb(
+ &cgroup_array,
+ JSON_BUILD_UNSIGNED(*cg));
+ if (r < 0)
+ return r;
+ }
+
+ /* "start", "size" and "target" are only emitted when a start UID is set */
+ _cleanup_(json_variant_unrefp) JsonVariant *def = NULL;
+ r = json_build(&def, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("owner", JSON_BUILD_UNSIGNED(info->owner)),
+ JSON_BUILD_PAIR("name", JSON_BUILD_STRING(info->name)),
+ JSON_BUILD_PAIR("userns", JSON_BUILD_UNSIGNED(info->userns_inode)),
+ JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "start", JSON_BUILD_UNSIGNED(info->start)),
+ JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "size", JSON_BUILD_UNSIGNED(info->size)),
+ JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "target", JSON_BUILD_UNSIGNED(info->target)),
+ JSON_BUILD_PAIR_CONDITION(cgroup_array, "cgroups", JSON_BUILD_VARIANT(cgroup_array))));
+ if (r < 0)
+ return r;
+
+ _cleanup_free_ char *def_buf = NULL;
+ r = json_variant_format(def, 0, &def_buf);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to format userns JSON object: %m");
+
+ /* The file names double as a record of what the fail: path has to clean up; a name that is still
+ * NULL was never reached */
+ _cleanup_free_ char *reg_fn = NULL, *link1_fn = NULL, *link2_fn = NULL, *owner_fn = NULL, *uid_fn = NULL;
+ if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0)
+ return log_oom_debug();
+
+ /* Primary copy, keyed by user namespace inode */
+ r = write_string_file_at(dir_fd, reg_fn, def_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to write userns data to '%s' in registry: %m", reg_fn);
+
+ /* Lookup entry keyed by name */
+ link1_fn = strjoin("n", info->name, ".userns");
+ if (!link1_fn) {
+ r = log_oom_debug();
+ goto fail;
+ }
+
+ r = linkat_replace(dir_fd, reg_fn, dir_fd, link1_fn);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", link1_fn);
+ goto fail;
+ }
+
+ /* Lookup entry keyed by start UID, if there is one */
+ if (uid_is_valid(info->start)) {
+ if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start) < 0) {
+ r = log_oom_debug();
+ goto fail;
+ }
+
+ r = linkat_replace(dir_fd, reg_fn, dir_fd, link2_fn);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", link2_fn);
+ goto fail;
+ }
+ }
+
+ /* Per-owner subdirectory; it may already exist from an earlier registration */
+ if (asprintf(&uid_fn, "o" UID_FMT ".owns", info->owner) < 0) {
+ r = log_oom_debug();
+ goto fail;
+ }
+
+ if (mkdirat(dir_fd, uid_fn, 0755) < 0 && errno != EEXIST) {
+ r = log_debug_errno(errno, "Failed to create per-UID subdir '%s' of registry: %m", uid_fn);
+ goto fail;
+ }
+
+ if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0) {
+ r = log_oom_debug();
+ goto fail;
+ }
+
+ r = linkat_replace(dir_fd, reg_fn, dir_fd, owner_fn);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", owner_fn);
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ /* Best-effort rollback, results deliberately ignored */
+ if (reg_fn)
+ (void) unlinkat(dir_fd, reg_fn, /* flags= */ 0);
+ if (link1_fn)
+ (void) unlinkat(dir_fd, link1_fn, /* flags= */ 0);
+ if (link2_fn)
+ (void) unlinkat(dir_fd, link2_fn, /* flags= */ 0);
+ if (owner_fn)
+ (void) unlinkat(dir_fd, owner_fn, /* flags= */ 0);
+ if (uid_fn)
+ (void) unlinkat(dir_fd, uid_fn, AT_REMOVEDIR);
+
+ return r;
+}
+
+/* Removes the registry entries belonging to 'info': "i<inode>.userns", "n<name>.userns",
+ * "u<start>.userns" (only if a start UID is set), "o<owner>.owns/i<inode>.userns" and finally the
+ * per-owner directory itself. All removals are attempted even if an earlier one fails; the errors are
+ * combined through RET_GATHER() and returned. If dir_fd is negative the registry directory is opened for
+ * the duration of the call. */
+int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
+ _cleanup_close_ int registry_fd = -EBADF;
+ int ret = 0, r;
+
+ assert(info);
+
+ if (dir_fd < 0) {
+ registry_fd = userns_registry_open_fd();
+ if (registry_fd < 0)
+ return registry_fd;
+
+ dir_fd = registry_fd;
+ }
+
+ _cleanup_free_ char *reg_fn = NULL;
+ if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0)
+ return log_oom_debug();
+
+ ret = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0));
+
+ _cleanup_free_ char *link1_fn = NULL;
+ link1_fn = strjoin("n", info->name, ".userns");
+ if (!link1_fn)
+ return log_oom_debug();
+
+ RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link1_fn, 0)));
+
+ if (uid_is_valid(info->start)) {
+ _cleanup_free_ char *link2_fn = NULL;
+
+ if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start) < 0)
+ return log_oom_debug();
+
+ RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link2_fn, 0)));
+ }
+
+ _cleanup_free_ char *uid_fn = NULL;
+ if (asprintf(&uid_fn, "o" UID_FMT ".owns", info->owner) < 0)
+ return log_oom_debug();
+
+ _cleanup_free_ char *owner_fn = NULL;
+ if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0)
+ return log_oom_debug();
+
+ RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, owner_fn, 0)));
+
+ /* The per-owner directory is only removed once empty; ENOTEMPTY (other entries of the same owner
+ * still present) is not treated as an error */
+ r = RET_NERRNO(unlinkat(dir_fd, uid_fn, AT_REMOVEDIR));
+ if (r != -ENOTEMPTY)
+ RET_GATHER(ret, r);
+
+ return ret;
+}
+
+/* Returns true if cgroup_id is contained in the cgroup ID list of 'userns', false otherwise. */
+bool userns_info_has_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id) {
+        assert(userns);
+
+        for (size_t k = 0; k < userns->n_cgroups; k++)
+                if (userns->cgroups[k] == cgroup_id)
+                        return true;
+
+        return false;
+}
+
+/* Appends cgroup_id to the cgroup ID list of 'userns' unless it is already listed. Returns 1 if the ID
+ * was added, 0 if it was present already, -ENOMEM if the list could not be grown. */
+int userns_info_add_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id) {
+
+        if (userns_info_has_cgroup(userns, cgroup_id))
+                return 0;
+
+        /* Make room for one more element */
+        if (!GREEDY_REALLOC(userns->cgroups, userns->n_cgroups+1))
+                return -ENOMEM;
+
+        userns->cgroups[userns->n_cgroups] = cgroup_id;
+        userns->n_cgroups++;
+
+        return 1;
+}
+
+/* Removes the control group identified by cgroup_id. The cgroup is opened by ID, its path is checked
+ * to be below /sys/fs/cgroup/ (and not the root cgroup itself), and its parent directory is verified
+ * to be on a cgroup2 file system before the directory is removed via rm_rf_child().
+ *
+ * Returns 0 if the cgroup was removed, if it was already gone (-ESTALE on open), or if the final
+ * removal failed (which is only logged). Returns a negative error if the cgroup cannot be opened or
+ * any of the safety checks fails. */
+static int userns_destroy_cgroup(uint64_t cgroup_id) {
+        _cleanup_close_ int cgroup_fd = -EBADF, parent_fd = -EBADF;
+        int r;
+
+        cgroup_fd = cg_cgroupid_open(/* cgroupfsfd= */ -EBADF, cgroup_id);
+        if (cgroup_fd == -ESTALE) {
+                log_debug_errno(cgroup_fd, "Control group %" PRIu64 " already gone, ignoring: %m", cgroup_id);
+                return 0;
+        }
+        if (cgroup_fd < 0) /* The error is carried by the return value, errno may be stale here */
+                return log_debug_errno(cgroup_fd, "Failed to open cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        _cleanup_free_ char *path = NULL;
+        r = fd_get_path(cgroup_fd, &path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get path of cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        /* Refuse anything that is not strictly below the cgroup hierarchy root */
+        const char *e = path_startswith(path, "/sys/fs/cgroup/");
+        if (!e)
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Got cgroup path that doesn't start with /sys/fs/cgroup/, refusing: %s", path);
+        if (isempty(e))
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Got root cgroup path, which can't be right, refusing.");
+
+        log_debug("Path of cgroup %" PRIu64 " is: %s", cgroup_id, path);
+
+        _cleanup_free_ char *fname = NULL;
+        r = path_extract_filename(path, &fname);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to extract name of cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        parent_fd = openat(cgroup_fd, "..", O_CLOEXEC|O_DIRECTORY);
+        if (parent_fd < 0)
+                return log_debug_errno(errno, "Failed to open parent cgroup of %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        /* Safety check, never leave cgroupfs */
+        r = fd_is_fs_type(parent_fd, CGROUP2_SUPER_MAGIC);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine if parent directory of cgroup %" PRIu64 " is still a cgroup, ignoring: %m", cgroup_id);
+        if (!r)
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Parent directory of cgroup %" PRIu64 " is not a cgroup, refusing.", cgroup_id);
+
+        /* Drop our reference to the cgroup itself before removing it via its parent */
+        cgroup_fd = safe_close(cgroup_fd);
+
+        r = rm_rf_child(parent_fd, fname, REMOVE_ONLY_DIRECTORIES|REMOVE_PHYSICAL|REMOVE_CHMOD);
+        if (r < 0)
+                log_debug_errno(r, "Failed to remove delegated cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        return 0;
+}
+
+/* Calls userns_destroy_cgroup() for every cgroup ID recorded in 'userns' and then empties the list,
+ * regardless of whether the individual removals succeeded. Returns the combined result collected via
+ * RET_GATHER(). */
+int userns_info_remove_cgroups(UserNamespaceInfo *userns) {
+        int ret = 0;
+
+        assert(userns);
+
+        for (size_t k = 0; k < userns->n_cgroups; k++)
+                RET_GATHER(ret, userns_destroy_cgroup(userns->cgroups[k]));
+
+        userns->n_cgroups = 0;
+        userns->cgroups = mfree(userns->cgroups);
+
+        return ret;
+}
+
+/* Tells whether 'name' may be used as a user namespace name: it must be no longer than NAME_MAX,
+ * "n<name>.userns" must be a valid file name, and "ns-<name>-65535" must be a valid user/group name. */
+bool userns_name_is_valid(const char *name) {
+        const char *lookup_fn, *user_name;
+
+        /* Bound the length first, the strings below are assembled with alloca() */
+        if (strlen(name) > NAME_MAX)
+                return false;
+
+        /* The name has to fit into the file name of our lookup symlink */
+        lookup_fn = strjoina("n", name, ".userns");
+        if (!filename_is_valid(lookup_fn))
+                return false;
+
+        /* And it has to be usable as part of a user name */
+        user_name = strjoina("ns-", name, "-65535");
+        return valid_user_group_name(user_name, 0);
+}
+
+/* Counts the registry entries recorded for 'owner', i.e. the regular files named "i*.userns" inside
+ * the "o<owner>.owns" subdirectory. If dir_fd is negative the registry directory is opened for the
+ * duration of the call. Returns the count (0 if the subdirectory does not exist, capped at INT_MAX) or
+ * a negative error. */
+int userns_registry_per_uid(int dir_fd, uid_t owner) {
+ _cleanup_close_ int registry_fd = -EBADF;
+ int n = 0, r;
+
+ if (dir_fd < 0) {
+ registry_fd = userns_registry_open_fd();
+ if (registry_fd < 0)
+ return registry_fd;
+
+ dir_fd = registry_fd;
+ }
+
+ _cleanup_free_ char *uid_fn = NULL;
+ if (asprintf(&uid_fn, "o" UID_FMT ".owns", owner) < 0)
+ return log_oom_debug();
+
+ _cleanup_free_ DirectoryEntries *de = NULL;
+
+ r = readdir_all_at(dir_fd, uid_fn, RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE, &de);
+ if (r == -ENOENT) /* no subdirectory means nothing registered for this owner */
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to enumerate contents of '%s' sub-directory: %m", uid_fn);
+
+ FOREACH_ARRAY(i, de->entries, de->n_entries) {
+ struct dirent *e = *i;
+
+ /* Only regular files that follow the "i<...>.userns" naming scheme are counted */
+ if (e->d_type != DT_REG)
+ continue;
+
+ if (!startswith(e->d_name, "i") || !endswith(e->d_name, ".userns"))
+ continue;
+
+ n++;
+
+ if (n == INT_MAX) /* overflow safety check, just in case */
+ break;
+ }
+
+ return n;
+}
diff --git a/src/nsresourced/userns-registry.h b/src/nsresourced/userns-registry.h
new file mode 100644
index 0000000..9e66a6f
--- /dev/null
+++ b/src/nsresourced/userns-registry.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16
+
+/* One record of the user namespace registry, as stored in and loaded from its JSON files. */
+typedef struct UserNamespaceInfo {
+ /* JSON field "owner"; also names the per-owner registry subdirectory */
+ uid_t owner;
+ /* JSON field "name"; heap-allocated, released by userns_info_free() */
+ char *name;
+ /* JSON field "userns"; 0 is never a valid value */
+ uint64_t userns_inode;
+ /* JSON field "start"; UID_INVALID if no UID range is assigned */
+ uid_t start;
+ /* JSON field "size"; 0 if no UID range is assigned */
+ uint32_t size;
+ /* JSON field "target"; presumably the UID the range is mapped to inside the namespace — TODO confirm */
+ uid_t target;
+ /* JSON field "cgroups": array of n_cgroups cgroup IDs without duplicates; heap-allocated */
+ uint64_t *cgroups;
+ size_t n_cgroups;
+} UserNamespaceInfo;
+
+UserNamespaceInfo* userns_info_new(void);
+UserNamespaceInfo* userns_info_free(UserNamespaceInfo *userns);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(UserNamespaceInfo*, userns_info_free);
+
+bool userns_info_has_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id);
+int userns_info_add_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id);
+int userns_info_remove_cgroups(UserNamespaceInfo *userns);
+
+bool userns_name_is_valid(const char *name);
+
+int userns_registry_open_fd(void);
+int userns_registry_lock(int dir_fd);
+
+int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret);
+int userns_registry_load_by_userns_inode(int dir_fd, uint64_t userns, UserNamespaceInfo **ret);
+int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret);
+
+int userns_registry_store(int dir_fd, UserNamespaceInfo *info);
+int userns_registry_remove(int dir_fd, UserNamespaceInfo *info);
+
+int userns_registry_inode_exists(int dir_fd, uint64_t inode);
+int userns_registry_name_exists(int dir_fd, const char *name);
+int userns_registry_uid_exists(int dir_fd, uid_t start);
+
+int userns_registry_per_uid(int dir_fd, uid_t owner);
diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c
new file mode 100644
index 0000000..be33f49
--- /dev/null
+++ b/src/nsresourced/userns-restrict.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "userns-restrict.h"
+
+#if HAVE_VMLINUX_H
+
+#include <sched.h>
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "lsm-util.h"
+#include "missing_mount.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+
+#define USERNS_MAX (16U*1024U)
+#define MOUNTS_MAX 4096U
+
+#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
+#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"
+
+/* Destroys the BPF skeleton object via userns_restrict_bpf__destroy(). Accepts NULL. Always returns
+ * NULL so that the result can be assigned back to the caller's pointer. */
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+ (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */
+ return NULL;
+}
+
+/* Creates an unnamed BPF hash map with int-sized keys, uint32_t-sized values and room for MOUNTS_MAX
+ * entries, for use as inner map of the userns/mount-ID hash. Returns the map fd on success; on failure
+ * the error is logged at debug level and a negative error is returned. */
+static int make_inner_hash_map(void) {
+        int fd;
+
+        fd = compat_bpf_map_create(
+                        BPF_MAP_TYPE_HASH,
+                        NULL,
+                        sizeof(int),
+                        sizeof(uint32_t),
+                        MOUNTS_MAX,
+                        NULL);
+        if (fd < 0)
+                return log_debug_errno(errno, "Failed to allocate inner BPF map: %m");
+
+        return fd;
+}
+
+/* Opens, configures, loads and attaches the userns-restrict BPF-LSM programs.
+ *
+ * If 'pin' is true, every map gets a pin path below MAP_LINK_PREFIX and every program attachment is
+ * pinned below PROGRAM_LINK_PREFIX; attachments that are already pinned there are reused instead of
+ * attaching again. On success returns 0 and, if 'ret' is non-NULL, passes ownership of the skeleton
+ * object to the caller. Returns -EOPNOTSUPP (synthetic) if the "bpf" LSM is not supported or libbpf
+ * lacks bpf_object__next_map(), or another negative error. */
+int userns_restrict_install(
+ bool pin,
+ struct userns_restrict_bpf **ret) {
+
+ _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
+ _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
+ int r;
+
+ r = lsm_supported("bpf");
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");
+
+ r = dlopen_bpf();
+ if (r < 0)
+ return r;
+
+ /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
+ if (!sym_bpf_object__next_map)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");
+
+ obj = userns_restrict_bpf__open();
+ if (!obj)
+ return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");
+
+ if (pin) {
+ struct bpf_map *map;
+
+ /* libbpf will only create one level of dirs. Let's create the rest */
+ (void) mkdir_p(MAP_LINK_PREFIX, 0755);
+ (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);
+
+ /* Assign each map of the object a pin path named after the map */
+ map = sym_bpf_object__next_map(obj->obj, NULL);
+ while (map) {
+ _cleanup_free_ char *fn = NULL;
+
+ fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
+ if (!fn)
+ return log_oom();
+
+ r = sym_bpf_map__set_pin_path(map, fn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);
+
+ map = sym_bpf_object__next_map(obj->obj, map);
+ }
+ }
+
+ /* Size the maps before loading the object */
+ r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
+ if (r < 0)
+ return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");
+
+ r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
+ if (r < 0)
+ return log_error_errno(r, "Failed to size userns ring buffer: %m");
+
+ /* Dummy map to satisfy the verifier */
+ dummy_mnt_id_hash_fd = make_inner_hash_map();
+ if (dummy_mnt_id_hash_fd < 0)
+ return dummy_mnt_id_hash_fd;
+
+ r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set inner BPF map: %m");
+
+ r = userns_restrict_bpf__load(obj);
+ if (r < 0)
+ return log_error_errno(r, "Failed to load BPF object: %m");
+
+ /* Attach (or pick up the pinned attachment of) each program of the skeleton */
+ for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
+ _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+ struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
+ _cleanup_free_ char *fn = NULL;
+ bool linked = false;
+ const char *e;
+
+ /* The part of the program name after the common prefix is used as pin file name */
+ e = startswith(ps->name, "userns_restrict_");
+ assert(e);
+
+ if (pin) {
+ fn = path_join(PROGRAM_LINK_PREFIX, e);
+ if (!fn)
+ return log_oom();
+
+ /* Try to reuse an existing pinned link; ENOENT just means there is none yet */
+ link = sym_bpf_link__open(fn);
+ r = bpf_get_error_translated(link);
+ if (r < 0) {
+ if (r != -ENOENT)
+ return log_error_errno(r, "Unable to open pinned program link: %m");
+ link = NULL;
+ } else {
+ linked = true;
+ log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
+ }
+ }
+
+ if (!link) {
+ link = sym_bpf_program__attach(*ps->prog);
+ r = bpf_get_error_translated(link);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach LSM BPF program: %m");
+
+ log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
+ }
+
+ /* Only freshly created attachments need pinning */
+ if (pin && !linked) {
+ assert(fn);
+
+ r = sym_bpf_link__pin(link, fn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to pin LSM attachment: %m");
+ }
+
+ /* The skeleton takes over the link */
+ *ps->link = TAKE_PTR(link);
+ }
+
+ if (pin) {
+ r = sym_bpf_object__pin_maps(obj->obj, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to pin BPF maps: %m");
+ }
+
+ if (ret)
+ *ret = TAKE_PTR(obj);
+
+ return 0;
+}
+
+/* Allowlists the mounts referenced by 'mount_fds' for the user namespace with inode number
+ * 'userns_inode'.
+ *
+ * The mount IDs of the fds are added to the inner map registered for the inode in the outer
+ * userns_mnt_id_hash map. With replace=true a fresh inner map is installed, discarding any previous
+ * allowlist. With replace=false the existing inner map is extended, or a new one is created if the
+ * inode has none yet.
+ *
+ * Returns 0 on success, -EINVAL if the inode number does not fit into 32 bits, -ENOMEM on allocation
+ * failure, or another negative error. */
+int userns_restrict_put_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t userns_inode,
+                bool replace,
+                const int mount_fds[],
+                size_t n_mount_fds) {
+
+        _cleanup_close_ int inner_map_fd = -EBADF;
+        _cleanup_free_ int *mnt_ids = NULL;
+        uint32_t ino;
+        int r, outer_map_fd;
+
+        assert(obj);
+        assert(userns_inode != 0);
+        assert(n_mount_fds == 0 || mount_fds);
+
+        /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
+         * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
+         * inode let's refuse early, we can't support this with the current BPF code for now. */
+        if (userns_inode > UINT32_MAX)
+                return -EINVAL;
+
+        /* Use a key object of exactly the width of the map key, so that the right bytes are passed
+         * regardless of host endianness (same as userns_restrict_reset_by_inode() does) */
+        ino = (uint32_t) userns_inode;
+
+        mnt_ids = new(int, n_mount_fds);
+        if (!mnt_ids)
+                return -ENOMEM;
+
+        /* Resolve all mount IDs up front, so that we fail before touching any map */
+        for (size_t i = 0; i < n_mount_fds; i++) {
+                r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to get mount ID: %m");
+        }
+
+        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+        if (outer_map_fd < 0)
+                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+        if (replace) {
+                /* Add if missing, replace if already exists */
+                inner_map_fd = make_inner_hash_map();
+                if (inner_map_fd < 0)
+                        return inner_map_fd;
+
+                r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_ANY);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to replace map in inode hash: %m");
+        } else {
+                /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We
+                 * might race against each other, hence we try a couple of times */
+                for (size_t n_try = 10;; n_try--) {
+                        uint32_t innermap_id;
+
+                        if (n_try == 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
+                                                       "Still can't create inode entry in BPF map after 10 tries.");
+
+                        r = sym_bpf_map_lookup_elem(outer_map_fd, &ino, &innermap_id);
+                        if (r >= 0) {
+                                inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
+                                if (inner_map_fd < 0)
+                                        return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");
+
+                                break;
+                        }
+                        if (errno != ENOENT)
+                                return log_debug_errno(errno, "Failed to look up inode hash entry: %m");
+
+                        /* No entry for this user namespace yet. Let's create one */
+                        inner_map_fd = make_inner_hash_map();
+                        if (inner_map_fd < 0)
+                                return inner_map_fd;
+
+                        r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_NOEXIST);
+                        if (r >= 0)
+                                break;
+                        if (errno != EEXIST)
+                                return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");
+
+                        /* Somebody else registered a map in the meantime. Drop ours before retrying,
+                         * otherwise the fd would be overwritten and leaked in the next iteration. */
+                        inner_map_fd = safe_close(inner_map_fd);
+                }
+        }
+
+        FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
+                uint32_t dummy_value = 1; /* only the presence of the key matters */
+
+                r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to add mount ID to map: %m");
+
+                log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, userns_inode);
+        }
+
+        return 0;
+}
+
+/* Same as userns_restrict_put_by_inode(), but identifies the user namespace by a file descriptor.
+ * The fd is verified to refer to a user namespace, then its inode number (from fstat()) is used as key.
+ * Returns -EBADF (synthetic) if the fd is not a user namespace fd, or another negative error. */
+int userns_restrict_put_by_fd(
+ struct userns_restrict_bpf *obj,
+ int userns_fd,
+ bool replace,
+ const int mount_fds[],
+ size_t n_mount_fds) {
+
+ struct stat st;
+ int r;
+
+ assert(obj);
+ assert(userns_fd >= 0);
+ assert(n_mount_fds == 0 || mount_fds);
+
+ r = fd_is_ns(userns_fd, CLONE_NEWUSER);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m");
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+ if (fstat(userns_fd, &st) < 0)
+ return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+ return userns_restrict_put_by_inode(
+ obj,
+ st.st_ino,
+ replace,
+ mount_fds,
+ n_mount_fds);
+}
+
+/* Deletes the entry for user namespace inode 'ino' from the outer userns_mnt_id_hash map. Inode
+ * numbers that do not fit into 32 bits cannot be in the map and are treated as success. Returns 0 on
+ * success, negative on failure (including when sym_bpf_map_delete_elem() reports an error). */
+int userns_restrict_reset_by_inode(
+ struct userns_restrict_bpf *obj,
+ uint64_t ino) {
+
+ int r, outer_map_fd;
+ unsigned u;
+
+ assert(obj);
+ assert(ino != 0);
+
+ if (ino > UINT32_MAX) /* inodes larger than 32bit are definitely not included in our map, exit early */
+ return 0;
+
+ outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+ if (outer_map_fd < 0)
+ return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+ /* Narrow to the key width before handing a pointer to the key to libbpf */
+ u = (uint32_t) ino;
+
+ r = sym_bpf_map_delete_elem(outer_map_fd, &u);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);
+
+ return 0;
+}
+
+#else
+/* Stub implementations used when built without HAVE_VMLINUX_H: every operation fails with a synthetic
+ * EOPNOTSUPP (logged at error level), except for the free function, which does nothing. */
+
+/* Stub: BPF support is compiled out, always fails. */
+int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+/* Stub: nothing to release, always returns NULL. */
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+ return NULL;
+}
+
+/* Stub: BPF support is compiled out, always fails. */
+int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+/* Stub: BPF support is compiled out, always fails. */
+int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+/* Stub: BPF support is compiled out, always fails. */
+int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+#endif
diff --git a/src/nsresourced/userns-restrict.h b/src/nsresourced/userns-restrict.h
new file mode 100644
index 0000000..37aed7b
--- /dev/null
+++ b/src/nsresourced/userns-restrict.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+#if HAVE_VMLINUX_H
+#include "bpf/userns_restrict/userns-restrict-skel.h"
+#else
+struct userns_restrict_bpf;
+#endif
+
+int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret);
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj);
+
+int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds);
+int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds);
+
+int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct userns_restrict_bpf*, userns_restrict_bpf_free);