diff options
Diffstat (limited to '')
-rw-r--r-- | src/nsresourced/bpf/userns_restrict/meson.build | 25 | ||||
-rw-r--r-- | src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h | 17 | ||||
-rw-r--r-- | src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c | 179 | ||||
-rw-r--r-- | src/nsresourced/meson.build | 48 | ||||
-rw-r--r-- | src/nsresourced/nsresourced-manager.c | 647 | ||||
-rw-r--r-- | src/nsresourced/nsresourced-manager.h | 40 | ||||
-rw-r--r-- | src/nsresourced/nsresourced.c | 46 | ||||
-rw-r--r-- | src/nsresourced/nsresourcework.c | 1782 | ||||
-rw-r--r-- | src/nsresourced/test-userns-restrict.c | 182 | ||||
-rw-r--r-- | src/nsresourced/userns-registry.c | 646 | ||||
-rw-r--r-- | src/nsresourced/userns-registry.h | 42 | ||||
-rw-r--r-- | src/nsresourced/userns-restrict.c | 346 | ||||
-rw-r--r-- | src/nsresourced/userns-restrict.h | 22 |
13 files changed, 4022 insertions, 0 deletions
diff --git a/src/nsresourced/bpf/userns_restrict/meson.build b/src/nsresourced/bpf/userns_restrict/meson.build new file mode 100644 index 0000000..d773c75 --- /dev/null +++ b/src/nsresourced/bpf/userns_restrict/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('HAVE_VMLINUX_H') != 1 + subdir_done() +endif + +userns_restrict_bpf_o_unstripped = custom_target( + 'userns-restrict.bpf.unstripped.o', + input : 'userns-restrict.bpf.c', + output : 'userns-restrict.bpf.unstripped.o', + command : bpf_o_unstripped_cmd, + depends : vmlinux_h_dependency) + +userns_restrict_bpf_o = custom_target( + 'userns-restrict.bpf.o', + input : userns_restrict_bpf_o_unstripped, + output : 'userns-restrict.bpf.o', + command : bpf_o_cmd) + +userns_restrict_skel_h = custom_target( + 'userns-restrict.skel.h', + input : userns_restrict_bpf_o, + output : 'userns-restrict.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h new file mode 100644 index 0000000..271caf4 --- /dev/null +++ b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. 
+ */ + +#include "bpf-dlopen.h" + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__attach_skeleton sym_bpf_object__attach_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton + +#include "bpf/userns_restrict/userns-restrict.skel.h" diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c new file mode 100644 index 0000000..126422b --- /dev/null +++ b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* If offsetof() is implemented via __builtin_offset() then it doesn't work on current compilers, since the + * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to define + * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0 + * it does not.) */ +#undef offsetof +#undef container_of + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <errno.h> + +#ifndef bpf_core_cast +/* bpf_rdonly_cast() was introduced in libbpf commit 688879f together with + * the definition of a bpf_core_cast macro. 
So use that one to avoid + * defining a prototype for bpf_rdonly_cast */ +void *bpf_rdonly_cast(void *, __u32) __ksym; +#endif + +/* BPF module that implements an allowlist of mounts (identified by mount ID) for user namespaces (identified + * by their inode number in nsfs) that restricts creation of inodes (which would inherit the caller's UID/GID) + * or changing of ownership (similar). + * + * This hooks into the various path-based LSM entrypoints that control inode creation as well as chown(), and + * then looks up the calling process' user namespace in a global map of namespaces, which points us to + * another map that is simply a list of allowed mnt_ids. */ + +// FIXME: ACL adjustments are currently not blocked. There's no path-based LSM hook available in the kernel +// for setting xattrs or ACLs, hence we cannot easily block them, even though we want that. We can get away +// with ignoring this for now, as ACLs never define ownership, but purely access: i.e. ACLs never allow +// taking possession of an object, but only control access to it. Thus, things like suid access modes should +// not be reachable through it. It still sucks though that a user can persistently add an ACL entry to a file +// with their transient UIDs/GIDs. 
+ +/* kernel currently enforces a maximum usernamespace nesting depth of 32, see create_user_ns() in the kernel sources */ +#define USER_NAMESPACE_DEPTH_MAX 32U + +struct mnt_id_map { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */ + __type(key, int); + __type(value, int); +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */ + __type(key, unsigned); /* userns inode */ + __array(values, struct mnt_id_map); +} userns_mnt_id_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); +} userns_ringbuf SEC(".maps"); + +static inline struct mount *real_mount(struct vfsmount *mnt) { + return container_of(mnt, struct mount, mnt); +} + +static int validate_inode_on_mount(struct inode *inode, struct vfsmount *v) { + struct user_namespace *mount_userns, *task_userns, *p; + unsigned task_userns_inode; + struct task_struct *task; + void *mnt_id_map; + struct mount *m; + int mnt_id; + + /* Get user namespace from vfsmount */ + m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount)); + mount_userns = m->mnt_ns->user_ns; + + /* Get user namespace from task */ + task = (struct task_struct*) bpf_get_current_task_btf(); + task_userns = task->cred->user_ns; + + /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say + * yes immediately. */ + p = mount_userns; + for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) { + if (p == task_userns) + return 0; /* our task's user namespace (or a child thereof) owns this superblock: allow! */ + + p = p->parent; + if (!p) + break; + } + + /* Hmm, something is fishy if there's more than 32 levels of namespaces involved. Let's better be + * safe than sorry, and refuse. 
*/ + if (p) + return -EPERM; + + /* This is a mount foreign to our task's user namespace, let's consult our allow list */ + task_userns_inode = task_userns->ns.inum; + + mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode); + if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */ + return 0; + + mnt_id = m->mnt_id; + + /* Otherwise, say yes if the mount ID is allowlisted */ + if (bpf_map_lookup_elem(mnt_id_map, &mnt_id)) + return 0; + + return -EPERM; +} + +static int validate_path(const struct path *path, int ret) { + struct inode *inode; + struct vfsmount *v; + + if (ret != 0) /* propagate earlier error */ + return ret; + + inode = path->dentry->d_inode; + v = path->mnt; + + return validate_inode_on_mount(inode, v); +} + +SEC("lsm/path_chown") +int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) { + return validate_path(path, ret); +} + +SEC("lsm/path_mkdir") +int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) { + return validate_path(dir, ret); +} + +SEC("lsm/path_mknod") +int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev, int ret) { + return validate_path(dir, ret); +} + +SEC("lsm/path_symlink") +int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) { + return validate_path(dir, ret); +} + +SEC("lsm/path_link") +int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) { + return validate_path(new_dir, ret); +} + +SEC("kprobe/free_user_ns") +void BPF_KPROBE(userns_restrict_free_user_ns, struct work_struct *work) { + struct user_namespace *userns; + unsigned inode; + void *mnt_id_map; + + /* Inform userspace that a user namespace just went away. 
I wish there was a nicer way to hook into + * user namespaces being deleted than using kprobes, but couldn't find any. */ + + userns = bpf_rdonly_cast(container_of(work, struct user_namespace, work), + bpf_core_type_id_kernel(struct user_namespace)); + + inode = userns->ns.inum; + + mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode); + if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */ + return; + + bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0); +} + +static const char _license[] SEC("license") = "GPL"; diff --git a/src/nsresourced/meson.build b/src/nsresourced/meson.build new file mode 100644 index 0000000..cb131f0 --- /dev/null +++ b/src/nsresourced/meson.build @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +subdir('bpf/userns_restrict') + +systemd_nsresourcework_sources = files( + 'nsresourcework.c', + 'userns-restrict.c', + 'userns-registry.c', +) + +systemd_nsresourced_sources = files( + 'nsresourced-manager.c', + 'nsresourced.c', + 'userns-restrict.c', + 'userns-registry.c', +) + +userns_restrict_include = include_directories('.') + +if conf.get('HAVE_VMLINUX_H') == 1 + systemd_nsresourcework_sources += userns_restrict_skel_h + systemd_nsresourced_sources += userns_restrict_skel_h + + executables += [ + test_template + { + 'sources' : files('test-userns-restrict.c', 'userns-restrict.c') + userns_restrict_skel_h, + 'conditions' : ['ENABLE_NSRESOURCED', 'HAVE_VMLINUX_H'], + 'include_directories' : [ includes, userns_restrict_include ], + }, + ] +endif + +executables += [ + libexec_template + { + 'name' : 'systemd-nsresourcework', + 'conditions' : ['ENABLE_NSRESOURCED'], + 'sources' : systemd_nsresourcework_sources, + 'dependencies' : threads, + 'include_directories' : [ includes, userns_restrict_include ], + }, + libexec_template + { + 'name' : 'systemd-nsresourced', + 'conditions' : ['ENABLE_NSRESOURCED'], + 'sources' : systemd_nsresourced_sources, + 'dependencies' : threads, + 
'include_directories' : [ includes, userns_restrict_include ], + }, +] diff --git a/src/nsresourced/nsresourced-manager.c b/src/nsresourced/nsresourced-manager.c new file mode 100644 index 0000000..d87da58 --- /dev/null +++ b/src/nsresourced/nsresourced-manager.c @@ -0,0 +1,647 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/mount.h> +#include <sys/wait.h> + +#include "sd-daemon.h" + +#include "bpf-dlopen.h" +#include "build-path.h" +#include "common-signal.h" +#include "env-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "nsresourced-manager.h" +#include "parse-util.h" +#include "process-util.h" +#include "recurse-dir.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "umask-util.h" +#include "unaligned.h" +#include "user-util.h" +#include "userns-registry.h" +#include "userns-restrict.h" + +#define LISTEN_TIMEOUT_USEC (25 * USEC_PER_SEC) + +static int start_workers(Manager *m, bool explicit_request); + +static int on_worker_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + assert_se(!set_remove(m->workers_dynamic, s) != !set_remove(m->workers_fixed, s)); + sd_event_source_disable_unref(s); + + if (si->si_code == CLD_EXITED) { + if (si->si_status == EXIT_SUCCESS) + log_debug("Worker " PID_FMT " exited successfully.", si->si_pid); + else + log_warning("Worker " PID_FMT " died with a failure exit status %i, ignoring.", si->si_pid, si->si_status); + } else if (si->si_code == CLD_KILLED) + log_warning("Worker " PID_FMT " was killed by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status)); + else if (si->si_code == CLD_DUMPED) + log_warning("Worker " PID_FMT " dumped core by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status)); + else + log_warning("Got unexpected exit code via SIGCHLD, ignoring."); + + (void) 
start_workers(m, /* explicit_request= */ false); /* Fill up workers again if we fell below the low watermark */ + return 0; +} + +static int on_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + (void) start_workers(m, /* explicit_request=*/ true); /* Workers told us there's more work, let's add one more worker as long as we are below the high watermark */ + return 0; +} + +static int on_deferred_start_worker(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source); + + (void) start_workers(m, /* explicit_request=*/ false); + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + event_source_hash_ops, + sd_event_source, + (void (*)(const sd_event_source*, struct siphash*)) trivial_hash_func, + (int (*)(const sd_event_source*, const sd_event_source*)) trivial_compare_func, + sd_event_source_disable_unref); + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .listen_fd = -EBADF, + .worker_ratelimit = { + .interval = 2 * USEC_PER_SEC, + .burst = 250, + }, + .registry_fd = -EBADF, + }; + + r = sd_event_new(&m->event); + if (r < 0) + return r; + + r = sd_event_set_signal_exit(m->event, true); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, (SIGRTMIN+18)|SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL); + if (r < 0) + return r; + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + r = sd_event_set_watchdog(m->event, true); + if (r < 0) + log_debug_errno(r, "Failed to enable watchdog handling, ignoring: %m"); + + r = sd_event_add_signal(m->event, NULL, 
SIGUSR2|SD_EVENT_SIGNAL_PROCMASK, on_sigusr2, m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + set_free(m->workers_fixed); + set_free(m->workers_dynamic); + + m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source); + + safe_close(m->listen_fd); + +#if HAVE_VMLINUX_H + sd_event_source_disable_unref(m->userns_restrict_bpf_ring_buffer_event_source); + if (m->userns_restrict_bpf_ring_buffer) + sym_ring_buffer__free(m->userns_restrict_bpf_ring_buffer); + userns_restrict_bpf_free(m->userns_restrict_bpf); +#endif + + safe_close(m->registry_fd); + + sd_event_unref(m->event); + + return mfree(m); +} + +static size_t manager_current_workers(Manager *m) { + assert(m); + + return set_size(m->workers_fixed) + set_size(m->workers_dynamic); +} + +static int start_one_worker(Manager *m) { + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL; + bool fixed; + pid_t pid; + int r; + + assert(m); + + fixed = set_size(m->workers_fixed) < NSRESOURCE_WORKERS_MIN; + + r = safe_fork_full( + "(sd-worker)", + /* stdio_fds= */ NULL, + &m->listen_fd, 1, + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_CLOSE_ALL_FDS, + &pid); + if (r < 0) + return log_error_errno(r, "Failed to fork new worker child: %m"); + if (r == 0) { + char pids[DECIMAL_STR_MAX(pid_t)]; + /* Child */ + + if (m->listen_fd == 3) { + r = fd_cloexec(3, false); + if (r < 0) { + log_error_errno(r, "Failed to turn off O_CLOEXEC for fd 3: %m"); + _exit(EXIT_FAILURE); + } + } else { + if (dup2(m->listen_fd, 3) < 0) { /* dup2() creates with O_CLOEXEC off */ + log_error_errno(errno, "Failed to move listen fd to 3: %m"); + _exit(EXIT_FAILURE); + } + + safe_close(m->listen_fd); + } + + xsprintf(pids, PID_FMT, pid); + if (setenv("LISTEN_PID", pids, 1) < 0) { + log_error_errno(errno, "Failed to set $LISTEN_PID: %m"); + _exit(EXIT_FAILURE); + } + + if 
(setenv("LISTEN_FDS", "1", 1) < 0) { + log_error_errno(errno, "Failed to set $LISTEN_FDS: %m"); + _exit(EXIT_FAILURE); + } + + if (setenv("NSRESOURCE_FIXED_WORKER", one_zero(fixed), 1) < 0) { + log_error_errno(errno, "Failed to set $NSRESOURCE_FIXED_WORKER: %m"); + _exit(EXIT_FAILURE); + } + +#if HAVE_VMLINUX_H + bool supported = m->userns_restrict_bpf; +#else + bool supported = false; +#endif + + /* Tell the workers whether to enable the userns API */ + if (setenv("NSRESOURCE_API", one_zero(supported), 1) < 0) { + log_error_errno(errno, "Failed to set $NSRESOURCE_API: %m"); + _exit(EXIT_FAILURE); + } + + r = setenv_systemd_log_level(); + if (r < 0) { + log_error_errno(r, "Failed to set $SYSTEMD_LOG_LEVEL: %m"); + _exit(EXIT_FAILURE); + } + + r = invoke_callout_binary(SYSTEMD_NSRESOURCEWORK_PATH, STRV_MAKE("systemd-nsresourcework", "xxxxxxxxxxxxxxxx")); /* With some extra space rename_process() can make use of */ + log_error_errno(r, "Failed start worker process: %m"); + _exit(EXIT_FAILURE); + } + + r = sd_event_add_child(m->event, &source, pid, WEXITED, on_worker_exit, m); + if (r < 0) + return log_error_errno(r, "Failed to watch child " PID_FMT ": %m", pid); + + r = set_ensure_put( + fixed ? 
&m->workers_fixed : &m->workers_dynamic, + &event_source_hash_ops, + source); + if (r < 0) + return log_error_errno(r, "Failed to add child process to set: %m"); + + TAKE_PTR(source); + + return 0; +} + +static int start_workers(Manager *m, bool explicit_request) { + int r; + + assert(m); + + for (;;) { + size_t n; + + n = manager_current_workers(m); + if (n >= NSRESOURCE_WORKERS_MIN && (!explicit_request || n >= NSRESOURCE_WORKERS_MAX)) + break; + + if (!ratelimit_below(&m->worker_ratelimit)) { + + /* If we keep starting workers too often but none sticks, let's fail the whole + * daemon, something is wrong */ + if (n == 0) { + sd_event_exit(m->event, EXIT_FAILURE); + return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "Worker threads requested too frequently, but worker count is zero, something is wrong."); + } + + /* Otherwise, let's stop spawning more for a while. */ + log_warning("Worker threads requested too frequently, not starting new ones for a while."); + + if (!m->deferred_start_worker_event_source) { + r = sd_event_add_time( + m->event, + &m->deferred_start_worker_event_source, + CLOCK_MONOTONIC, + ratelimit_end(&m->worker_ratelimit), + /* accuracy_usec= */ 0, + on_deferred_start_worker, + m); + if (r < 0) + return log_error_errno(r, "Failed to allocate deferred start worker event source: %m"); + } + + break; + } + + r = start_one_worker(m); + if (r < 0) + return r; + + explicit_request = false; + } + + return 0; +} + +static void manager_release_userns_bpf(Manager *m, uint64_t inode) { +#if HAVE_VMLINUX_H + int r; + + assert(m); + + if (inode == 0) + return; + + assert(m->userns_restrict_bpf); + + r = userns_restrict_reset_by_inode(m->userns_restrict_bpf, inode); + if (r < 0) + return (void) log_warning_errno(r, "Failed to remove namespace inode from BPF map, ignoring: %m"); +#endif +} + +static void manager_release_userns_fds(Manager *m, uint64_t inode) { + int r; + + assert(m); + assert(inode != 0); + + r = sd_notifyf(/* unset_environment= */ false, + 
"FDSTOREREMOVE=1\n" + "FDNAME=userns-%" PRIu64 "\n", inode); + if (r < 0) + log_warning_errno(r, "Failed to send fd store removal message, ignoring: %m"); +} + +static void manager_release_userns_by_inode(Manager *m, uint64_t inode) { + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_close_ int lock_fd = -EBADF; + int r; + + assert(m); + assert(inode != 0); + + lock_fd = userns_registry_lock(m->registry_fd); + if (lock_fd < 0) + return (void) log_error_errno(lock_fd, "Failed to lock registry: %m"); + + r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info); + if (r < 0) + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode); + + if (userns_info && uid_is_valid(userns_info->start)) + log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", inode, userns_info->start); + else + log_debug("Removing user namespace mapping %" PRIu64 ".", inode); + + /* Remove the BPF rules */ + manager_release_userns_bpf(m, inode); + + /* Remove the resources from the fdstore */ + manager_release_userns_fds(m, inode); + + /* And finally remove the resources file from disk */ + if (userns_info) { + /* Remove the cgroups of this userns */ + r = userns_info_remove_cgroups(userns_info); + if (r < 0) + log_warning_errno(r, "Failed to remove cgroups of user namespace: %m"); + + r = userns_registry_remove(m->registry_fd, userns_info); + if (r < 0) + log_warning_errno(r, "Failed to remove user namespace '%s', ignoring.", userns_info->name); + } +} + +static int manager_scan_registry(Manager *m, Set **registry_inodes) { + _cleanup_free_ DirectoryEntries *de = NULL; + int r; + + assert(m); + assert(registry_inodes); + assert(m->registry_fd >= 0); + + r = readdir_all(m->registry_fd, RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + return log_error_errno(r, "Failed to enumerate registry."); + + for (size_t i = 0; i < de->n_entries; i++) { + struct dirent 
*dentry = de->entries[i]; + _cleanup_free_ char *u = NULL; + const char *e, *p; + uint64_t inode; + + p = startswith(dentry->d_name, "i"); + if (!p) + continue; + + e = endswith(p, ".userns"); + if (!e) + continue; + + u = strndup(p, e - p); + if (!u) + return log_oom(); + + r = safe_atou64(u, &inode); + if (r < 0) { + log_warning_errno(r, "Failed to parse userns inode number from '%s', skipping: %m", dentry->d_name); + continue; + } + + if (inode > UINT32_MAX) { /* namespace inode numbers are 32bit only right now */ + log_warning("userns inode number outside of 32bit range, skipping."); + continue; + } + + if (set_ensure_put(registry_inodes, NULL, UINT32_TO_PTR(inode)) < 0) + return log_oom(); + + log_debug("Found user namespace %" PRIu64 " in registry directory", inode); + } + + return 0; +} + +static int manager_make_listen_socket(Manager *m) { + static const union sockaddr_union sockaddr = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/io.systemd.NamespaceResource", + }; + int r; + + assert(m); + + if (m->listen_fd >= 0) + return 0; + + m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (m->listen_fd < 0) + return log_error_errno(errno, "Failed to bind on socket: %m"); + + (void) sockaddr_un_unlink(&sockaddr.un); + + WITH_UMASK(0000) + if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0) + return log_error_errno(errno, "Failed to bind socket: %m"); + + r = mkdir_p("/run/systemd/userdb", 0755); + if (r < 0) + return log_error_errno(r, "Failed to create /run/systemd/userdb: %m"); + + r = symlink_idempotent("../io.systemd.NamespaceResource", "/run/systemd/userdb/io.systemd.NamespaceResource", /* make_relative= */ false); + if (r < 0) + return log_error_errno(r, "Failed to symlink userdb socket: %m"); + + if (listen(m->listen_fd, SOMAXCONN) < 0) + return log_error_errno(errno, "Failed to listen on socket: %m"); + + return 1; +} + +static int manager_scan_listen_fds(Manager *m, Set **fdstore_inodes) { + 
_cleanup_strv_free_ char **names = NULL; + int n, r; + + assert(m); + assert(fdstore_inodes); + + n = sd_listen_fds_with_names(/* unset_environment= */ true, &names); + if (n < 0) + return log_error_errno(n, "Failed to determine number of passed file descriptors: %m"); + + for (int i = 0; i < n; i++) { + _cleanup_close_ int fd = SD_LISTEN_FDS_START + i; /* Take possession */ + const char *e; + + /* If this is a BPF allowlist related fd, just close it, but remember which user namespace inode it refers to */ + e = startswith(names[i], "userns-"); + if (e) { + uint64_t inode; + + r = safe_atou64(e, &inode); + if (r < 0) { + log_warning_errno(r, "Failed to parse UID from fd name '%s', ignoring: %m", e); + continue; + } + + if (inode > UINT32_MAX) { + log_warning("Inode number outside of 32bit range, ignoring"); + continue; + } + + if (set_ensure_put(fdstore_inodes, NULL, UINT32_TO_PTR(inode)) < 0) + return log_oom(); + + continue; + } + + /* We don't check the name for the stream socket, for compatibility with older versions */ + r = sd_is_socket(fd, AF_UNIX, SOCK_STREAM, 1); + if (r < 0) + return log_error_errno(r, "Failed to detect if passed file descriptor is a socket: %m"); + if (r > 0) { + if (m->listen_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Passed more than one AF_UNIX/SOCK_STREAM socket, refusing."); + + m->listen_fd = TAKE_FD(fd); + continue; + } + + log_warning("Closing passed file descriptor %i (%s) we don't recognize.", fd, names[i]); + } + + return 0; +} + +#if HAVE_VMLINUX_H +static int ringbuf_event(void *userdata, void *data, size_t size) { + Manager *m = ASSERT_PTR(userdata); + size_t n; + + if ((size % sizeof(unsigned int)) != 0) /* Not multiples of "unsigned int"? 
*/ + return -EIO; + + n = size / sizeof(unsigned int); + for (size_t i = 0; i < n; i++) { + const void *d; + uint64_t inode; + + d = (const uint8_t*) data + i * sizeof(unsigned int); + inode = unaligned_read_ne32(d); + + log_debug("Got BPF ring buffer notification that user namespace %" PRIu64 " is now dead.", inode); + manager_release_userns_by_inode(m, inode); + } + + return 0; +} + +static int on_ringbuf_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + int r; + + r = sym_ring_buffer__poll(m->userns_restrict_bpf_ring_buffer, 0); + if (r < 0) + return log_error_errno(r, "Got failure reading from BPF ring buffer: %m"); + + return 0; +} + +static int manager_setup_bpf(Manager *m) { + int rb_fd = -EBADF, poll_fd = -EBADF, r; + + assert(m); + assert(!m->userns_restrict_bpf); + assert(!m->userns_restrict_bpf_ring_buffer); + assert(!m->userns_restrict_bpf_ring_buffer_event_source); + + r = userns_restrict_install(/* pin= */ true, &m->userns_restrict_bpf); + if (r < 0) { + log_notice_errno(r, "Proceeding with user namespace interfaces disabled."); + return 0; + } + + rb_fd = sym_bpf_map__fd(m->userns_restrict_bpf->maps.userns_ringbuf); + if (rb_fd < 0) + return log_error_errno(rb_fd, "Failed to get fd of ring buffer: %m"); + + m->userns_restrict_bpf_ring_buffer = sym_ring_buffer__new(rb_fd, ringbuf_event, m, NULL); + if (!m->userns_restrict_bpf_ring_buffer) + return log_error_errno(errno, "Failed to allocate BPF ring buffer object: %m"); + + poll_fd = sym_ring_buffer__epoll_fd(m->userns_restrict_bpf_ring_buffer); + if (poll_fd < 0) + return log_error_errno(poll_fd, "Failed to get poll fd of ring buffer: %m"); + + r = sd_event_add_io( + m->event, + &m->userns_restrict_bpf_ring_buffer_event_source, + poll_fd, + EPOLLIN, + on_ringbuf_io, + m); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source for BPF ring buffer: %m"); + + return 0; +} +#else +static int manager_setup_bpf(Manager *m) { + 
log_notice("Not setting up BPF subsystem, as functionality has been disabled at compile time."); + return 0; +} +#endif + +int manager_startup(Manager *m) { + _cleanup_(set_freep) Set *fdstore_inodes = NULL, *registry_inodes = NULL; + void *p; + int r; + + assert(m); + assert(m->registry_fd < 0); + assert(m->listen_fd < 0); + + m->registry_fd = userns_registry_open_fd(); + if (m->registry_fd < 0) + return log_error_errno(m->registry_fd, "Failed to open registry directory: %m"); + + r = manager_setup_bpf(m); + if (r < 0) + return r; + + r = manager_scan_listen_fds(m, &fdstore_inodes); + if (r < 0) + return r; + + r = manager_scan_registry(m, &registry_inodes); + if (r < 0) + return r; + + /* If there are fd store resources tied to user namespaces not found in the registry, then release them */ + SET_FOREACH(p, fdstore_inodes) { + uint64_t inode; + + if (set_contains(registry_inodes, p)) + continue; + + inode = PTR_TO_UINT32(p); + + log_debug("Found stale fd store entry for user namespace %" PRIu64 ", removing.", inode); + manager_release_userns_by_inode(m, inode); + } + + r = manager_make_listen_socket(m); + if (r < 0) + return r; + + /* Let's make sure every accept() call on this socket times out after 25s. 
This allows workers to be + * GC'ed on idle */ + if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0) + return log_error_errno(errno, "Failed to se SO_RCVTIMEO: %m"); + + r = start_workers(m, /* explicit_request= */ false); + if (r < 0) + return r; + + return 0; +} diff --git a/src/nsresourced/nsresourced-manager.h b/src/nsresourced/nsresourced-manager.h new file mode 100644 index 0000000..5ecf378 --- /dev/null +++ b/src/nsresourced/nsresourced-manager.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-event.h" + +typedef struct Manager Manager; + +#include "hashmap.h" +#include "ratelimit.h" + +#define NSRESOURCE_WORKERS_MIN 5 +#define NSRESOURCE_WORKERS_MAX 4096 + +struct Manager { + sd_event *event; + + Set *workers_fixed; /* Workers 0…NSRESOURCE_WORKERS_MIN */ + Set *workers_dynamic; /* Workers NSRESOURCE_WORKERS_MIN+1…NSRESOURCE_WORKERS_MAX */ + + int listen_fd; + + RateLimit worker_ratelimit; + + sd_event_source *deferred_start_worker_event_source; + +#if HAVE_VMLINUX_H + struct userns_restrict_bpf *userns_restrict_bpf; + struct ring_buffer *userns_restrict_bpf_ring_buffer; + sd_event_source *userns_restrict_bpf_ring_buffer_event_source; +#endif + + int registry_fd; +}; + +int manager_new(Manager **ret); +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_startup(Manager *m); diff --git a/src/nsresourced/nsresourced.c b/src/nsresourced/nsresourced.c new file mode 100644 index 0000000..7056897 --- /dev/null +++ b/src/nsresourced/nsresourced.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/stat.h> +#include <sys/types.h> + +#include "daemon-util.h" +#include "nsresourced-manager.h" +#include "log.h" +#include "main-func.h" +#include "signal-util.h" + +static int run(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *m = NULL; + 
int r; + + log_setup(); + + umask(0022); + + if (argc != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments."); + + if (setenv("SYSTEMD_BYPASS_USERDB", "io.systemd.NamespaceResource", 1) < 0) + return log_error_errno(errno, "Failed to set $SYSTEMD_BYPASS_USERDB: %m"); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Could not create manager: %m"); + + r = manager_startup(m); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + + _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL; + notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c new file mode 100644 index 0000000..6bd2fed --- /dev/null +++ b/src/nsresourced/nsresourcework.c @@ -0,0 +1,1782 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <linux/nsfs.h> +#include <linux/veth.h> +#include <sys/eventfd.h> +#include <sys/stat.h> +#include <sys/wait.h> + +#include "sd-daemon.h" +#include "sd-netlink.h" + +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "group-record.h" +#include "io-util.h" +#include "lock-util.h" +#include "main-func.h" +#include "missing_magic.h" +#include "missing_mount.h" +#include "missing_syscall.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "netlink-util.h" +#include "process-util.h" +#include "random-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "strv.h" +#include "time-util.h" +#include "uid-classification.h" +#include "uid-range.h" +#include "user-record-nss.h" +#include "user-record.h" +#include "user-util.h" +#include "userdb.h" +#include 
"userns-registry.h" +#include "userns-restrict.h" +#include "varlink-io.systemd.NamespaceResource.h" +#include "varlink-io.systemd.UserDatabase.h" +#include "varlink.h" + +#define ITERATIONS_MAX 64U +#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE) +#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC) +#define CONNECTION_IDLE_USEC (15 * USEC_PER_SEC) +#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC) +#define USERNS_PER_UID 256 + +typedef struct LookupParameters { + const char *user_name; + const char *group_name; + union { + uid_t uid; + gid_t gid; + }; + const char *service; +} LookupParameters; + +static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, JsonVariant **ret) { + _cleanup_free_ char *name = NULL, *realname = NULL; + UserDisposition disposition; + int r; + + assert(userns_info); + assert(offset < userns_info->size); + + if (asprintf(&name, "ns-%s-" UID_FMT, userns_info->name, offset) < 0) + return -ENOMEM; + + if (userns_info->size > 1) { + disposition = USER_CONTAINER; + r = asprintf(&realname, "User " UID_FMT " of Allocated Namespace %s", offset, userns_info->name); + } else { + disposition = USER_DYNAMIC; + r = asprintf(&realname, "Allocated Namespace %s", userns_info->name); + } + if (r < 0) + return -ENOMEM; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(userns_info->start + offset)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)), + JSON_BUILD_PAIR("realName", JSON_BUILD_STRING(realname)), + JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")), + JSON_BUILD_PAIR("shell", JSON_BUILD_STRING(NOLOGIN)), + JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition))))); +} + +static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, 
VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 }, + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .uid = UID_INVALID, + }; + uid_t offset; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.NamespaceResource")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (p.user_name) { + _cleanup_free_ char *n = NULL; + const char *e, *f; + + e = startswith(p.user_name, "ns-"); + if (!e) + goto not_found; + + f = strrchr(e, '-'); + if (!f) + goto not_found; + + if (parse_uid(f+1, &offset) < 0) + goto not_found; + + n = strndup(e, f - e); + if (!n) + return log_oom(); + + r = userns_registry_load_by_name( + /* registry_fd= */ -EBADF, + n, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? 
*/ + goto not_found; + + if (uid_is_valid(p.uid) && p.uid != userns_info->start + offset) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + } else if (uid_is_valid(p.uid)) { + uid_t start, uidmask; + + if (uid_is_container(p.uid)) + uidmask = (uid_t) UINT32_C(0xFFFF0000); + else if (uid_is_dynamic(p.uid)) + uidmask = (uid_t) UINT32_C(0xFFFFFFFF); + else + goto not_found; + + start = p.uid & uidmask; + offset = p.uid - start; + + r = userns_registry_load_by_start_uid( + /* registry_fd= */ -EBADF, + start, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? */ + goto not_found; + } else + return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL); + + r = build_user_json(userns_info, offset, &v); + if (r < 0) + return r; + + return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v)))); + +not_found: + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static int build_group_json(UserNamespaceInfo *userns_info, gid_t offset, JsonVariant **ret) { + _cleanup_free_ char *name = NULL, *description = NULL; + UserDisposition disposition; + int r; + + assert(userns_info); + assert(offset < userns_info->size); + + if (asprintf(&name, "ns-%s-" GID_FMT, userns_info->name, offset) < 0) + return -ENOMEM; + + if (userns_info->size > 1) { + disposition = USER_CONTAINER; + r = asprintf(&description, "Group " GID_FMT " of Allocated Namespace %s", offset, userns_info->name); + } else { + disposition = USER_DYNAMIC; + r = asprintf(&description, "Allocated Namespace %s", userns_info->name); + } + if (r < 0) + return -ENOMEM; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(name)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(userns_info->start + offset)), + JSON_BUILD_PAIR("description", JSON_BUILD_STRING(description)), + 
JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition))))); +} + +static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .gid = GID_INVALID, + }; + gid_t offset; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.NamespaceResource")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (p.group_name) { + _cleanup_free_ char *n = NULL; + const char *e, *f; + + e = startswith(p.group_name, "ns-"); + if (!e) + goto not_found; + + f = strrchr(e, '-'); + if (!f) + goto not_found; + + if (parse_gid(f+1, &offset) < 0) + goto not_found; + + n = strndup(e, f - e); + if (!n) + return log_oom(); + + r = userns_registry_load_by_name( + /* registry_fd= */ -EBADF, + n, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? 
*/ + goto not_found; + + if (gid_is_valid(p.gid) && p.uid != userns_info->start + offset) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + } else if (gid_is_valid(p.gid)) { + gid_t start, gidmask; + + if (gid_is_container(p.gid)) + gidmask = (gid_t) UINT32_C(0xFFFF0000); + else if (gid_is_dynamic(p.gid)) + gidmask = (gid_t) UINT32_C(0xFFFFFFFF); + else + goto not_found; + + start = p.gid & gidmask; + offset = p.gid - start; + + r = userns_registry_load_by_start_uid( + /* registry_fd= */ -EBADF, + (uid_t) start, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? */ + goto not_found; + } else + return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL); + + r = build_group_json(userns_info, offset, &v); + if (r < 0) + return r; + + return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v)))); + +not_found: + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + LookupParameters p = {}; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.NamespaceResource")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + /* We don't support auxiliary groups for namespace allocations */ + return varlink_error(link, 
"io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static int uid_is_available( + int registry_dir_fd, + uid_t candidate) { + + int r; + + assert(registry_dir_fd >= 0); + + log_debug("Checking if UID " UID_FMT " is available.", candidate); + + r = userns_registry_uid_exists(registry_dir_fd, candidate); + if (r < 0) + return r; + if (r > 0) + return false; + + r = userdb_by_uid(candidate, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + r = groupdb_by_gid(candidate, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + log_debug("UID " UID_FMT " is available.", candidate); + + return true; +} + +static int name_is_available( + int registry_dir_fd, + const char *name) { + + _cleanup_free_ char *user_name = NULL; + int r; + + assert(registry_dir_fd >= 0); + assert(name); + + r = userns_registry_name_exists(registry_dir_fd, name); + if (r < 0) + return r; + if (r > 0) + return false; + + user_name = strjoin("ns-", name, "-0"); + if (!user_name) + return -ENOMEM; + + r = userdb_by_name(user_name, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + r = groupdb_by_name(user_name, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + log_debug("Namespace name '%s' is available.", name); + + return true; +} + +static int allocate_now( + int registry_dir_fd, + UserNamespaceInfo *info, + int *ret_lock_fd) { + + static const uint8_t hash_key[16] = { + 0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd, + 0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee, + }; + + _cleanup_(uid_range_freep) UIDRange *valid_range = NULL; + uid_t candidate, uidmin, uidmax, uidmask; + unsigned n_tries = 100; + int r; + + /* Returns the following error codes: + * + * EBUSY → all UID candidates we checked are already taken + * EEXIST → the name for the userns already exists + * EDEADLK → the userns is already registered in 
the registry + */ + + assert(registry_dir_fd >= 0); + assert(info); + + switch (info->size) { + + case 0x10000U: + uidmin = CONTAINER_UID_BASE_MIN; + uidmax = CONTAINER_UID_BASE_MAX; + uidmask = (uid_t) UINT32_C(0xFFFF0000); + break; + + case 1U: + uidmin = DYNAMIC_UID_MIN; + uidmax = DYNAMIC_UID_MAX; + uidmask = (uid_t) UINT32_C(0xFFFFFFFF); + break; + + default: + assert_not_reached(); + } + + r = uid_range_load_userns(/* path= */ NULL, UID_RANGE_USERNS_INSIDE, &valid_range); + if (r < 0) + return r; + + /* Check early whether we have any chance at all given our own uid range */ + if (!uid_range_overlaps(valid_range, uidmin, uidmax)) + return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate."); + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + /* Enforce limit on user namespaces per UID */ + r = userns_registry_per_uid(registry_dir_fd, info->owner); + if (r < 0) + return log_debug_errno(r, "Failed to determine number of currently registered user namespaces per UID " UID_FMT ": %m", info->owner); + if (r >= USERNS_PER_UID) + return log_debug_errno(SYNTHETIC_ERRNO(EUSERS), "User already registered %i user namespaces, refusing.", r); + + r = userns_registry_inode_exists(registry_dir_fd, info->userns_inode); + if (r < 0) + return r; + if (r > 0) + return -EDEADLK; + + r = name_is_available(registry_dir_fd, info->name); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */ + candidate = random_u32()) { /* Use random values afterwards */ + + if (--n_tries <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available."); + + candidate = (candidate % (uidmax - uidmin)) + uidmin; + candidate &= uidmask; + + if 
(!uid_range_covers(valid_range, candidate, info->size)) + continue; + + /* We only check the base UID for each range (!) */ + r = uid_is_available(registry_dir_fd, candidate); + if (r < 0) + return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate); + if (r > 0) { + info->start = candidate; + + log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1); + + if (ret_lock_fd) + *ret_lock_fd = TAKE_FD(lock_fd); + + return 0; + } + + log_debug("UID range " UID_FMT " already taken.", candidate); + } +} + +static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) { + _cleanup_(sigkill_waitp) pid_t pid = 0; + _cleanup_close_ int efd = -EBADF; + uint64_t u; + int r; + + assert(usernsfd >= 0); + assert(userns_info); + assert(uid_is_valid(userns_info->target)); + assert(uid_is_valid(userns_info->start)); + assert(userns_info->size > 0); + assert(userns_info->size <= UINT32_MAX - userns_info->start); + + efd = eventfd(0, EFD_CLOEXEC); + if (efd < 0) + return log_error_errno(errno, "Failed to allocate eventfd(): %m"); + + r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* child */ + + if (setns(usernsfd, CLONE_NEWUSER) < 0) { + log_error_errno(errno, "Failed to join user namespace: %m"); + goto child_fail; + } + + if (eventfd_write(efd, 1) < 0) { + log_error_errno(errno, "Failed to ping event fd: %m"); + goto child_fail; + } + + freeze(); + + child_fail: + _exit(EXIT_FAILURE); + } + + /* Wait until child joined the user namespace */ + if (eventfd_read(efd, &u) < 0) + return log_error_errno(errno, "Failed to wait for event fd: %m"); + + /* Now write mapping */ + + _cleanup_free_ char *pmap = NULL; + + if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pid) < 0) + return log_oom(); + + r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " " UID_FMT "\n", userns_info->target, userns_info->start, 
userns_info->size); + if (r < 0) + return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m"); + + pmap = mfree(pmap); + if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pid) < 0) + return log_oom(); + + r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " " GID_FMT "\n", (gid_t) userns_info->target, (gid_t) userns_info->start, (gid_t) userns_info->size); + if (r < 0) + return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m"); + + /* We are done! */ + + log_debug("Successfully configured user namespace."); + return 0; +} + +static int test_userns_api_support(Varlink *link) { + int r; + + assert(link); + + /* We only expose the userns API if our manager daemon told us this OK to do. It will set this + * boolean only if it managed to set up BPF correctly for itself (i.e. watches for userns going away + * via BPF APIs). This should make very sure we don't accidentally allow any of the userns stuff to + * go through without the BPF LSM in effect. */ + + r = getenv_bool("NSRESOURCE_API"); + if (r < 0) + return log_error_errno(r, "Failed to parse $NSRESOURCE_API: %m"); + if (r == 0) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceInterfaceNotSupported", NULL); + + return 0; +} + +static int validate_name(Varlink *link, const char *name, char **ret) { + _cleanup_free_ char *un = NULL; + int r; + + assert(link); + assert(name); + assert(ret); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (peer_uid == 0) { + if (!userns_name_is_valid(name)) + return varlink_error_invalid_parameter_name(link, "name"); + + un = strdup(name); + if (!un) + return -ENOMEM; + } else { + /* The the client is not root then prefix the name with the UID of the peer, so that they + * live in separate namespaces and cannot steal each other's names. 
*/ + + if (asprintf(&un, UID_FMT "-%s", peer_uid, name) < 0) + return -ENOMEM; + + if (!userns_name_is_valid(un)) + return varlink_error_invalid_parameter_name(link, "name"); + } + + *ret = TAKE_PTR(un); + return 0; +} + +static int validate_target_and_size(Varlink *link, unsigned target, unsigned size) { + assert(link); + + if (!IN_SET(size, 1U, 0x10000)) + return varlink_error_invalid_parameter_name(link, "size"); + + if (!uid_is_valid(target) || target > UINT32_MAX - size) + return varlink_error_invalid_parameter_name(link, "target"); + + return 0; +} + +static int validate_userns(Varlink *link, int userns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + + r = fd_verify_safe_flags(userns_fd); + if (r < 0) + return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m"); + + /* Validate this is actually a valid user namespace fd */ + r = fd_is_ns(userns_fd, CLONE_NEWUSER); + if (r < 0) + return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m"); + if (r == 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* And refuse the thing if it is our own */ + r = is_our_namespace(userns_fd, NAMESPACE_USER); + if (r < 0) + return log_debug_errno(r, "Failed to check if user namespace fd refers to our own user namespace: %m"); + if (r > 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return log_debug_errno(r, "Failed to acquire peer UID: %m"); + + if (peer_uid != 0) { + /* Refuse if the userns is not actually owned by our client. 
*/ + uid_t owner_uid; + if (ioctl(userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0) + return log_debug_errno(errno, "Failed to get owner UID of user namespace: %m"); + + if (owner_uid != peer_uid) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + } + + return 0; +} + +static int validate_userns_is_empty(Varlink *link, int userns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + + _cleanup_(uid_range_freep) UIDRange *range = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns UID range: %m"); + + if (!uid_range_is_empty(range)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + range = uid_range_free(range); + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns GID range: %m"); + + if (!uid_range_is_empty(range)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + return 0; +} + +typedef struct AllocateParameters { + const char *name; + unsigned size; + unsigned target; + unsigned userns_fd_idx; +} AllocateParameters; + +static int vl_method_allocate_user_range(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "name", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AllocateParameters, name), JSON_MANDATORY }, + { "size", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, size), JSON_MANDATORY }, + { "target", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, target), 0 }, + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), JSON_MANDATORY }, + {} + }; + + struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata); + _cleanup_close_ int userns_fd = 
-EBADF, registry_dir_fd = -EBADF, lock_fd = -EBADF; + _cleanup_free_ char *userns_name = NULL; + uid_t peer_uid; + struct stat userns_st; + AllocateParameters p = { + .size = UINT_MAX, + .userns_fd_idx = UINT_MAX, + }; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + r = validate_name(link, p.name, &userns_name); + if (r != 0) + return r; + + r = validate_target_and_size(link, p.target, p.size); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m"); + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + r = validate_userns_is_empty(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (!*bpf) { + r = userns_restrict_install(/* pin= */ true, bpf); + if (r < 0) + return r; + } + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new(); + if (!userns_info) + return -ENOMEM; + + userns_info->name = TAKE_PTR(userns_name); + if (!userns_info->name) + return -ENOMEM; + + userns_info->owner = peer_uid; + userns_info->userns_inode = userns_st.st_ino; + userns_info->size = p.size; + userns_info->target = p.target; + + r = allocate_now(registry_dir_fd, userns_info, &lock_fd); + if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */ + return varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL); + if (r == -EBUSY) /* All used up */ + return varlink_error(link, "io.systemd.NamespaceResource.NoDynamicRange", 
NULL); + if (r == -EDEADLK) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL); + if (r == -EEXIST) + return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL); + if (r < 0) + return r; + + r = userns_registry_store(registry_dir_fd, userns_info); + if (r < 0) + return r; + + /* Register the userns in the BPF map with an empty allowlist */ + r = userns_restrict_put_by_fd( + *bpf, + userns_fd, + /* replace= */ true, + /* mount_fds= */ NULL, + /* n_mount_fds= */ 0); + if (r < 0) + goto fail; + + r = write_userns(userns_fd, userns_info); + if (r < 0) + goto fail; + + lock_fd = safe_close(lock_fd); + + /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */ + r = sd_pid_notifyf_with_fds( + /* pid= */ 0, + /* unset_environment= */ false, + &userns_fd, 1, + "FDSTORE=1\n" + "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode); + if (r < 0) + goto fail; + + /* Note, we'll not return UID values from the host, since the child might not run in the same + * user namespace as us. 
If they want to know the ranges they should read them off the userns fd, so + * that they are translated into their PoV */ + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); + +fail: + /* Note: we don't have to clean-up the BPF maps in the error path: the bpf map type used will + * automatically do that once the userns inode goes away */ + userns_registry_remove(registry_dir_fd, userns_info); + return r; +} + +static int validate_userns_is_safe(Varlink *link, int userns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + + /* Read the outside UID range and verify it isn't empty */ + _cleanup_(uid_range_freep) UIDRange *outside_range = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &outside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns UID range: %m"); + if (uid_range_is_empty(outside_range)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* Read the outside GID range and check it is the same as the UID range */ + _cleanup_(uid_range_freep) UIDRange *outside_range_gid = NULL; + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &outside_range_gid); + if (r < 0) + return log_debug_errno(r, "Failed to read userns GID range: %m"); + if (!uid_range_equal(outside_range, outside_range_gid)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* Read the inside UID range, and verify it matches the size of the outside UID range */ + _cleanup_(uid_range_freep) UIDRange *inside_range = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &inside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns contents: %m"); + if (uid_range_size(outside_range) != uid_range_size(inside_range)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Uh, inside and outside UID range sizes don't match."); + + /* Read the inside GID range, and verify it matches the inside UID range */ 
+ _cleanup_(uid_range_freep) UIDRange *inside_range_gid = NULL; + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &inside_range_gid); + if (r < 0) + return log_debug_errno(r, "Failed to read userns contents: %m"); + if (!uid_range_equal(inside_range, inside_range_gid)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + uid_t peer_gid; + r = varlink_get_peer_gid(link, &peer_gid); + if (r < 0) + return r; + + /* Insist that the first UID/GID in the range matches the client's UID/GID */ + if (outside_range->entries[0].start != peer_uid || + outside_range_gid->entries[0].start != peer_gid) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* If there are more than one UID in the range, then also insist that the first UID maps to root inside the userns */ + if (uid_range_size(outside_range) > 1 && inside_range->entries[0].start != 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + return 0; +} + +typedef struct RegisterParameters { + const char *name; + unsigned userns_fd_idx; +} RegisterParameters; + +static int vl_method_register_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "name", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(RegisterParameters, name), JSON_MANDATORY }, + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(RegisterParameters, userns_fd_idx), JSON_MANDATORY }, + {} + }; + + struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata); + _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF; + _cleanup_free_ char *userns_name = NULL; + uid_t peer_uid; + struct stat userns_st; + RegisterParameters p = { + .userns_fd_idx = UINT_MAX, + }; + int r; + + assert(link); + 
assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + r = validate_name(link, p.name, &userns_name); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return userns_fd; + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + r = validate_userns_is_safe(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (!*bpf) { + r = userns_restrict_install(/* pin= */ true, bpf); + if (r < 0) + return r; + } + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + r = userns_registry_inode_exists(registry_dir_fd, userns_st.st_ino); + if (r < 0) + return r; + if (r > 0) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL); + + r = name_is_available(registry_dir_fd, userns_name); + if (r < 0) + return r; + if (r == 0) + return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL); + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new(); + if (!userns_info) + return -ENOMEM; + + userns_info->name = TAKE_PTR(userns_name); + if (!userns_info->name) + return -ENOMEM; + + userns_info->owner = peer_uid; + userns_info->userns_inode = userns_st.st_ino; + + r = userns_registry_store(registry_dir_fd, userns_info); + if (r < 0) + return log_debug_errno(r, "Failed to update userns registry: %m"); + + /* Register the userns in the BPF map with an empty allowlist */ + r = userns_restrict_put_by_fd( 
+ *bpf, + userns_fd, + /* replace= */ true, + /* mount_fds= */ NULL, + /* n_mount_fds= */ 0); + if (r < 0) + goto fail; + + /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */ + r = sd_pid_notifyf_with_fds( + /* pid= */ 0, + /* unset_environment= */ false, + &userns_fd, 1, + "FDSTORE=1\n" + "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode); + if (r < 0) + goto fail; + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); + +fail: + userns_registry_remove(registry_dir_fd, userns_info); + return r; +} + +typedef struct AddMountParameters { + unsigned userns_fd_idx; + unsigned mount_fd_idx; +} AddMountParameters; + +static int vl_method_add_mount_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch parameter_dispatch_table[] = { + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, userns_fd_idx), JSON_MANDATORY }, + { "mountFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, mount_fd_idx), JSON_MANDATORY }, + {} + }; + + _cleanup_close_ int userns_fd = -EBADF, mount_fd = -EBADF, registry_dir_fd = -EBADF; + struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata); + AddMountParameters p = { + .userns_fd_idx = UINT_MAX, + .mount_fd_idx = UINT_MAX, + }; + int r, mnt_id = 0; + struct stat userns_st; + uid_t peer_uid; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + /* Allowlisting arbitrary mounts is a privileged operation */ + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + if (peer_uid != 0) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return 
userns_fd; + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return -errno; + + mount_fd = varlink_take_fd(link, p.mount_fd_idx); + if (mount_fd < 0) + return mount_fd; + + r = fd_verify_safe_flags_full(mount_fd, O_PATH|O_DIRECTORY); + if (r < 0) + return log_debug_errno(r, "Mount file descriptor has unsafe flags set: %m"); + + r = fd_verify_directory(mount_fd); + if (r < 0) + return r; + + r = path_get_mnt_id_at(mount_fd, NULL, &mnt_id); + if (r < 0) + return r; + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + r = userns_registry_load_by_userns_inode( + registry_dir_fd, + userns_st.st_ino, + &userns_info); + if (r == -ENOENT) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return r; + + if (!*bpf) { + r = userns_restrict_install(/* pin= */ true, bpf); + if (r < 0) + return r; + } + + /* Pin the mount fd */ + r = sd_pid_notifyf_with_fds( + /* pid= */ 0, + /* unset_environment= */ false, + &mount_fd, 1, + "FDSTORE=1\n" + "FDNAME=userns-" INO_FMT "\n", userns_st.st_ino); + if (r < 0) + return r; + + /* Add this mount to the user namespace's BPF map allowlist entry. 
*/ + r = userns_restrict_put_by_fd( + *bpf, + userns_fd, + /* replace= */ false, + &mount_fd, + 1); + if (r < 0) + return r; + + if (userns_info->size > 0) + log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s' @ UID " UID_FMT ")", + mnt_id, userns_st.st_ino, userns_info->name, userns_info->start); + else + log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s')", + mnt_id, userns_st.st_ino, userns_info->name); + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); +} + +static int validate_cgroup(Varlink *link, int fd, uint64_t *ret_cgroup_id) { + int r; + + assert(link); + assert(fd >= 0); + assert(ret_cgroup_id); + + r = fd_verify_safe_flags_full(fd, O_DIRECTORY); + if (r < 0) + return log_debug_errno(r, "Control group file descriptor has unsafe flags set: %m"); + + r = fd_verify_directory(fd); + if (r < 0) + return log_debug_errno(r, "Verification that cgroup fd refers to directory failed: %m"); + + r = fd_is_fs_type(fd, CGROUP2_SUPER_MAGIC); + if (r < 0) + return log_debug_errno(r, "Failed to check if cgroup fd actually refers to cgroupfs: %m"); + if (r == 0) + return varlink_error_invalid_parameter_name(link, "controlGroupFileDescriptor"); + + r = cg_fd_get_cgroupid(fd, ret_cgroup_id); + if (r < 0) + return log_debug_errno(r, "Failed to read cgroup ID from cgroupfs: %m"); + + return 0; +} + +typedef struct AddCGroupParameters { + unsigned userns_fd_idx; + unsigned cgroup_fd_idx; +} AddCGroupParameters; + +static int vl_method_add_cgroup_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch parameter_dispatch_table[] = { + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, userns_fd_idx), JSON_MANDATORY }, + { "controlGroupFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, cgroup_fd_idx), JSON_MANDATORY }, + {} + }; + + _cleanup_close_ int 
userns_fd = -EBADF, cgroup_fd = -EBADF, registry_dir_fd = -EBADF; + AddCGroupParameters p = { + .userns_fd_idx = UINT_MAX, + .cgroup_fd_idx = UINT_MAX, + }; + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + struct stat userns_st, cgroup_st; + uid_t peer_uid; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m"); + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + + cgroup_fd = varlink_take_fd(link, p.cgroup_fd_idx); + if (cgroup_fd < 0) + return log_debug_errno(cgroup_fd, "Failed to take cgroup fd from Varlink connection: %m"); + + uint64_t cgroup_id; + r = validate_cgroup(link, cgroup_fd, &cgroup_id); + if (r != 0) + return r; + + if (fstat(cgroup_fd, &cgroup_st) < 0) + return log_debug_errno(errno, "Failed to fstat() cgroup fd: %m"); + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return lock_fd; + + r = userns_registry_load_by_userns_inode( + registry_dir_fd, + userns_st.st_ino, + &userns_info); + if (r == -ENOENT) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return r; + + /* The user namespace must have a user assigned */ + if (userns_info->size == 0) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceWithoutUserRange", NULL); + if (userns_info_has_cgroup(userns_info, cgroup_id)) + return varlink_error(link, 
"io.systemd.NamespaceResource.ControlGroupAlreadyAdded", NULL); + if (userns_info->n_cgroups > USER_NAMESPACE_CGROUPS_DELEGATE_MAX) + return varlink_error(link, "io.systemd.NamespaceResource.TooManyControlGroups", NULL); + + /* Registering a cgroup for this client is only allowed for the root or the owner of a userns */ + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return log_debug_errno(r, "Failed to get connection peer: %m"); + if (peer_uid != 0) { + if (peer_uid != userns_info->owner) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + /* The cgroup must be owned by the owner of the userns */ + if (cgroup_st.st_uid != userns_info->owner) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + } + + r = userns_info_add_cgroup(userns_info, cgroup_id); + if (r < 0) + return r; + + r = userns_registry_store(registry_dir_fd, userns_info); + if (r < 0) + return r; + + if (fchown(cgroup_fd, userns_info->start, userns_info->start) < 0) + return log_debug_errno(errno, "Failed to change ownership of cgroup: %m"); + + if (fchmod(cgroup_fd, 0755) < 0) + return log_debug_errno(errno, "Failed to change access mode of cgroup: %m"); + + FOREACH_STRING(attr, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") { + (void) fchmodat(cgroup_fd, attr, 0644, AT_SYMLINK_NOFOLLOW); + (void) fchownat(cgroup_fd, attr, userns_info->start, userns_info->start, AT_SYMLINK_NOFOLLOW); + } + + log_debug("Granting ownership to cgroup %" PRIu64 " to userns " INO_FMT " ('%s' @ UID " UID_FMT ")", + cgroup_id, userns_st.st_ino, userns_info->name, userns_info->start); + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); +} + +static uint64_t hash_ifname_id(UserNamespaceInfo *userns_info, const char *ifname) { + struct siphash state; + + assert(userns_info); + + siphash24_init(&state, (const uint8_t[]) { 0xc4, 0x6c, 0x96, 0xe8, 0xad, 0x37, 0x4d, 0x5f, 0xa1, 0xae, 0xfe, 0x70, 0x40, 0xed, 0x41, 0x5f }); + 
siphash24_compress_string(userns_info->name, &state); + siphash24_compress_byte(0, &state); /* separator */ + siphash24_compress_string(strempty(ifname), &state); + + return siphash24_finalize(&state); +} + +static void hash_ether_addr(UserNamespaceInfo *userns_info, const char *ifname, uint64_t n, struct ether_addr *ret) { + struct siphash state; + uint64_t h; + + assert(userns_info); + assert(ret); + + siphash24_init(&state, (const uint8_t[]) { 0x36, 0xaa, 0xd1, 0x69, 0xc7, 0xe5, 0x4c, 0xaa, 0x1e, 0xb2, 0x9e, 0xb3, 0x3a, 0x6b, 0xd4, 0x71 }); + siphash24_compress_string(userns_info->name, &state); + siphash24_compress_byte(0, &state); /* separator */ + siphash24_compress_string(strempty(ifname), &state); + siphash24_compress_byte(0, &state); /* separator */ + n = htole64(n); /* add the 'index' to the mix in an endianness-independent fashion */ + siphash24_compress(&n, sizeof(n), &state); + + h = htole64(siphash24_finalize(&state)); + + assert(sizeof(h) >= sizeof_field(struct ether_addr, ether_addr_octet)); + + memcpy(ret->ether_addr_octet, &h, sizeof_field(struct ether_addr, ether_addr_octet)); + ether_addr_mark_random(ret); +} + +static int create_veth( + int netns_fd, + const char *ifname_host, + const char *altifname_host, + struct ether_addr *mac_host, + const char *ifname_namespace, + struct ether_addr *mac_namespace) { + + int r; + + assert(netns_fd >= 0); + assert(ifname_host); + assert(mac_host); + assert(ifname_namespace); + assert(mac_namespace); + + log_debug("Creating veth link on host %s (%s) with address %s to container as %s with address %s", + ifname_host, strna(altifname_host), ETHER_ADDR_TO_STR(mac_host), + ifname_namespace, ETHER_ADDR_TO_STR(mac_namespace)); + + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + r = sd_netlink_open(&rtnl); + if (r < 0) + return r; + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to 
allocate netlink message: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container(m, VETH_INFO_PEER); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_namespace); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_namespace); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new veth interfaces (%s:%s): %m", ifname_host, ifname_namespace); + + r = rtnl_set_link_alternative_names_by_ifname(&rtnl, ifname_host, STRV_MAKE(altifname_host)); + if (r < 0) + log_warning_errno(r, "Failed to set alternative interface name 
to '%s', ignoring: %m", altifname_host); + + return 0; +} + +static int validate_netns(Varlink *link, int userns_fd, int netns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + assert(netns_fd >= 0); + + r = fd_verify_safe_flags(netns_fd); + if (r < 0) + return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m"); + + /* Validate this is actually a valid network namespace fd */ + r = fd_is_ns(netns_fd, CLONE_NEWNET); + if (r < 0) + return r; + if (r == 0) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + + /* And refuse the thing if it is our own */ + r = is_our_namespace(netns_fd, NAMESPACE_NET); + if (r < 0) + return r; + if (r > 0) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + + /* Check if the netns actually belongs to the userns */ + _cleanup_close_ int owner_userns_fd = -EBADF; + owner_userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (owner_userns_fd < 0) + return -errno; + + r = inode_same_at(owner_userns_fd, /* path_a= */ NULL, userns_fd, /* path_b= */ NULL, AT_EMPTY_PATH); + if (r < 0) + return r; + if (r == 0) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (peer_uid != 0) { + /* Refuse if the netns is not actually owned by our client. 
*/ + + uid_t owner_uid; + if (ioctl(owner_userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0) + return -errno; + + if (owner_uid != peer_uid) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + } + + return 0; +} + +typedef struct AddNetworkParameters { + unsigned userns_fd_idx; + unsigned netns_fd_idx; + const char *ifname; + const char *mode; +} AddNetworkParameters; + +static int vl_method_add_netif_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch parameter_dispatch_table[] = { + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddNetworkParameters, userns_fd_idx), JSON_MANDATORY }, + { "networkNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddNetworkParameters, netns_fd_idx), JSON_MANDATORY }, + { "namespaceInterfaceName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AddNetworkParameters, ifname), 0 }, + { "mode", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AddNetworkParameters, mode), JSON_MANDATORY }, + {} + }; + + _cleanup_close_ int userns_fd = -EBADF, netns_fd = -EBADF, registry_dir_fd = -EBADF; + AddNetworkParameters p = { + .userns_fd_idx = UINT_MAX, + }; + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + struct stat userns_st; + uid_t peer_uid; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return userns_fd; + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return -errno; + + netns_fd = varlink_take_fd(link, p.netns_fd_idx); + if (netns_fd < 0) + return netns_fd; + + r = validate_netns(link, userns_fd, netns_fd); + if 
(r != 0) + return r; + + if (!streq_ptr(p.mode, "veth")) + return varlink_error_invalid_parameter_name(link, "mode"); + + if (p.ifname && !ifname_valid(p.ifname)) + return varlink_error_invalid_parameter_name(link, "interfaceName"); + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + r = userns_registry_load_by_userns_inode( + registry_dir_fd, + userns_st.st_ino, + &userns_info); + if (r == -ENOENT) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return r; + + /* Registering a network interface for this client is only allowed for the root or the owner of a userns */ + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + if (peer_uid != 0 && peer_uid != userns_info->owner) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + _cleanup_free_ char *ifname_host = NULL, *altifname_host = NULL; + const char *ifname_namespace = p.ifname ?: "host0"; + + /* The short ifname is just too short to generate readable and unique names where unprivileged users + * can't take each other's names. Hence just hash it. The alternative name however contains more useful + * information. 
*/ + if (asprintf(&ifname_host, "ns-%08" PRIx64, hash_ifname_id(userns_info, p.ifname)) < 0) + return -ENOMEM; + strshorten(ifname_host, IFNAMSIZ-1); + + if (p.ifname) + r = asprintf(&altifname_host, "ns-" UID_FMT "-%s-%s", userns_info->owner, userns_info->name, p.ifname); + else + r = asprintf(&altifname_host, "ns-" UID_FMT "-%s", userns_info->owner, userns_info->name); + if (r < 0) + return -ENOMEM; + + struct ether_addr ether_addr_host, ether_addr_namespace; + + hash_ether_addr(userns_info, p.ifname, 0, &ether_addr_host); + hash_ether_addr(userns_info, p.ifname, 1, &ether_addr_namespace); + + r = create_veth(netns_fd, + ifname_host, altifname_host, &ether_addr_host, + ifname_namespace, &ether_addr_namespace); + if (r < 0) + return r; + + log_debug("Adding veth tunnel %s from host to userns " INO_FMT " ('%s' @ UID " UID_FMT ", interface %s).", + ifname_host, userns_st.st_ino, userns_info->name, userns_info->start, ifname_namespace); + + return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hostInterfaceName", JSON_BUILD_STRING(ifname_host)), + JSON_BUILD_PAIR("namespaceInterfaceName", JSON_BUILD_STRING(ifname_namespace)))); +} + +static int process_connection(VarlinkServer *server, int _fd) { + _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */ + _cleanup_(varlink_close_unrefp) Varlink *vl = NULL; + int r; + + r = varlink_server_add_connection(server, fd, &vl); + if (r < 0) + return log_error_errno(r, "Failed to add connection: %m"); + + TAKE_FD(fd); + vl = varlink_ref(vl); + + r = varlink_set_allow_fd_passing_input(vl, true); + if (r < 0) + return log_error_errno(r, "Failed to enable fd passing for read: %m"); + + r = varlink_set_allow_fd_passing_output(vl, true); + if (r < 0) + return log_error_errno(r, "Failed to enable fd passing for write: %m"); + + for (;;) { + r = varlink_process(vl); + if (r == -ENOTCONN) { + log_debug("Connection terminated."); + break; + } + if (r < 0) + return log_error_errno(r, "Failed to process connection: %m"); + if 
(r > 0) + continue; + + r = varlink_wait(vl, CONNECTION_IDLE_USEC); + if (r < 0) + return log_error_errno(r, "Failed to wait for connection events: %m"); + if (r == 0) + break; + } + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *bpf = NULL; + usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY; + _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL; + _cleanup_(pidref_done) PidRef parent = PIDREF_NULL; + unsigned n_iterations = 0; + int m, listen_fd, r; + + log_setup(); + + m = sd_listen_fds(false); + if (m < 0) + return log_error_errno(m, "Failed to determine number of listening fds: %m"); + if (m == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received."); + if (m > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time."); + + listen_fd = SD_LISTEN_FDS_START; + + r = fd_nonblock(listen_fd, false); + if (r < 0) + return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m"); + + r = varlink_server_new(&server, VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_error_errno(r, "Failed to allocate server: %m"); + + r = varlink_server_add_interface_many( + server, + &vl_interface_io_systemd_NamespaceResource, + &vl_interface_io_systemd_UserDatabase); + if (r < 0) + return log_error_errno(r, "Failed to add UserDatabase and NamespaceResource interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + server, + "io.systemd.NamespaceResource.AllocateUserRange", vl_method_allocate_user_range, + "io.systemd.NamespaceResource.RegisterUserNamespace", vl_method_register_user_namespace, + "io.systemd.NamespaceResource.AddMountToUserNamespace", vl_method_add_mount_to_user_namespace, + "io.systemd.NamespaceResource.AddControlGroupToUserNamespace", vl_method_add_cgroup_to_user_namespace, + 
"io.systemd.NamespaceResource.AddNetworkToUserNamespace", vl_method_add_netif_to_user_namespace, + "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, + "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships); + if (r < 0) + return log_error_errno(r, "Failed to bind methods: %m"); + + varlink_server_set_userdata(server, &bpf); + + r = getenv_bool("NSRESOURCE_FIXED_WORKER"); + if (r < 0) + return log_error_errno(r, "Failed to parse NSRESOURCE_FIXED_WORKER: %m"); + listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC; + + r = pidref_set_parent(&parent); + if (r < 0) + return log_error_errno(r, "Failed to acquire pidfd of parent process: %m"); + + start_time = now(CLOCK_MONOTONIC); + + for (;;) { + _cleanup_close_ int fd = -EBADF; + usec_t n; + + /* Exit the worker in regular intervals, to flush out all memory use */ + if (n_iterations++ > ITERATIONS_MAX) { + log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations); + break; + } + + n = now(CLOCK_MONOTONIC); + if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) { + log_debug("Exiting worker, ran for %s, that's enough.", + FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0)); + break; + } + + if (last_busy_usec == USEC_INFINITY) + last_busy_usec = n; + else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) { + log_debug("Exiting worker, been idle for %s.", + FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0)); + break; + } + + (void) rename_process("systemd-nsresourcework: waiting..."); + fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC)); + (void) rename_process("systemd-nsresourcework: processing..."); + + if (fd == -EAGAIN) + continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected + * after a while, let's check if it's time to exit though. 
*/ + if (fd == -EINTR) + continue; /* Might be that somebody attached via strace, let's just continue in that + * case */ + if (fd < 0) + return log_error_errno(fd, "Failed to accept() from listening socket: %m"); + + if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) { + /* We only slept a very short time? If so, let's see if there are more sockets + * pending, and if so, let's ask our parent for more workers */ + + r = fd_wait_for_event(listen_fd, POLLIN, 0); + if (r < 0) + return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m"); + + if (FLAGS_SET(r, POLLIN)) { + r = pidref_kill(&parent, SIGUSR2); + if (r == -ESRCH) + return log_error_errno(r, "Parent already died?"); + if (r < 0) + return log_error_errno(r, "Failed to send SIGUSR2 signal to parent. %m"); + } + } + + (void) process_connection(server, TAKE_FD(fd)); + last_busy_usec = USEC_INFINITY; + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/nsresourced/test-userns-restrict.c b/src/nsresourced/test-userns-restrict.c new file mode 100644 index 0000000..f509321 --- /dev/null +++ b/src/nsresourced/test-userns-restrict.c @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/eventfd.h> + +#include "fd-util.h" +#include "main-func.h" +#include "missing_mount.h" +#include "missing_syscall.h" +#include "namespace-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "tmpfile-util.h" +#include "userns-restrict.h" + +static int make_tmpfs_fsmount(void) { + _cleanup_close_ int fsfd = -EBADF, mntfd = -EBADF; + + fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + assert_se(fsfd >= 0); + assert_se(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) >= 0); + + mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + assert_se(mntfd >= 0); + + return TAKE_FD(mntfd); +} + +static void test_works_reg(int parent_fd, const char *fname) { + _cleanup_close_ int fd = -EBADF; + + fd = openat(parent_fd, fname, O_RDWR|O_CREAT|O_CLOEXEC, 0666); + assert_se(fd 
>= 0); +} + +static void test_fails_reg(int parent_fd, const char *fname) { + errno = 0; + assert_se(openat(parent_fd, fname, O_RDWR|O_CREAT|O_CLOEXEC, 0666) < 0); + assert_se(errno == EPERM); +} + +static void test_works_dir(int parent_fd, const char *fname) { + assert_se(mkdirat(parent_fd, fname, 0666) >= 0); +} + +static void test_fails_dir(int parent_fd, const char *fname) { + errno = 0; + assert_se(mkdirat(parent_fd, fname, 0666) < 0); + assert_se(errno == EPERM); +} + +static int run(int argc, char *argv[]) { + _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL; + _cleanup_close_ int userns_fd = -EBADF, host_fd1 = -EBADF, host_tmpfs = -EBADF, afd = -EBADF, bfd = -EBADF; + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_(sigkill_waitp) pid_t pid = 0; + int r; + + log_set_max_level(LOG_DEBUG); + log_setup(); + + r = userns_restrict_install(/* pin= */ false, &obj); + if (ERRNO_IS_NOT_SUPPORTED(r)) { + log_notice("Skipping test, LSM-BPF logic not supported."); + return EXIT_TEST_SKIP; + } + if (ERRNO_IS_PRIVILEGE(r)) { + log_notice("Skipping test, lacking privileges."); + return EXIT_TEST_SKIP; + } + if (r < 0) + return r; + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + + host_fd1 = open(t, O_DIRECTORY|O_CLOEXEC); + assert_se(host_fd1 >= 0); + + host_tmpfs = make_tmpfs_fsmount(); + assert_se(host_tmpfs >= 0); + + userns_fd = userns_acquire("0 0 1", "0 0 1"); + if (userns_fd < 0) + return log_error_errno(userns_fd, "Failed to make user namespace: %m"); + + r = userns_restrict_put_by_fd( + obj, + userns_fd, + /* replace= */ true, + /* mount_fds= */ NULL, + /* n_mount_fds= */ 0); + if (r < 0) + return log_error_errno(r, "Failed to restrict user namespace: %m"); + + afd = eventfd(0, EFD_CLOEXEC); + bfd = eventfd(0, EFD_CLOEXEC); + + assert_se(afd >= 0 && bfd >= 0); + + r = safe_fork("(test)", FORK_DEATHSIG_SIGKILL, &pid); + assert_se(r >= 0); + if (r == 0) { + _cleanup_close_ int private_tmpfs = -EBADF; + + 
assert_se(setns(userns_fd, CLONE_NEWUSER) >= 0); + assert_se(unshare(CLONE_NEWNS) >= 0); + + /* Allocate tmpfs locally */ + private_tmpfs = make_tmpfs_fsmount(); + + /* These two host mounts should be inaccessible */ + test_fails_reg(host_fd1, "test"); + test_fails_reg(host_tmpfs, "xxx"); + test_fails_dir(host_fd1, "test2"); + test_fails_dir(host_tmpfs, "xxx2"); + + /* But this mount created locally should be fine */ + test_works_reg(private_tmpfs, "yyy"); + test_works_dir(private_tmpfs, "yyy2"); + + /* Let's sync with the parent, so that it allowlists more stuff for us */ + assert_se(eventfd_write(afd, 1) >= 0); + uint64_t x; + assert_se(eventfd_read(bfd, &x) >= 0); + + /* And now we should also have access to the host tmpfs */ + test_works_reg(host_tmpfs, "zzz"); + test_works_reg(private_tmpfs, "aaa"); + test_works_dir(host_tmpfs, "zzz2"); + test_works_dir(private_tmpfs, "aaa2"); + + /* But this one should still fail */ + test_fails_reg(host_fd1, "bbb"); + test_fails_dir(host_fd1, "bbb2"); + + /* Sync again, to get more stuff allowlisted */ + assert_se(eventfd_write(afd, 1) >= 0); + assert_se(eventfd_read(bfd, &x) >= 0); + + /* Everything should now be allowed */ + test_works_reg(host_tmpfs, "ccc"); + test_works_reg(host_fd1, "ddd"); + test_works_reg(private_tmpfs, "eee"); + test_works_dir(host_tmpfs, "ccc2"); + test_works_reg(host_fd1, "ddd2"); + test_works_dir(private_tmpfs, "eee2"); + + _exit(EXIT_SUCCESS); + } + + uint64_t x; + assert_se(eventfd_read(afd, &x) >= 0); + + r = userns_restrict_put_by_fd( + obj, + userns_fd, + /* replace= */ false, + &host_tmpfs, + 1); + if (r < 0) + return log_error_errno(r, "Failed to loosen user namespace: %m"); + + assert_se(eventfd_write(bfd, 1) >= 0); + + assert_se(eventfd_read(afd, &x) >= 0); + + r = userns_restrict_put_by_fd( + obj, + userns_fd, + /* replace= */ false, + &host_fd1, + 1); + if (r < 0) + return log_error_errno(r, "Failed to loosen user namespace: %m"); + + assert_se(eventfd_write(bfd, 1) >= 0); + + 
assert_se(wait_for_terminate_and_check("(test)", pid, WAIT_LOG) >= 0); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c new file mode 100644 index 0000000..2cc1b1f --- /dev/null +++ b/src/nsresourced/userns-registry.c @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "chase.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "json.h" +#include "missing_magic.h" +#include "path-util.h" +#include "recurse-dir.h" +#include "rm-rf.h" +#include "user-util.h" +#include "userns-registry.h" + +int userns_registry_open_fd(void) { + int fd; + + fd = chase_and_open( + "/run/systemd/nsresource/registry", + /* root= */ NULL, + CHASE_MKDIR_0755, + O_CLOEXEC|O_DIRECTORY|O_CREAT, + /* ret_path= */ NULL); + if (fd < 0) + return log_debug_errno(fd, "Failed to open registry dir: %m"); + + return fd; +} + +int userns_registry_lock(int dir_fd) { + _cleanup_close_ int registry_fd = -EBADF, lock_fd = -EBADF; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + lock_fd = xopenat_lock_full(dir_fd, "lock", O_CREAT|O_RDWR|O_CLOEXEC, /* xopen_flags= */ 0, 0600, LOCK_BSD, LOCK_EX); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + return TAKE_FD(lock_fd); +} + +UserNamespaceInfo* userns_info_new(void) { + UserNamespaceInfo *info = new(UserNamespaceInfo, 1); + if (!info) + return NULL; + + *info = (UserNamespaceInfo) { + .owner = UID_INVALID, + .start = UID_INVALID, + .target = UID_INVALID, + }; + + return info; +} + +UserNamespaceInfo *userns_info_free(UserNamespaceInfo *userns) { + if (!userns) + return NULL; + + free(userns->cgroups); + free(userns->name); + + return mfree(userns); +} + +static int dispatch_cgroups_array(const char *name, JsonVariant *variant, JsonDispatchFlags 
flags, void *userdata) { + UserNamespaceInfo *info = ASSERT_PTR(userdata); + _cleanup_free_ uint64_t *cgroups = NULL; + size_t n_cgroups = 0; + + if (json_variant_is_null(variant)) { + info->cgroups = mfree(info->cgroups); + info->n_cgroups = 0; + return 0; + } + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name)); + + cgroups = new(uint64_t, json_variant_elements(variant)); + if (!cgroups) + return json_log_oom(variant, flags); + + JsonVariant *e; + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + bool found = false; + + if (!json_variant_is_unsigned(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a number."); + + FOREACH_ARRAY(cg, cgroups, n_cgroups) + if (*cg == json_variant_unsigned(e)) { + found = true; + break; + } + if (found) /* suppress duplicate */ + continue; + + cgroups[n_cgroups++] = json_variant_unsigned(e); + } + + assert(n_cgroups <= json_variant_elements(variant)); + + free_and_replace(info->cgroups, cgroups); + info->n_cgroups = n_cgroups; + + return 0; +} + +static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **ret) { + + static const JsonDispatch dispatch_table[] = { + { "owner", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserNamespaceInfo, owner), JSON_MANDATORY }, + { "name", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserNamespaceInfo, name), JSON_MANDATORY }, + { "userns", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(UserNamespaceInfo, userns_inode), JSON_MANDATORY }, + { "start", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserNamespaceInfo, start), 0 }, + { "size", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(UserNamespaceInfo, size), 0 }, + { "target", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserNamespaceInfo, target), 0 }, + { "cgroups", JSON_VARIANT_ARRAY, dispatch_cgroups_array, 0, 0 }, + {} + }; + + 
_cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_close_ int registry_fd = -EBADF; + int r; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + r = json_parse_file_at(NULL, dir_fd, fn, 0, &v, NULL, NULL); + if (r < 0) + return r; + + userns_info = userns_info_new(); + if (!userns_info) + return -ENOMEM; + + r = json_dispatch(v, dispatch_table, 0, userns_info); + if (r < 0) + return r; + + if (userns_info->userns_inode == 0) + return -EBADMSG; + if (userns_info->start == 0) + return -EBADMSG; + if (userns_info->size == 0) { + if (uid_is_valid(userns_info->start) || uid_is_valid(userns_info->target)) + return -EBADMSG; + } else { + if (!uid_is_valid(userns_info->start) || !uid_is_valid(userns_info->target)) + return -EBADMSG; + + if (userns_info->size > UINT32_MAX - userns_info->start || + userns_info->size > UINT32_MAX - userns_info->target) + return -EBADMSG; + } + + if (ret) + *ret = TAKE_PTR(userns_info); + return 0; +} + +int userns_registry_uid_exists(int dir_fd, uid_t start) { + _cleanup_free_ char *fn = NULL; + + assert(dir_fd >= 0); + + if (!uid_is_valid(start)) + return -ENOENT; + + if (start == 0) + return true; + + if (asprintf(&fn, "u" UID_FMT ".userns", start) < 0) + return -ENOMEM; + + if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + return errno == ENOENT ? false : -errno; + + return true; +} + +int userns_registry_name_exists(int dir_fd, const char *name) { + _cleanup_free_ char *fn = NULL; + + assert(dir_fd >= 0); + + if (!userns_name_is_valid(name)) + return -EINVAL; + + fn = strjoin("n", name, ".userns"); + if (!fn) + return -ENOMEM; + + if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + return errno == ENOENT ? 
false : -errno; + + return true; +} + +int userns_registry_inode_exists(int dir_fd, uint64_t inode) { + _cleanup_free_ char *fn = NULL; + + assert(dir_fd >= 0); + + if (inode <= 0) + return -EINVAL; + + if (asprintf(&fn, "i%" PRIu64 ".userns", inode) < 0) + return -ENOMEM; + + if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + return errno == ENOENT ? false : -errno; + + return true; +} + +int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret) { + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_close_ int registry_fd = -EBADF; + _cleanup_free_ char *fn = NULL; + int r; + + if (!uid_is_valid(start)) + return -ENOENT; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + if (asprintf(&fn, "u" UID_FMT ".userns", start) < 0) + return -ENOMEM; + + r = userns_registry_load(dir_fd, fn, &userns_info); + if (r < 0) + return r; + + if (userns_info->start != start) + return -EBADMSG; + + if (ret) + *ret = TAKE_PTR(userns_info); + + return 0; +} + +int userns_registry_load_by_userns_inode(int dir_fd, uint64_t inode, UserNamespaceInfo **ret) { + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_close_ int registry_fd = -EBADF; + _cleanup_free_ char *fn = NULL; + int r; + + if (inode == 0) + return -ENOENT; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + if (asprintf(&fn, "i%" PRIu64 ".userns", inode) < 0) + return -ENOMEM; + + r = userns_registry_load(dir_fd, fn, &userns_info); + if (r < 0) + return r; + + if (userns_info->userns_inode != inode) + return -EBADMSG; + + if (ret) + *ret = TAKE_PTR(userns_info); + + return 0; +} + +int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret) { + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + 
_cleanup_close_ int registry_fd = -EBADF; + _cleanup_free_ char *fn = NULL; + int r; + + assert(name); + + if (!userns_name_is_valid(name)) /* Invalid names never exist */ + return -ENOENT; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + fn = strjoin("n", name, ".userns"); + if (!fn) + return -ENOMEM; + + r = userns_registry_load(dir_fd, fn, &userns_info); + if (r < 0) + return r; + + if (!streq_ptr(userns_info->name, name)) + return -EBADMSG; + + if (ret) + *ret = TAKE_PTR(userns_info); + + return 0; +} + +int userns_registry_store(int dir_fd, UserNamespaceInfo *info) { + _cleanup_close_ int registry_fd = -EBADF; + int r; + + assert(info); + + if (!uid_is_valid(info->owner) || + !info->name || + info->userns_inode == 0) + return -EINVAL; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + _cleanup_(json_variant_unrefp) JsonVariant *cgroup_array = NULL; + FOREACH_ARRAY(cg, info->cgroups, info->n_cgroups) { + r = json_variant_append_arrayb( + &cgroup_array, + JSON_BUILD_UNSIGNED(*cg)); + if (r < 0) + return r; + } + + _cleanup_(json_variant_unrefp) JsonVariant *def = NULL; + r = json_build(&def, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("owner", JSON_BUILD_UNSIGNED(info->owner)), + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(info->name)), + JSON_BUILD_PAIR("userns", JSON_BUILD_UNSIGNED(info->userns_inode)), + JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "start", JSON_BUILD_UNSIGNED(info->start)), + JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "size", JSON_BUILD_UNSIGNED(info->size)), + JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "target", JSON_BUILD_UNSIGNED(info->target)), + JSON_BUILD_PAIR_CONDITION(cgroup_array, "cgroups", JSON_BUILD_VARIANT(cgroup_array)))); + if (r < 0) + return r; + + _cleanup_free_ char *def_buf = NULL; + r = json_variant_format(def, 0, 
&def_buf); + if (r < 0) + return log_debug_errno(r, "Failed to format userns JSON object: %m"); + + _cleanup_free_ char *reg_fn = NULL, *link1_fn = NULL, *link2_fn = NULL, *owner_fn = NULL, *uid_fn = NULL; + if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0) + return log_oom_debug(); + + r = write_string_file_at(dir_fd, reg_fn, def_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC); + if (r < 0) + return log_debug_errno(r, "Failed to write userns data to '%s' in registry: %m", reg_fn); + + link1_fn = strjoin("n", info->name, ".userns"); + if (!link1_fn) { + r = log_oom_debug(); + goto fail; + } + + r = linkat_replace(dir_fd, reg_fn, dir_fd, link1_fn); + if (r < 0) { + log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", link1_fn); + goto fail; + } + + if (uid_is_valid(info->start)) { + if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start) < 0) { + r = log_oom_debug(); + goto fail; + } + + r = linkat_replace(dir_fd, reg_fn, dir_fd, link2_fn); + if (r < 0) { + log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", link2_fn); + goto fail; + } + } + + if (asprintf(&uid_fn, "o" UID_FMT ".owns", info->owner) < 0) { + r = log_oom_debug(); + goto fail; + } + + if (mkdirat(dir_fd, uid_fn, 0755) < 0 && errno != EEXIST) { + r = log_debug_errno(errno, "Failed to create per-UID subdir '%s' of registry: %m", uid_fn); + goto fail; + } + + if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0) { + r = log_oom_debug(); + goto fail; + } + + r = linkat_replace(dir_fd, reg_fn, dir_fd, owner_fn); + if (r < 0) { + log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", owner_fn); + goto fail; + } + + return 0; + +fail: + if (reg_fn) + (void) unlinkat(dir_fd, reg_fn, /* flags= */ 0); + if (link1_fn) + (void) unlinkat(dir_fd, link1_fn, /* flags= */ 0); + if (link2_fn) + (void) unlinkat(dir_fd, link2_fn, /* flags= */ 0); + if (owner_fn) + (void) unlinkat(dir_fd, owner_fn, /* flags= 
*/ 0); + if (uid_fn) + (void) unlinkat(dir_fd, uid_fn, AT_REMOVEDIR); + + return r; +} + +int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) { + _cleanup_close_ int registry_fd = -EBADF; + int ret = 0, r; + + assert(info); + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + _cleanup_free_ char *reg_fn = NULL; + if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0) + return log_oom_debug(); + + ret = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0)); + + _cleanup_free_ char *link1_fn = NULL; + link1_fn = strjoin("n", info->name, ".userns"); + if (!link1_fn) + return log_oom_debug(); + + RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link1_fn, 0))); + + if (uid_is_valid(info->start)) { + _cleanup_free_ char *link2_fn = NULL; + + if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start) < 0) + return log_oom_debug(); + + RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link2_fn, 0))); + } + + _cleanup_free_ char *uid_fn = NULL; + if (asprintf(&uid_fn, "o" UID_FMT ".owns", info->owner) < 0) + return log_oom_debug(); + + _cleanup_free_ char *owner_fn = NULL; + if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0) + return log_oom_debug(); + + RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, owner_fn, 0))); + + r = RET_NERRNO(unlinkat(dir_fd, uid_fn, AT_REMOVEDIR)); + if (r != -ENOTEMPTY) + RET_GATHER(ret, r); + + return ret; +} + +bool userns_info_has_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id) { + assert(userns); + + FOREACH_ARRAY(i, userns->cgroups, userns->n_cgroups) + if (*i == cgroup_id) + return true; + + return false; +} + +int userns_info_add_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id) { + + if (userns_info_has_cgroup(userns, cgroup_id)) + return 0; + + if (!GREEDY_REALLOC(userns->cgroups, userns->n_cgroups+1)) + return -ENOMEM; + + userns->cgroups[userns->n_cgroups++] = cgroup_id; + return 1; +} + +static int 
userns_destroy_cgroup(uint64_t cgroup_id) { + _cleanup_close_ int cgroup_fd = -EBADF, parent_fd = -EBADF; + int r; + + cgroup_fd = cg_cgroupid_open(/* cgroupfsfd= */ -EBADF, cgroup_id); + if (cgroup_fd == -ESTALE) { + log_debug_errno(cgroup_fd, "Control group %" PRIu64 " already gone, ignoring: %m", cgroup_id); + return 0; + } + if (cgroup_fd < 0) + return log_debug_errno(errno, "Failed to open cgroup %" PRIu64 ", ignoring: %m", cgroup_id); + + _cleanup_free_ char *path = NULL; + r = fd_get_path(cgroup_fd, &path); + if (r < 0) + return log_debug_errno(r, "Failed to get path of cgroup %" PRIu64 ", ignoring: %m", cgroup_id); + + const char *e = path_startswith(path, "/sys/fs/cgroup/"); + if (!e) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Got cgroup path that doesn't start with /sys/fs/cgroup/, refusing: %s", path); + if (isempty(e)) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Got root cgroup path, which can't be right, refusing."); + + log_debug("Path of cgroup %" PRIu64 " is: %s", cgroup_id, path); + + _cleanup_free_ char *fname = NULL; + r = path_extract_filename(path, &fname); + if (r < 0) + return log_debug_errno(r, "Failed to extract name of cgroup %" PRIu64 ", ignoring: %m", cgroup_id); + + parent_fd = openat(cgroup_fd, "..", O_CLOEXEC|O_DIRECTORY); + if (parent_fd < 0) + return log_debug_errno(errno, "Failed to open parent cgroup of %" PRIu64 ", ignoring: %m", cgroup_id); + + /* Safety check, never leave cgroupfs */ + r = fd_is_fs_type(parent_fd, CGROUP2_SUPER_MAGIC); + if (r < 0) + return log_debug_errno(r, "Failed to determine if parent directory of cgroup %" PRIu64 " is still a cgroup, ignoring: %m", cgroup_id); + if (!r) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Parent directory of cgroup %" PRIu64 " is not a cgroup, refusing.", cgroup_id); + + cgroup_fd = safe_close(cgroup_fd); + + r = rm_rf_child(parent_fd, fname, REMOVE_ONLY_DIRECTORIES|REMOVE_PHYSICAL|REMOVE_CHMOD); + if (r < 0) + log_debug_errno(r, "Failed to remove delegated 
cgroup %" PRIu64 ", ignoring: %m", cgroup_id); + + return 0; +} + +int userns_info_remove_cgroups(UserNamespaceInfo *userns) { + int ret = 0; + + assert(userns); + + FOREACH_ARRAY(c, userns->cgroups, userns->n_cgroups) + RET_GATHER(ret, userns_destroy_cgroup(*c)); + + userns->cgroups = mfree(userns->cgroups); + userns->n_cgroups = 0; + + return ret; +} + +bool userns_name_is_valid(const char *name) { + + /* Checks if the specified string is suitable as user namespace name. */ + + if (strlen(name) > NAME_MAX) /* before we use alloca(), let's check for size */ + return false; + + const char *f = strjoina("n", name, ".userns"); /* Make sure we can name our lookup symlink with this name */ + if (!filename_is_valid(f)) + return false; + + const char *u = strjoina("ns-", name, "-65535"); /* Make sure we can turn this into valid user names */ + if (!valid_user_group_name(u, 0)) + return false; + + return true; +} + +int userns_registry_per_uid(int dir_fd, uid_t owner) { + _cleanup_close_ int registry_fd = -EBADF; + int n = 0, r; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + _cleanup_free_ char *uid_fn = NULL; + if (asprintf(&uid_fn, "o" UID_FMT ".owns", owner) < 0) + return log_oom_debug(); + + _cleanup_free_ DirectoryEntries *de = NULL; + + r = readdir_all_at(dir_fd, uid_fn, RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE, &de); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to enumerate contents of '%s' sub-directory: %m", uid_fn); + + FOREACH_ARRAY(i, de->entries, de->n_entries) { + struct dirent *e = *i; + + if (e->d_type != DT_REG) + continue; + + if (!startswith(e->d_name, "i") || !endswith(e->d_name, ".userns")) + continue; + + n++; + + if (n == INT_MAX) /* overflow safety check, just in case */ + break; + } + + return n; +} diff --git a/src/nsresourced/userns-registry.h b/src/nsresourced/userns-registry.h new file mode 100644 index 
0000000..9e66a6f --- /dev/null +++ b/src/nsresourced/userns-registry.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16 + +typedef struct UserNamespaceInfo { + uid_t owner; + char *name; + uint64_t userns_inode; + uid_t start; + uint32_t size; + uid_t target; + uint64_t *cgroups; + size_t n_cgroups; +} UserNamespaceInfo; + +UserNamespaceInfo* userns_info_new(void); +UserNamespaceInfo* userns_info_free(UserNamespaceInfo *userns); + +DEFINE_TRIVIAL_CLEANUP_FUNC(UserNamespaceInfo*, userns_info_free); + +bool userns_info_has_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id); +int userns_info_add_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id); +int userns_info_remove_cgroups(UserNamespaceInfo *userns); + +bool userns_name_is_valid(const char *name); + +int userns_registry_open_fd(void); +int userns_registry_lock(int dir_fd); + +int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret); +int userns_registry_load_by_userns_inode(int dir_fd, uint64_t userns, UserNamespaceInfo **ret); +int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret); + +int userns_registry_store(int dir_fd, UserNamespaceInfo *info); +int userns_registry_remove(int dir_fd, UserNamespaceInfo *info); + +int userns_registry_inode_exists(int dir_fd, uint64_t inode); +int userns_registry_name_exists(int dir_fd, const char *name); +int userns_registry_uid_exists(int dir_fd, uid_t start); + +int userns_registry_per_uid(int dir_fd, uid_t owner); diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c new file mode 100644 index 0000000..be33f49 --- /dev/null +++ b/src/nsresourced/userns-restrict.c @@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "userns-restrict.h" + +#if HAVE_VMLINUX_H + +#include <sched.h> + +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "fd-util.h" +#include "fs-util.h" 
+#include "lsm-util.h" +#include "missing_mount.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "path-util.h" + +#define USERNS_MAX (16U*1024U) +#define MOUNTS_MAX 4096U + +#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs" +#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps" + +struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) { + (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */ + return NULL; +} + +static int make_inner_hash_map(void) { + int fd; + + fd = compat_bpf_map_create( + BPF_MAP_TYPE_HASH, + NULL, + sizeof(int), + sizeof(uint32_t), + MOUNTS_MAX, + NULL); + if (fd < 0) + return log_debug_errno(errno, "Failed allocate inner BPF map: %m"); + + return fd; +} + +int userns_restrict_install( + bool pin, + struct userns_restrict_bpf **ret) { + + _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL; + _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF; + int r; + + r = lsm_supported("bpf"); + if (r < 0) + return r; + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace."); + + r = dlopen_bpf(); + if (r < 0) + return r; + + /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */ + if (!sym_bpf_object__next_map) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace."); + + obj = userns_restrict_bpf__open(); + if (!obj) + return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m"); + + if (pin) { + struct bpf_map *map; + + /* libbpf will only create one level of dirs. 
Let's create the rest */ + (void) mkdir_p(MAP_LINK_PREFIX, 0755); + (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755); + + map = sym_bpf_object__next_map(obj->obj, NULL); + while (map) { + _cleanup_free_ char *fn = NULL; + + fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map)); + if (!fn) + return log_oom(); + + r = sym_bpf_map__set_pin_path(map, fn); + if (r < 0) + return log_error_errno(r, "Failed to set pin path to '%s': %m", fn); + + map = sym_bpf_object__next_map(obj->obj, map); + } + } + + r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX); + if (r < 0) + return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m"); + + r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int)); + if (r < 0) + return log_error_errno(r, "Failed to size userns ring buffer: %m"); + + /* Dummy map to satisfy the verifier */ + dummy_mnt_id_hash_fd = make_inner_hash_map(); + if (dummy_mnt_id_hash_fd < 0) + return dummy_mnt_id_hash_fd; + + r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd); + if (r < 0) + return log_error_errno(r, "Failed to set inner BPF map: %m"); + + r = userns_restrict_bpf__load(obj); + if (r < 0) + return log_error_errno(r, "Failed to load BPF object: %m"); + + for (int i = 0; i < obj->skeleton->prog_cnt; i++) { + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + struct bpf_prog_skeleton *ps = obj->skeleton->progs + i; + _cleanup_free_ char *fn = NULL; + bool linked = false; + const char *e; + + e = startswith(ps->name, "userns_restrict_"); + assert(e); + + if (pin) { + fn = path_join(PROGRAM_LINK_PREFIX, e); + if (!fn) + return log_oom(); + + link = sym_bpf_link__open(fn); + r = bpf_get_error_translated(link); + if (r < 0) { + if (r != -ENOENT) + return log_error_errno(r, "Unable to open pinned program link: %m"); + link = NULL; + } else { + linked = true; + log_info("userns-restrict BPF-LSM program %s already attached.", ps->name); + } + } + + if 
(!link) { + link = sym_bpf_program__attach(*ps->prog); + r = bpf_get_error_translated(link); + if (r < 0) + return log_error_errno(r, "Failed to attach LSM BPF program: %m"); + + log_info("userns-restrict BPF-LSM program %s now attached.", ps->name); + } + + if (pin && !linked) { + assert(fn); + + r = sym_bpf_link__pin(link, fn); + if (r < 0) + return log_error_errno(r, "Failed to pin LSM attachment: %m"); + } + + *ps->link = TAKE_PTR(link); + } + + if (pin) { + r = sym_bpf_object__pin_maps(obj->obj, NULL); + if (r < 0) + return log_error_errno(r, "Failed to pin BPF maps: %m"); + } + + if (ret) + *ret = TAKE_PTR(obj); + + return 0; +} + +int userns_restrict_put_by_inode( + struct userns_restrict_bpf *obj, + uint64_t userns_inode, + bool replace, + const int mount_fds[], + size_t n_mount_fds) { + + _cleanup_close_ int inner_map_fd = -EBADF; + _cleanup_free_ int *mnt_ids = NULL; + uint64_t ino = userns_inode; + int r, outer_map_fd; + + assert(obj); + assert(userns_inode != 0); + assert(n_mount_fds == 0 || mount_fds); + + /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode + * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit + * inode let's refuse early, we can't support this with the current BPF code for now. 
*/ + if (userns_inode > UINT32_MAX) + return -EINVAL; + + mnt_ids = new(int, n_mount_fds); + if (!mnt_ids) + return -ENOMEM; + + for (size_t i = 0; i < n_mount_fds; i++) { + r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i); + if (r < 0) + return log_debug_errno(r, "Failed to get mount ID: %m"); + } + + outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash); + if (outer_map_fd < 0) + return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m"); + + if (replace) { + /* Add if missing, replace if already exists */ + inner_map_fd = make_inner_hash_map(); + if (inner_map_fd < 0) + return inner_map_fd; + + r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_ANY); + if (r < 0) + return log_debug_errno(r, "Failed to replace map in inode hash: %m"); + } else { + /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We + * might race against each other, hence we try a couple of times */ + for (size_t n_try = 10;; n_try--) { + uint32_t innermap_id; + + if (n_try == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EEXIST), + "Stillcan't create inode entry in BPF map after 10 tries."); + + r = sym_bpf_map_lookup_elem(outer_map_fd, &ino, &innermap_id); + if (r >= 0) { + inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id); + if (inner_map_fd < 0) + return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m"); + + break; + } + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to look up inode hash entry: %m"); + + /* No entry for this user namespace yet. 
Let's create one */ + inner_map_fd = make_inner_hash_map(); + if (inner_map_fd < 0) + return inner_map_fd; + + r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_NOEXIST); + if (r >= 0) + break; + if (errno != EEXIST) + return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m"); + } + } + + FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) { + uint32_t dummy_value = 1; + + r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY); + if (r < 0) + return log_debug_errno(r, "Failed to add mount ID to map: %m"); + + log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, ino); + } + + return 0; +} + +int userns_restrict_put_by_fd( + struct userns_restrict_bpf *obj, + int userns_fd, + bool replace, + const int mount_fds[], + size_t n_mount_fds) { + + struct stat st; + int r; + + assert(obj); + assert(userns_fd >= 0); + assert(n_mount_fds == 0 || mount_fds); + + r = fd_is_ns(userns_fd, CLONE_NEWUSER); + if (r < 0) + return log_debug_errno(r, "Failed to determine if file descriptor is user namespace: %m"); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd."); + + if (fstat(userns_fd, &st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace: %m"); + + return userns_restrict_put_by_inode( + obj, + st.st_ino, + replace, + mount_fds, + n_mount_fds); +} + +int userns_restrict_reset_by_inode( + struct userns_restrict_bpf *obj, + uint64_t ino) { + + int r, outer_map_fd; + unsigned u; + + assert(obj); + assert(ino != 0); + + if (ino > UINT32_MAX) /* inodes larger than 32bit are definitely not included in our map, exit early */ + return 0; + + outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash); + if (outer_map_fd < 0) + return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m"); + + u = (uint32_t) ino; + + r = sym_bpf_map_delete_elem(outer_map_fd, &u); + if (r < 0) + return log_debug_errno(r, "Failed to remove entry for 
inode %" PRIu64 " from outer map: %m", ino); + + return 0; +} + +#else +int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); +} + +struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) { + return NULL; +} + +int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); +} + +int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); +} + +int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled."); +} +#endif diff --git a/src/nsresourced/userns-restrict.h b/src/nsresourced/userns-restrict.h new file mode 100644 index 0000000..37aed7b --- /dev/null +++ b/src/nsresourced/userns-restrict.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "macro.h" + +#if HAVE_VMLINUX_H +#include "bpf/userns_restrict/userns-restrict-skel.h" +#else +struct userns_restrict_bpf; +#endif + +int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret); +struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj); + +int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds); +int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds); + +int 
userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode); + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct userns_restrict_bpf*, userns_restrict_bpf_free); |