diff options
Diffstat (limited to '')
-rw-r--r-- | src/nsresourced/nsresourcework.c | 1782 |
1 files changed, 1782 insertions, 0 deletions
diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c new file mode 100644 index 0000000..6bd2fed --- /dev/null +++ b/src/nsresourced/nsresourcework.c @@ -0,0 +1,1782 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <linux/nsfs.h> +#include <linux/veth.h> +#include <sys/eventfd.h> +#include <sys/stat.h> +#include <sys/wait.h> + +#include "sd-daemon.h" +#include "sd-netlink.h" + +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "group-record.h" +#include "io-util.h" +#include "lock-util.h" +#include "main-func.h" +#include "missing_magic.h" +#include "missing_mount.h" +#include "missing_syscall.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "netlink-util.h" +#include "process-util.h" +#include "random-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "strv.h" +#include "time-util.h" +#include "uid-classification.h" +#include "uid-range.h" +#include "user-record-nss.h" +#include "user-record.h" +#include "user-util.h" +#include "userdb.h" +#include "userns-registry.h" +#include "userns-restrict.h" +#include "varlink-io.systemd.NamespaceResource.h" +#include "varlink-io.systemd.UserDatabase.h" +#include "varlink.h" + +#define ITERATIONS_MAX 64U +#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE) +#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC) +#define CONNECTION_IDLE_USEC (15 * USEC_PER_SEC) +#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC) +#define USERNS_PER_UID 256 + +typedef struct LookupParameters { + const char *user_name; + const char *group_name; + union { + uid_t uid; + gid_t gid; + }; + const char *service; +} LookupParameters; + +static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, JsonVariant **ret) { + _cleanup_free_ char *name = NULL, *realname = NULL; + UserDisposition disposition; + int r; + + assert(userns_info); + assert(offset < userns_info->size); + + if (asprintf(&name, "ns-%s-" UID_FMT, userns_info->name, offset) < 0) + return -ENOMEM; + + if (userns_info->size > 1) { + disposition = USER_CONTAINER; + r = asprintf(&realname, "User " UID_FMT " of Allocated Namespace %s", offset, userns_info->name); + } else { + disposition = USER_DYNAMIC; + r = asprintf(&realname, "Allocated Namespace %s", userns_info->name); + } + if (r < 0) + return -ENOMEM; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(userns_info->start + offset)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)), + JSON_BUILD_PAIR("realName", JSON_BUILD_STRING(realname)), + JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")), + JSON_BUILD_PAIR("shell", JSON_BUILD_STRING(NOLOGIN)), + JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition))))); +} + +static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 }, + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .uid = UID_INVALID, + }; + uid_t offset; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.NamespaceResource")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (p.user_name) { + _cleanup_free_ char *n = NULL; + const char *e, *f; + + e = startswith(p.user_name, "ns-"); + if (!e) + goto not_found; + + f = strrchr(e, '-'); + if (!f) + goto not_found; + + if (parse_uid(f+1, &offset) < 0) + goto not_found; + + n = strndup(e, f - e); + if (!n) + return log_oom(); + + r = userns_registry_load_by_name( + /* registry_fd= */ -EBADF, + n, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? */ + goto not_found; + + if (uid_is_valid(p.uid) && p.uid != userns_info->start + offset) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + } else if (uid_is_valid(p.uid)) { + uid_t start, uidmask; + + if (uid_is_container(p.uid)) + uidmask = (uid_t) UINT32_C(0xFFFF0000); + else if (uid_is_dynamic(p.uid)) + uidmask = (uid_t) UINT32_C(0xFFFFFFFF); + else + goto not_found; + + start = p.uid & uidmask; + offset = p.uid - start; + + r = userns_registry_load_by_start_uid( + /* registry_fd= */ -EBADF, + start, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? */ + goto not_found; + } else + return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL); + + r = build_user_json(userns_info, offset, &v); + if (r < 0) + return r; + + return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v)))); + +not_found: + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static int build_group_json(UserNamespaceInfo *userns_info, gid_t offset, JsonVariant **ret) { + _cleanup_free_ char *name = NULL, *description = NULL; + UserDisposition disposition; + int r; + + assert(userns_info); + assert(offset < userns_info->size); + + if (asprintf(&name, "ns-%s-" GID_FMT, userns_info->name, offset) < 0) + return -ENOMEM; + + if (userns_info->size > 1) { + disposition = USER_CONTAINER; + r = asprintf(&description, "Group " GID_FMT " of Allocated Namespace %s", offset, userns_info->name); + } else { + disposition = USER_DYNAMIC; + r = asprintf(&description, "Allocated Namespace %s", userns_info->name); + } + if (r < 0) + return -ENOMEM; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(name)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(userns_info->start + offset)), + JSON_BUILD_PAIR("description", JSON_BUILD_STRING(description)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition))))); +} + +static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .gid = GID_INVALID, + }; + gid_t offset; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.NamespaceResource")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (p.group_name) { + _cleanup_free_ char *n = NULL; + const char *e, *f; + + e = startswith(p.group_name, "ns-"); + if (!e) + goto not_found; + + f = strrchr(e, '-'); + if (!f) + goto not_found; + + if (parse_gid(f+1, &offset) < 0) + goto not_found; + + n = strndup(e, f - e); + if (!n) + return log_oom(); + + r = userns_registry_load_by_name( + /* registry_fd= */ -EBADF, + n, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? */ + goto not_found; + + if (gid_is_valid(p.gid) && p.uid != userns_info->start + offset) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + } else if (gid_is_valid(p.gid)) { + gid_t start, gidmask; + + if (gid_is_container(p.gid)) + gidmask = (gid_t) UINT32_C(0xFFFF0000); + else if (gid_is_dynamic(p.gid)) + gidmask = (gid_t) UINT32_C(0xFFFFFFFF); + else + goto not_found; + + start = p.gid & gidmask; + offset = p.gid - start; + + r = userns_registry_load_by_start_uid( + /* registry_fd= */ -EBADF, + (uid_t) start, + &userns_info); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + if (offset >= userns_info->size) /* Outside of range? */ + goto not_found; + } else + return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL); + + r = build_group_json(userns_info, offset, &v); + if (r < 0) + return r; + + return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v)))); + +not_found: + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + LookupParameters p = {}; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.NamespaceResource")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + /* We don't support auxiliary groups for namespace allocations */ + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static int uid_is_available( + int registry_dir_fd, + uid_t candidate) { + + int r; + + assert(registry_dir_fd >= 0); + + log_debug("Checking if UID " UID_FMT " is available.", candidate); + + r = userns_registry_uid_exists(registry_dir_fd, candidate); + if (r < 0) + return r; + if (r > 0) + return false; + + r = userdb_by_uid(candidate, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + r = groupdb_by_gid(candidate, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + log_debug("UID " UID_FMT " is available.", candidate); + + return true; +} + +static int name_is_available( + int registry_dir_fd, + const char *name) { + + _cleanup_free_ char *user_name = NULL; + int r; + + assert(registry_dir_fd >= 0); + assert(name); + + r = userns_registry_name_exists(registry_dir_fd, name); + if (r < 0) + return r; + if (r > 0) + return false; + + user_name = strjoin("ns-", name, "-0"); + if (!user_name) + return -ENOMEM; + + r = userdb_by_name(user_name, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + r = groupdb_by_name(user_name, USERDB_AVOID_MULTIPLEXER, NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + log_debug("Namespace name '%s' is available.", name); + + return true; +} + +static int allocate_now( + int registry_dir_fd, + UserNamespaceInfo *info, + int *ret_lock_fd) { + + static const uint8_t hash_key[16] = { + 0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd, + 0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee, + }; + + _cleanup_(uid_range_freep) UIDRange *valid_range = NULL; + uid_t candidate, uidmin, uidmax, uidmask; + unsigned n_tries = 100; + int r; + + /* Returns the following error codes: + * + * EBUSY → all UID candidates we checked are already taken + * EEXIST → the name for the userns already exists + * EDEADLK → the userns is already registered in the registry + */ + + assert(registry_dir_fd >= 0); + assert(info); + + switch (info->size) { + + case 0x10000U: + uidmin = CONTAINER_UID_BASE_MIN; + uidmax = CONTAINER_UID_BASE_MAX; + uidmask = (uid_t) UINT32_C(0xFFFF0000); + break; + + case 1U: + uidmin = DYNAMIC_UID_MIN; + uidmax = DYNAMIC_UID_MAX; + uidmask = (uid_t) UINT32_C(0xFFFFFFFF); + break; + + default: + assert_not_reached(); + } + + r = uid_range_load_userns(/* path= */ NULL, UID_RANGE_USERNS_INSIDE, &valid_range); + if (r < 0) + return r; + + /* Check early whether we have any chance at all given our own uid range */ + if (!uid_range_overlaps(valid_range, uidmin, uidmax)) + return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate."); + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + /* Enforce limit on user namespaces per UID */ + r = userns_registry_per_uid(registry_dir_fd, info->owner); + if (r < 0) + return log_debug_errno(r, "Failed to determine number of currently registered user namespaces per UID " UID_FMT ": %m", info->owner); + if (r >= USERNS_PER_UID) + return log_debug_errno(SYNTHETIC_ERRNO(EUSERS), "User already registered %i user namespaces, refusing.", r); + + r = userns_registry_inode_exists(registry_dir_fd, info->userns_inode); + if (r < 0) + return r; + if (r > 0) + return -EDEADLK; + + r = name_is_available(registry_dir_fd, info->name); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */ + candidate = random_u32()) { /* Use random values afterwards */ + + if (--n_tries <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available."); + + candidate = (candidate % (uidmax - uidmin)) + uidmin; + candidate &= uidmask; + + if (!uid_range_covers(valid_range, candidate, info->size)) + continue; + + /* We only check the base UID for each range (!) */ + r = uid_is_available(registry_dir_fd, candidate); + if (r < 0) + return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate); + if (r > 0) { + info->start = candidate; + + log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1); + + if (ret_lock_fd) + *ret_lock_fd = TAKE_FD(lock_fd); + + return 0; + } + + log_debug("UID range " UID_FMT " already taken.", candidate); + } +} + +static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) { + _cleanup_(sigkill_waitp) pid_t pid = 0; + _cleanup_close_ int efd = -EBADF; + uint64_t u; + int r; + + assert(usernsfd >= 0); + assert(userns_info); + assert(uid_is_valid(userns_info->target)); + assert(uid_is_valid(userns_info->start)); + assert(userns_info->size > 0); + assert(userns_info->size <= UINT32_MAX - userns_info->start); + + efd = eventfd(0, EFD_CLOEXEC); + if (efd < 0) + return log_error_errno(errno, "Failed to allocate eventfd(): %m"); + + r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* child */ + + if (setns(usernsfd, CLONE_NEWUSER) < 0) { + log_error_errno(errno, "Failed to join user namespace: %m"); + goto child_fail; + } + + if (eventfd_write(efd, 1) < 0) { + log_error_errno(errno, "Failed to ping event fd: %m"); + goto child_fail; + } + + freeze(); + + child_fail: + _exit(EXIT_FAILURE); + } + + /* Wait until child joined the user namespace */ + if (eventfd_read(efd, &u) < 0) + return log_error_errno(errno, "Failed to wait for event fd: %m"); + + /* Now write mapping */ + + _cleanup_free_ char *pmap = NULL; + + if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pid) < 0) + return log_oom(); + + r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " " UID_FMT "\n", userns_info->target, userns_info->start, userns_info->size); + if (r < 0) + return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m"); + + pmap = mfree(pmap); + if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pid) < 0) + return log_oom(); + + r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " " GID_FMT "\n", (gid_t) userns_info->target, (gid_t) userns_info->start, (gid_t) userns_info->size); + if (r < 0) + return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m"); + + /* We are done! */ + + log_debug("Successfully configured user namespace."); + return 0; +} + +static int test_userns_api_support(Varlink *link) { + int r; + + assert(link); + + /* We only expose the userns API if our manager daemon told us this OK to do. It will set this + * boolean only if it managed to set up BPF correctly for itself (i.e. watches for userns going away + * via BPF APIs). This should make very sure we don't accidentally allow any of the userns stuff to + * go through without the BPF LSM in effect. */ + + r = getenv_bool("NSRESOURCE_API"); + if (r < 0) + return log_error_errno(r, "Failed to parse $NSRESOURCE_API: %m"); + if (r == 0) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceInterfaceNotSupported", NULL); + + return 0; +} + +static int validate_name(Varlink *link, const char *name, char **ret) { + _cleanup_free_ char *un = NULL; + int r; + + assert(link); + assert(name); + assert(ret); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (peer_uid == 0) { + if (!userns_name_is_valid(name)) + return varlink_error_invalid_parameter_name(link, "name"); + + un = strdup(name); + if (!un) + return -ENOMEM; + } else { + /* The the client is not root then prefix the name with the UID of the peer, so that they + * live in separate namespaces and cannot steal each other's names. */ + + if (asprintf(&un, UID_FMT "-%s", peer_uid, name) < 0) + return -ENOMEM; + + if (!userns_name_is_valid(un)) + return varlink_error_invalid_parameter_name(link, "name"); + } + + *ret = TAKE_PTR(un); + return 0; +} + +static int validate_target_and_size(Varlink *link, unsigned target, unsigned size) { + assert(link); + + if (!IN_SET(size, 1U, 0x10000)) + return varlink_error_invalid_parameter_name(link, "size"); + + if (!uid_is_valid(target) || target > UINT32_MAX - size) + return varlink_error_invalid_parameter_name(link, "target"); + + return 0; +} + +static int validate_userns(Varlink *link, int userns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + + r = fd_verify_safe_flags(userns_fd); + if (r < 0) + return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m"); + + /* Validate this is actually a valid user namespace fd */ + r = fd_is_ns(userns_fd, CLONE_NEWUSER); + if (r < 0) + return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m"); + if (r == 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* And refuse the thing if it is our own */ + r = is_our_namespace(userns_fd, NAMESPACE_USER); + if (r < 0) + return log_debug_errno(r, "Failed to check if user namespace fd refers to our own user namespace: %m"); + if (r > 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return log_debug_errno(r, "Failed to acquire peer UID: %m"); + + if (peer_uid != 0) { + /* Refuse if the userns is not actually owned by our client. */ + uid_t owner_uid; + if (ioctl(userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0) + return log_debug_errno(errno, "Failed to get owner UID of user namespace: %m"); + + if (owner_uid != peer_uid) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + } + + return 0; +} + +static int validate_userns_is_empty(Varlink *link, int userns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + + _cleanup_(uid_range_freep) UIDRange *range = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns UID range: %m"); + + if (!uid_range_is_empty(range)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + range = uid_range_free(range); + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns GID range: %m"); + + if (!uid_range_is_empty(range)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + return 0; +} + +typedef struct AllocateParameters { + const char *name; + unsigned size; + unsigned target; + unsigned userns_fd_idx; +} AllocateParameters; + +static int vl_method_allocate_user_range(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "name", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AllocateParameters, name), JSON_MANDATORY }, + { "size", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, size), JSON_MANDATORY }, + { "target", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, target), 0 }, + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), JSON_MANDATORY }, + {} + }; + + struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata); + _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF, lock_fd = -EBADF; + _cleanup_free_ char *userns_name = NULL; + uid_t peer_uid; + struct stat userns_st; + AllocateParameters p = { + .size = UINT_MAX, + .userns_fd_idx = UINT_MAX, + }; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + r = validate_name(link, p.name, &userns_name); + if (r != 0) + return r; + + r = validate_target_and_size(link, p.target, p.size); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m"); + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + r = validate_userns_is_empty(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (!*bpf) { + r = userns_restrict_install(/* pin= */ true, bpf); + if (r < 0) + return r; + } + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new(); + if (!userns_info) + return -ENOMEM; + + userns_info->name = TAKE_PTR(userns_name); + if (!userns_info->name) + return -ENOMEM; + + userns_info->owner = peer_uid; + userns_info->userns_inode = userns_st.st_ino; + userns_info->size = p.size; + userns_info->target = p.target; + + r = allocate_now(registry_dir_fd, userns_info, &lock_fd); + if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */ + return varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL); + if (r == -EBUSY) /* All used up */ + return varlink_error(link, "io.systemd.NamespaceResource.NoDynamicRange", NULL); + if (r == -EDEADLK) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL); + if (r == -EEXIST) + return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL); + if (r < 0) + return r; + + r = userns_registry_store(registry_dir_fd, userns_info); + if (r < 0) + return r; + + /* Register the userns in the BPF map with an empty allowlist */ + r = userns_restrict_put_by_fd( + *bpf, + userns_fd, + /* replace= */ true, + /* mount_fds= */ NULL, + /* n_mount_fds= */ 0); + if (r < 0) + goto fail; + + r = write_userns(userns_fd, userns_info); + if (r < 0) + goto fail; + + lock_fd = safe_close(lock_fd); + + /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */ + r = sd_pid_notifyf_with_fds( + /* pid= */ 0, + /* unset_environment= */ false, + &userns_fd, 1, + "FDSTORE=1\n" + "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode); + if (r < 0) + goto fail; + + /* Note, we'll not return UID values from the host, since the child might not run in the same + * user namespace as us. If they want to know the ranges they should read them off the userns fd, so + * that they are translated into their PoV */ + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); + +fail: + /* Note: we don't have to clean-up the BPF maps in the error path: the bpf map type used will + * automatically do that once the userns inode goes away */ + userns_registry_remove(registry_dir_fd, userns_info); + return r; +} + +static int validate_userns_is_safe(Varlink *link, int userns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + + /* Read the outside UID range and verify it isn't empty */ + _cleanup_(uid_range_freep) UIDRange *outside_range = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &outside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns UID range: %m"); + if (uid_range_is_empty(outside_range)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* Read the outside GID range and check it is the same as the UID range */ + _cleanup_(uid_range_freep) UIDRange *outside_range_gid = NULL; + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &outside_range_gid); + if (r < 0) + return log_debug_errno(r, "Failed to read userns GID range: %m"); + if (!uid_range_equal(outside_range, outside_range_gid)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* Read the inside UID range, and verify it matches the size of the outside UID range */ + _cleanup_(uid_range_freep) UIDRange *inside_range = NULL; + r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &inside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns contents: %m"); + if (uid_range_size(outside_range) != uid_range_size(inside_range)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Uh, inside and outside UID range sizes don't match."); + + /* Read the inside GID range, and verify it matches the inside UID range */ + _cleanup_(uid_range_freep) UIDRange *inside_range_gid = NULL; + r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &inside_range_gid); + if (r < 0) + return log_debug_errno(r, "Failed to read userns contents: %m"); + if (!uid_range_equal(inside_range, inside_range_gid)) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + uid_t peer_gid; + r = varlink_get_peer_gid(link, &peer_gid); + if (r < 0) + return r; + + /* Insist that the first UID/GID in the range matches the client's UID/GID */ + if (outside_range->entries[0].start != peer_uid || + outside_range_gid->entries[0].start != peer_gid) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* If there are more than one UID in the range, then also insist that the first UID maps to root inside the userns */ + if (uid_range_size(outside_range) > 1 && inside_range->entries[0].start != 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + return 0; +} + +typedef struct RegisterParameters { + const char *name; + unsigned userns_fd_idx; +} RegisterParameters; + +static int vl_method_register_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "name", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(RegisterParameters, name), JSON_MANDATORY }, + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(RegisterParameters, userns_fd_idx), JSON_MANDATORY }, + {} + }; + + struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata); + _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF; + _cleanup_free_ char *userns_name = NULL; + uid_t peer_uid; + struct stat userns_st; + RegisterParameters p = { + .userns_fd_idx = UINT_MAX, + }; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + r = validate_name(link, p.name, &userns_name); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return userns_fd; + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + r = validate_userns_is_safe(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (!*bpf) { + r = userns_restrict_install(/* pin= */ true, bpf); + if (r < 0) + return r; + } + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + r = userns_registry_inode_exists(registry_dir_fd, userns_st.st_ino); + if (r < 0) + return r; + if (r > 0) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL); + + r = name_is_available(registry_dir_fd, userns_name); + if (r < 0) + return r; + if (r == 0) + return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL); + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new(); + if (!userns_info) + return -ENOMEM; + + userns_info->name = TAKE_PTR(userns_name); + if (!userns_info->name) + return -ENOMEM; + + userns_info->owner = peer_uid; + userns_info->userns_inode = userns_st.st_ino; + + r = userns_registry_store(registry_dir_fd, userns_info); + if (r < 0) + return log_debug_errno(r, "Failed to update userns registry: %m"); + + /* Register the userns in the BPF map with an empty allowlist */ + r = userns_restrict_put_by_fd( + *bpf, + userns_fd, + /* replace= */ true, + /* mount_fds= */ NULL, + /* n_mount_fds= */ 0); + if (r < 0) + goto fail; + + /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */ + r = sd_pid_notifyf_with_fds( + /* pid= */ 0, + /* unset_environment= */ false, + &userns_fd, 1, + "FDSTORE=1\n" + "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode); + if (r < 0) + goto fail; + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); + +fail: + userns_registry_remove(registry_dir_fd, userns_info); + return r; +} + +typedef struct AddMountParameters { + unsigned userns_fd_idx; + unsigned mount_fd_idx; +} AddMountParameters; + +static int vl_method_add_mount_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch parameter_dispatch_table[] = { + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, userns_fd_idx), JSON_MANDATORY }, + { "mountFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, mount_fd_idx), JSON_MANDATORY }, + {} + }; + + _cleanup_close_ int userns_fd = -EBADF, mount_fd = -EBADF, registry_dir_fd = -EBADF; + struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata); + AddMountParameters p = { + .userns_fd_idx = UINT_MAX, + .mount_fd_idx = UINT_MAX, + }; + int r, mnt_id = 0; + struct stat userns_st; + uid_t peer_uid; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + /* Allowlisting arbitrary mounts is a privileged operation */ + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + if (peer_uid != 0) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return userns_fd; + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return -errno; + + mount_fd = varlink_take_fd(link, p.mount_fd_idx); + if (mount_fd < 0) + return mount_fd; + + r = fd_verify_safe_flags_full(mount_fd, O_PATH|O_DIRECTORY); + if (r < 0) + return log_debug_errno(r, "Mount file descriptor has unsafe flags set: %m"); + + r = fd_verify_directory(mount_fd); + if (r < 0) + return r; + + r = path_get_mnt_id_at(mount_fd, NULL, &mnt_id); + if (r < 0) + return r; + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + r = userns_registry_load_by_userns_inode( + registry_dir_fd, + userns_st.st_ino, + &userns_info); + if (r == -ENOENT) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return r; + + if (!*bpf) { + r = userns_restrict_install(/* pin= */ true, bpf); + if (r < 0) + return r; + } + + /* Pin the mount fd */ + r = sd_pid_notifyf_with_fds( + /* pid= */ 0, + /* unset_environment= */ false, + &mount_fd, 1, + "FDSTORE=1\n" + "FDNAME=userns-" INO_FMT "\n", userns_st.st_ino); + if (r < 0) + return r; + + /* Add this mount to the user namespace's BPF map allowlist entry. */ + r = userns_restrict_put_by_fd( + *bpf, + userns_fd, + /* replace= */ false, + &mount_fd, + 1); + if (r < 0) + return r; + + if (userns_info->size > 0) + log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s' @ UID " UID_FMT ")", + mnt_id, userns_st.st_ino, userns_info->name, userns_info->start); + else + log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s')", + mnt_id, userns_st.st_ino, userns_info->name); + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); +} + +static int validate_cgroup(Varlink *link, int fd, uint64_t *ret_cgroup_id) { + int r; + + assert(link); + assert(fd >= 0); + assert(ret_cgroup_id); + + r = fd_verify_safe_flags_full(fd, O_DIRECTORY); + if (r < 0) + return log_debug_errno(r, "Control group file descriptor has unsafe flags set: %m"); + + r = fd_verify_directory(fd); + if (r < 0) + return log_debug_errno(r, "Verification that cgroup fd refers to directory failed: %m"); + + r = fd_is_fs_type(fd, CGROUP2_SUPER_MAGIC); + if (r < 0) + return log_debug_errno(r, "Failed to check if cgroup fd actually refers to cgroupfs: %m"); + if (r == 0) + return varlink_error_invalid_parameter_name(link, "controlGroupFileDescriptor"); + + r = cg_fd_get_cgroupid(fd, ret_cgroup_id); + if (r < 0) + return log_debug_errno(r, "Failed to read cgroup ID from cgroupfs: %m"); + + return 0; +} + +typedef struct AddCGroupParameters { + unsigned userns_fd_idx; + unsigned cgroup_fd_idx; +} AddCGroupParameters; + +static int vl_method_add_cgroup_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch parameter_dispatch_table[] = { + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, userns_fd_idx), JSON_MANDATORY }, + { "controlGroupFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, cgroup_fd_idx), JSON_MANDATORY }, + {} + }; + + _cleanup_close_ int userns_fd = -EBADF, cgroup_fd = -EBADF, registry_dir_fd = -EBADF; + AddCGroupParameters p = { + .userns_fd_idx = UINT_MAX, + .cgroup_fd_idx = UINT_MAX, + }; + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + struct stat userns_st, cgroup_st; + uid_t peer_uid; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m"); + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + + cgroup_fd = varlink_take_fd(link, p.cgroup_fd_idx); + if (cgroup_fd < 0) + return log_debug_errno(cgroup_fd, "Failed to take cgroup fd from Varlink connection: %m"); + + uint64_t cgroup_id; + r = validate_cgroup(link, cgroup_fd, &cgroup_id); + if (r != 0) + return r; + + if (fstat(cgroup_fd, &cgroup_st) < 0) + return log_debug_errno(errno, "Failed to fstat() cgroup fd: %m"); + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return lock_fd; + + r = userns_registry_load_by_userns_inode( + registry_dir_fd, + userns_st.st_ino, + &userns_info); + if (r == -ENOENT) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return r; + + /* The user namespace must have a user assigned */ + if (userns_info->size == 0) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceWithoutUserRange", NULL); + if (userns_info_has_cgroup(userns_info, cgroup_id)) + return varlink_error(link, "io.systemd.NamespaceResource.ControlGroupAlreadyAdded", NULL); + if (userns_info->n_cgroups > USER_NAMESPACE_CGROUPS_DELEGATE_MAX) + return varlink_error(link, "io.systemd.NamespaceResource.TooManyControlGroups", NULL); + + /* Registering a cgroup for this client is only allowed for the root or the owner of a userns */ + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return log_debug_errno(r, "Failed to get connection peer: %m"); + if (peer_uid != 0) { + if (peer_uid != userns_info->owner) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + /* The cgroup must be owned by the owner of the userns */ + if (cgroup_st.st_uid != userns_info->owner) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + } + + r = userns_info_add_cgroup(userns_info, cgroup_id); + if (r < 0) + return r; + + r = userns_registry_store(registry_dir_fd, userns_info); + if (r < 0) + return r; + + if (fchown(cgroup_fd, userns_info->start, userns_info->start) < 0) + return log_debug_errno(errno, "Failed to change ownership of cgroup: %m"); + + if (fchmod(cgroup_fd, 0755) < 0) + return log_debug_errno(errno, "Failed to change access mode of cgroup: %m"); + + FOREACH_STRING(attr, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") { + (void) fchmodat(cgroup_fd, attr, 0644, AT_SYMLINK_NOFOLLOW); + (void) fchownat(cgroup_fd, attr, userns_info->start, userns_info->start, AT_SYMLINK_NOFOLLOW); + } + + log_debug("Granting ownership to cgroup %" PRIu64 " to userns " INO_FMT " ('%s' @ UID " UID_FMT ")", + cgroup_id, userns_st.st_ino, userns_info->name, userns_info->start); + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); +} + +static uint64_t hash_ifname_id(UserNamespaceInfo *userns_info, const char *ifname) { + struct siphash state; + + assert(userns_info); + + siphash24_init(&state, (const uint8_t[]) { 0xc4, 0x6c, 0x96, 0xe8, 0xad, 0x37, 0x4d, 0x5f, 0xa1, 0xae, 0xfe, 0x70, 0x40, 0xed, 0x41, 0x5f }); + siphash24_compress_string(userns_info->name, &state); + siphash24_compress_byte(0, &state); /* separator */ + siphash24_compress_string(strempty(ifname), &state); + + return siphash24_finalize(&state); +} + +static void hash_ether_addr(UserNamespaceInfo *userns_info, const char *ifname, uint64_t n, struct ether_addr *ret) { + struct siphash state; + uint64_t h; + + assert(userns_info); + assert(ret); + + siphash24_init(&state, (const uint8_t[]) { 0x36, 0xaa, 0xd1, 0x69, 0xc7, 0xe5, 0x4c, 0xaa, 0x1e, 0xb2, 0x9e, 0xb3, 0x3a, 0x6b, 0xd4, 0x71 }); + siphash24_compress_string(userns_info->name, &state); + siphash24_compress_byte(0, &state); /* separator */ + siphash24_compress_string(strempty(ifname), &state); + siphash24_compress_byte(0, &state); /* separator */ + n = htole64(n); /* add the 'index' to the mix in an endianess-independent fashion */ + siphash24_compress(&n, sizeof(n), &state); + + h = htole64(siphash24_finalize(&state)); + + assert(sizeof(h) >= sizeof_field(struct ether_addr, ether_addr_octet)); + + memcpy(ret->ether_addr_octet, &h, sizeof_field(struct ether_addr, ether_addr_octet)); + ether_addr_mark_random(ret); +} + +static int create_veth( + int netns_fd, + const char *ifname_host, + const char *altifname_host, + struct ether_addr *mac_host, + const char *ifname_namespace, + struct ether_addr *mac_namespace) { + + int r; + + assert(netns_fd >= 0); + assert(ifname_host); + assert(mac_host); + assert(ifname_namespace); + assert(mac_namespace); + + log_debug("Creating veth link on host %s (%s) with address %s to container as %s with address %s", + ifname_host, strna(altifname_host), ETHER_ADDR_TO_STR(mac_host), + ifname_namespace, ETHER_ADDR_TO_STR(mac_namespace)); + + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + r = sd_netlink_open(&rtnl); + if (r < 0) + return r; + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container(m, VETH_INFO_PEER); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_namespace); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_namespace); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new veth interfaces (%s:%s): %m", ifname_host, ifname_namespace); + + r = rtnl_set_link_alternative_names_by_ifname(&rtnl, ifname_host, STRV_MAKE(altifname_host)); + if (r < 0) + log_warning_errno(r, "Failed to set alternative interface name to '%s', ignoring: %m", altifname_host); + + return 0; +} + +static int validate_netns(Varlink *link, int userns_fd, int netns_fd) { + int r; + + assert(link); + assert(userns_fd >= 0); + assert(netns_fd >= 0); + + r = fd_verify_safe_flags(netns_fd); + if (r < 0) + return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m"); + + /* Validate this is actually a valid network namespace fd */ + r = fd_is_ns(netns_fd, CLONE_NEWNET); + if (r < 0) + return r; + if (r == 0) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + + /* And refuse the thing if it is our own */ + r = is_our_namespace(netns_fd, NAMESPACE_NET); + if (r < 0) + return r; + if (r > 0) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + + /* Check if the netns actually belongs to the userns */ + _cleanup_close_ int owner_userns_fd = -EBADF; + owner_userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (owner_userns_fd < 0) + return -errno; + + r = inode_same_at(owner_userns_fd, /* path_a= */ NULL, userns_fd, /* path_b= */ NULL, AT_EMPTY_PATH); + if (r < 0) + return r; + if (r == 0) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + + uid_t peer_uid; + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + + if (peer_uid != 0) { + /* Refuse if the netns is not actually owned by our client. */ + + uid_t owner_uid; + if (ioctl(owner_userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0) + return -errno; + + if (owner_uid != peer_uid) + return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor"); + } + + return 0; +} + +typedef struct AddNetworkParameters { + unsigned userns_fd_idx; + unsigned netns_fd_idx; + const char *ifname; + const char *mode; +} AddNetworkParameters; + +static int vl_method_add_netif_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch parameter_dispatch_table[] = { + { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddNetworkParameters, userns_fd_idx), JSON_MANDATORY }, + { "networkNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddNetworkParameters, netns_fd_idx), JSON_MANDATORY }, + { "namespaceInterfaceName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AddNetworkParameters, ifname), 0 }, + { "mode", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(AddNetworkParameters, mode), JSON_MANDATORY }, + {} + }; + + _cleanup_close_ int userns_fd = -EBADF, netns_fd = -EBADF, registry_dir_fd = -EBADF; + AddNetworkParameters p = { + .userns_fd_idx = UINT_MAX, + }; + _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL; + struct stat userns_st; + uid_t peer_uid; + int r; + + assert(link); + assert(parameters); + + r = test_userns_api_support(link); + if (r != 0) + return r; + + r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p); + if (r != 0) + return r; + + userns_fd = varlink_take_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return userns_fd; + + r = validate_userns(link, userns_fd); + if (r != 0) + return r; + + if (fstat(userns_fd, &userns_st) < 0) + return -errno; + + netns_fd = varlink_take_fd(link, p.netns_fd_idx); + if (netns_fd < 0) + return netns_fd; + + r = validate_netns(link, userns_fd, netns_fd); + if (r != 0) + return r; + + if (!streq_ptr(p.mode, "veth")) + return varlink_error_invalid_parameter_name(link, "mode"); + + if (p.ifname && !ifname_valid(p.ifname)) + return varlink_error_invalid_parameter_name(link, "interfaceName"); + + registry_dir_fd = userns_registry_open_fd(); + if (registry_dir_fd < 0) + return registry_dir_fd; + + _cleanup_close_ int lock_fd = -EBADF; + lock_fd = userns_registry_lock(registry_dir_fd); + if (lock_fd < 0) + return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m"); + + r = userns_registry_load_by_userns_inode( + registry_dir_fd, + userns_st.st_ino, + &userns_info); + if (r == -ENOENT) + return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL); + if (r < 0) + return r; + + /* Registering a network interface for this client is only allowed for the root or the owner of a userns */ + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return r; + if (peer_uid != 0 && peer_uid != userns_info->owner) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL); + + _cleanup_free_ char *ifname_host = NULL, *altifname_host = NULL; + const char *ifname_namespace = p.ifname ?: "host0"; + + /* The short ifname is just too short to generate readable and unique names where unprivileged users + * can't take each others names. Hence just hash it. The alternative name however contains more useful + * information. */ + if (asprintf(&ifname_host, "ns-%08" PRIx64, hash_ifname_id(userns_info, p.ifname)) < 0) + return -ENOMEM; + strshorten(ifname_host, IFNAMSIZ-1); + + if (p.ifname) + r = asprintf(&altifname_host, "ns-" UID_FMT "-%s-%s", userns_info->owner, userns_info->name, p.ifname); + else + r = asprintf(&altifname_host, "ns-" UID_FMT "-%s", userns_info->owner, userns_info->name); + if (r < 0) + return -ENOMEM; + + struct ether_addr ether_addr_host, ether_addr_namespace; + + hash_ether_addr(userns_info, p.ifname, 0, ðer_addr_host); + hash_ether_addr(userns_info, p.ifname, 1, ðer_addr_namespace); + + r = create_veth(netns_fd, + ifname_host, altifname_host, ðer_addr_host, + ifname_namespace, ðer_addr_namespace); + if (r < 0) + return r; + + log_debug("Adding veth tunnel %s from host to userns " INO_FMT " ('%s' @ UID " UID_FMT ", interface %s).", + ifname_host, userns_st.st_ino, userns_info->name, userns_info->start, ifname_namespace); + + return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hostInterfaceName", JSON_BUILD_STRING(ifname_host)), + JSON_BUILD_PAIR("namespaceInterfaceName", JSON_BUILD_STRING(ifname_namespace)))); +} + +static int process_connection(VarlinkServer *server, int _fd) { + _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */ + _cleanup_(varlink_close_unrefp) Varlink *vl = NULL; + int r; + + r = varlink_server_add_connection(server, fd, &vl); + if (r < 0) + return log_error_errno(r, "Failed to add connection: %m"); + + TAKE_FD(fd); + vl = varlink_ref(vl); + + r = varlink_set_allow_fd_passing_input(vl, true); + if (r < 0) + return log_error_errno(r, "Failed to enable fd passing for read: %m"); + + r = varlink_set_allow_fd_passing_output(vl, true); + if (r < 0) + return log_error_errno(r, "Failed to enable fd passing for write: %m"); + + for (;;) { + r = varlink_process(vl); + if (r == -ENOTCONN) { + log_debug("Connection terminated."); + break; + } + if (r < 0) + return log_error_errno(r, "Failed to process connection: %m"); + if (r > 0) + continue; + + r = varlink_wait(vl, CONNECTION_IDLE_USEC); + if (r < 0) + return log_error_errno(r, "Failed to wait for connection events: %m"); + if (r == 0) + break; + } + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *bpf = NULL; + usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY; + _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL; + _cleanup_(pidref_done) PidRef parent = PIDREF_NULL; + unsigned n_iterations = 0; + int m, listen_fd, r; + + log_setup(); + + m = sd_listen_fds(false); + if (m < 0) + return log_error_errno(m, "Failed to determine number of listening fds: %m"); + if (m == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received."); + if (m > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time."); + + listen_fd = SD_LISTEN_FDS_START; + + r = fd_nonblock(listen_fd, false); + if (r < 0) + return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m"); + + r = varlink_server_new(&server, VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_error_errno(r, "Failed to allocate server: %m"); + + r = varlink_server_add_interface_many( + server, + &vl_interface_io_systemd_NamespaceResource, + &vl_interface_io_systemd_UserDatabase); + if (r < 0) + return log_error_errno(r, "Failed to add UserDatabase and NamespaceResource interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + server, + "io.systemd.NamespaceResource.AllocateUserRange", vl_method_allocate_user_range, + "io.systemd.NamespaceResource.RegisterUserNamespace", vl_method_register_user_namespace, + "io.systemd.NamespaceResource.AddMountToUserNamespace", vl_method_add_mount_to_user_namespace, + "io.systemd.NamespaceResource.AddControlGroupToUserNamespace", vl_method_add_cgroup_to_user_namespace, + "io.systemd.NamespaceResource.AddNetworkToUserNamespace", vl_method_add_netif_to_user_namespace, + "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, + "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships); + if (r < 0) + return log_error_errno(r, "Failed to bind methods: %m"); + + varlink_server_set_userdata(server, &bpf); + + r = getenv_bool("NSRESOURCE_FIXED_WORKER"); + if (r < 0) + return log_error_errno(r, "Failed to parse NSRESOURCE_FIXED_WORKER: %m"); + listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC; + + r = pidref_set_parent(&parent); + if (r < 0) + return log_error_errno(r, "Failed to acquire pidfd of parent process: %m"); + + start_time = now(CLOCK_MONOTONIC); + + for (;;) { + _cleanup_close_ int fd = -EBADF; + usec_t n; + + /* Exit the worker in regular intervals, to flush out all memory use */ + if (n_iterations++ > ITERATIONS_MAX) { + log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations); + break; + } + + n = now(CLOCK_MONOTONIC); + if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) { + log_debug("Exiting worker, ran for %s, that's enough.", + FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0)); + break; + } + + if (last_busy_usec == USEC_INFINITY) + last_busy_usec = n; + else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) { + log_debug("Exiting worker, been idle for %s.", + FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0)); + break; + } + + (void) rename_process("systemd-nsresourcework: waiting..."); + fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC)); + (void) rename_process("systemd-nsresourcework: processing..."); + + if (fd == -EAGAIN) + continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected + * after a while, let's check if it's time to exit though. */ + if (fd == -EINTR) + continue; /* Might be that somebody attached via strace, let's just continue in that + * case */ + if (fd < 0) + return log_error_errno(fd, "Failed to accept() from listening socket: %m"); + + if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) { + /* We only slept a very short time? If so, let's see if there are more sockets + * pending, and if so, let's ask our parent for more workers */ + + r = fd_wait_for_event(listen_fd, POLLIN, 0); + if (r < 0) + return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m"); + + if (FLAGS_SET(r, POLLIN)) { + r = pidref_kill(&parent, SIGUSR2); + if (r == -ESRCH) + return log_error_errno(r, "Parent already died?"); + if (r < 0) + return log_error_errno(r, "Failed to send SIGUSR2 signal to parent. %m"); + } + } + + (void) process_connection(server, TAKE_FD(fd)); + last_busy_usec = USEC_INFINITY; + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); |