diff options
Diffstat (limited to '')
-rw-r--r-- | src/mountfsd/io.systemd.mount-file-system.policy | 70 | ||||
-rw-r--r-- | src/mountfsd/meson.build | 26 | ||||
-rw-r--r-- | src/mountfsd/mountfsd-manager.c | 277 | ||||
-rw-r--r-- | src/mountfsd/mountfsd-manager.h | 30 | ||||
-rw-r--r-- | src/mountfsd/mountfsd.c | 43 | ||||
-rw-r--r-- | src/mountfsd/mountwork.c | 703 |
6 files changed, 1149 insertions, 0 deletions
diff --git a/src/mountfsd/io.systemd.mount-file-system.policy b/src/mountfsd/io.systemd.mount-file-system.policy new file mode 100644 index 0000000..6a151eb --- /dev/null +++ b/src/mountfsd/io.systemd.mount-file-system.policy @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*--> +<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN" + "https://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd"> + +<!-- + SPDX-License-Identifier: LGPL-2.1-or-later + + This file is part of systemd. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +--> + +<policyconfig> + + <vendor>The systemd Project</vendor> + <vendor_url>https://systemd.io</vendor_url> + + <!-- Allow mounting DDIs into the host user namespace --> + <action id="io.systemd.mount-file-system.mount-image"> + <!-- This action is generally checked first: we'll first try to mount the image with + signature checks on. If that fails, we'll retry with the untrusted action below. --> + <description gettext-domain="systemd">Allow mounting of file system image</description> + <message gettext-domain="systemd">Authentication is required for an application to mount a file system image.</message> + <defaults> + <allow_any>auth_admin_keep</allow_any> + <allow_inactive>auth_admin_keep</allow_inactive> + <allow_active>yes</allow_active> + </defaults> + </action> + + <action id="io.systemd.mount-file-system.mount-untrusted-image"> + <!-- If the image cannot be mounted via the regular action because it is not signed by a + recognized key, we'll try this action. --> + <description gettext-domain="systemd">Allow mounting of untrusted file system image</description> + <message gettext-domain="systemd">Authentication is required for an application to mount a cryptographically unsigned file system image or an image whose cryptographic signature is not recognized.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin</allow_active> + </defaults> + + <annotate key="org.freedesktop.policykit.imply">io.systemd.mount-file-system.mount-image</annotate> + </action> + + <!-- Allow mounting DDIs into a private user namespace --> + <action id="io.systemd.mount-file-system.mount-image-privately"> + <description gettext-domain="systemd">Allow private mounting of trusted file system image</description> + <message gettext-domain="systemd">Authentication is required for an application to privately mount a file system image or an image whose cryptographic signature is recognized.</message> + <defaults> + <allow_any>yes</allow_any> + <allow_inactive>yes</allow_inactive> + <allow_active>yes</allow_active> + </defaults> + </action> + + <action id="io.systemd.mount-file-system.mount-untrusted-image-privately"> + <description gettext-domain="systemd">Allow private mounting of untrusted file system image</description> + <message gettext-domain="systemd">Authentication is required for an application to privately mount a cryptographically unsigned file system image or an image whose cryptographic signature is not recognized.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin</allow_active> + </defaults> + + <annotate key="org.freedesktop.policykit.imply">io.systemd.mount-file-system.mount-image-privately</annotate> + </action> +</policyconfig> diff --git a/src/mountfsd/meson.build b/src/mountfsd/meson.build new file mode 100644 index 0000000..53fc3d3 --- /dev/null +++ b/src/mountfsd/meson.build @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_mountwork_sources = files( + 'mountwork.c', +) + +systemd_mountfsd_sources = files( + 'mountfsd.c', + 'mountfsd-manager.c', +) + +executables += [ + libexec_template + { + 'name' : 'systemd-mountfsd', + 'conditions' : ['ENABLE_MOUNTFSD'], + 'sources' : systemd_mountfsd_sources, + }, + libexec_template + { + 'name' : 'systemd-mountwork', + 'conditions' : ['ENABLE_MOUNTFSD'], + 'sources' : systemd_mountwork_sources, + }, +] + +install_data('io.systemd.mount-file-system.policy', + install_dir : polkitpolicydir) diff --git a/src/mountfsd/mountfsd-manager.c b/src/mountfsd/mountfsd-manager.c new file mode 100644 index 0000000..b05c6e8 --- /dev/null +++ b/src/mountfsd/mountfsd-manager.c @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include <sys/wait.h> + +#include "sd-daemon.h" + +#include "build-path.h" +#include "common-signal.h" +#include "env-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "mountfsd-manager.h" +#include "process-util.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "umask-util.h" + +#define LISTEN_TIMEOUT_USEC (25 * USEC_PER_SEC) + +static int start_workers(Manager *m, bool explicit_request); + +static size_t manager_current_workers(Manager *m) { + assert(m); + + return set_size(m->workers_fixed) + set_size(m->workers_dynamic); +} + +static int on_worker_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + assert_se(!set_remove(m->workers_dynamic, s) != !set_remove(m->workers_fixed, s)); + sd_event_source_disable_unref(s); + + if (si->si_code == CLD_EXITED) { + if (si->si_status == EXIT_SUCCESS) + log_debug("Worker " PID_FMT " exited successfully.", si->si_pid); + else + log_warning("Worker " PID_FMT " died with a failure exit status %i, ignoring.", si->si_pid, si->si_status); + } else if (si->si_code == CLD_KILLED) + log_warning("Worker " PID_FMT " was killed by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status)); + else if (si->si_code == CLD_DUMPED) + log_warning("Worker " PID_FMT " dumped core by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status)); + else + log_warning("Got unexpected exit code via SIGCHLD, ignoring."); + + (void) start_workers(m, /* explicit_request= */ false); /* Fill up workers again if we fell below the low watermark */ + return 0; +} + +static int on_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + (void) start_workers(m, /* explicit_request= */ true); /* Workers told us there's more work, let's add one more worker as long as we are below the high watermark */ + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + event_source_hash_ops, + sd_event_source, + (void (*)(const sd_event_source*, struct siphash*)) trivial_hash_func, + (int (*)(const sd_event_source*, const sd_event_source*)) trivial_compare_func, + sd_event_source_disable_unref); + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .listen_fd = -EBADF, + .worker_ratelimit = { + .interval = 5 * USEC_PER_SEC, + .burst = 50, + }, + }; + + r = sd_event_new(&m->event); + if (r < 0) + return r; + + r = sd_event_set_signal_exit(m->event, true); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, (SIGRTMIN+18)|SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL); + if (r < 0) + return r; + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + r = sd_event_set_watchdog(m->event, true); + if (r < 0) + log_debug_errno(r, "Failed to enable watchdog handling, ignoring: %m"); + + r = sd_event_add_signal(m->event, NULL, SIGUSR2|SD_EVENT_SIGNAL_PROCMASK, on_sigusr2, m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + set_free(m->workers_fixed); + set_free(m->workers_dynamic); + + /* Note: we rely on PR_DEATHSIG to kill the workers for us */ + + sd_event_unref(m->event); + + return mfree(m); +} + +static int start_one_worker(Manager *m) { + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL; + bool fixed; + pid_t pid; + int r; + + assert(m); + + fixed = set_size(m->workers_fixed) < MOUNTFS_WORKERS_MIN; + + r = safe_fork_full( + "(sd-worker)", + /* stdio_fds= */ NULL, + &m->listen_fd, 1, + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_CLOSE_ALL_FDS, + &pid); + if (r < 0) + return log_error_errno(r, "Failed to fork new worker child: %m"); + if (r == 0) { + char pids[DECIMAL_STR_MAX(pid_t)]; + /* Child */ + + if (m->listen_fd == 3) { + r = fd_cloexec(3, false); + if (r < 0) { + log_error_errno(r, "Failed to turn off O_CLOEXEC for fd 3: %m"); + _exit(EXIT_FAILURE); + } + } else { + if (dup2(m->listen_fd, 3) < 0) { /* dup2() creates with O_CLOEXEC off */ + log_error_errno(errno, "Failed to move listen fd to 3: %m"); + _exit(EXIT_FAILURE); + } + + safe_close(m->listen_fd); + } + + xsprintf(pids, PID_FMT, pid); + if (setenv("LISTEN_PID", pids, 1) < 0) { + log_error_errno(errno, "Failed to set $LISTEN_PID: %m"); + _exit(EXIT_FAILURE); + } + + if (setenv("LISTEN_FDS", "1", 1) < 0) { + log_error_errno(errno, "Failed to set $LISTEN_FDS: %m"); + _exit(EXIT_FAILURE); + } + + if (setenv("MOUNTFS_FIXED_WORKER", one_zero(fixed), 1) < 0) { + log_error_errno(errno, "Failed to set $MOUNTFS_FIXED_WORKER: %m"); + _exit(EXIT_FAILURE); + } + + r = setenv_systemd_log_level(); + if (r < 0) { + log_error_errno(r, "Failed to set $SYSTEMD_LOG_LEVEL: %m"); + _exit(EXIT_FAILURE); + } + + r = invoke_callout_binary(SYSTEMD_MOUNTWORK_PATH, STRV_MAKE("systemd-mountwork", "xxxxxxxxxxxxxxxx")); /* With some extra space rename_process() can make use of */ + log_error_errno(r, "Failed start worker process: %m"); + _exit(EXIT_FAILURE); + } + + r = sd_event_add_child(m->event, &source, pid, WEXITED, on_worker_exit, m); + if (r < 0) + return log_error_errno(r, "Failed to watch child " PID_FMT ": %m", pid); + + r = set_ensure_put( + fixed ? &m->workers_fixed : &m->workers_dynamic, + &event_source_hash_ops, + source); + if (r < 0) + return log_error_errno(r, "Failed to add child process to set: %m"); + + TAKE_PTR(source); + + return 0; +} + +static int start_workers(Manager *m, bool explicit_request) { + int r; + + assert(m); + + for (;;) { + size_t n; + + n = manager_current_workers(m); + + log_debug("%zu workers running.", n); + + if (n >= MOUNTFS_WORKERS_MIN && (!explicit_request || n >= MOUNTFS_WORKERS_MAX)) + break; + + if (!ratelimit_below(&m->worker_ratelimit)) { + /* If we keep starting workers too often, let's fail the whole daemon, something is wrong */ + sd_event_exit(m->event, EXIT_FAILURE); + + return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "Worker threads requested too frequently, something is wrong."); + } + + r = start_one_worker(m); + if (r < 0) + return r; + + explicit_request = false; + } + + return 0; +} + +int manager_startup(Manager *m) { + int n; + + assert(m); + assert(m->listen_fd < 0); + + n = sd_listen_fds(false); + if (n < 0) + return log_error_errno(n, "Failed to determine number of passed file descriptors: %m"); + if (n > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected one listening fd, got %i.", n); + if (n == 1) + m->listen_fd = SD_LISTEN_FDS_START; + else { + static const union sockaddr_union sockaddr = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/io.systemd.MountFileSystem", + }; + + m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (m->listen_fd < 0) + return log_error_errno(errno, "Failed to bind on socket: %m"); + + (void) sockaddr_un_unlink(&sockaddr.un); + + WITH_UMASK(0000) + if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0) + return log_error_errno(errno, "Failed to bind socket: %m"); + + if (listen(m->listen_fd, SOMAXCONN) < 0) + return log_error_errno(errno, "Failed to listen on socket: %m"); + } + + /* Let's make sure every accept() call on this socket times out after 25s. This allows workers to be + * GC'ed on idle */ + if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0) + return log_error_errno(errno, "Failed to se SO_RCVTIMEO: %m"); + + return start_workers(m, /* explicit_request= */ false); +} diff --git a/src/mountfsd/mountfsd-manager.h b/src/mountfsd/mountfsd-manager.h new file mode 100644 index 0000000..6bfbddc --- /dev/null +++ b/src/mountfsd/mountfsd-manager.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-event.h" + +typedef struct Manager Manager; + +#include "hashmap.h" +#include "ratelimit.h" + +#define MOUNTFS_WORKERS_MIN 3 +#define MOUNTFS_WORKERS_MAX 4096 + +struct Manager { + sd_event *event; + + Set *workers_fixed; /* Workers 0…MOUNTFS_WORKERS_MIN */ + Set *workers_dynamic; /* Workers MOUNTFS_WORKERS_MIN+1…MOUNTFS_WORKERS_MAX */ + + int listen_fd; + + RateLimit worker_ratelimit; +}; + +int manager_new(Manager **ret); +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_startup(Manager *m); diff --git a/src/mountfsd/mountfsd.c b/src/mountfsd/mountfsd.c new file mode 100644 index 0000000..6073bd5 --- /dev/null +++ b/src/mountfsd/mountfsd.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/stat.h> +#include <sys/types.h> + +#include "daemon-util.h" +#include "log.h" +#include "main-func.h" +#include "mountfsd-manager.h" +#include "signal-util.h" + +static int run(int argc, char *argv[]) { + _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + log_setup(); + + umask(0022); + + if (argc != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments."); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Could not create manager: %m"); + + r = manager_startup(m); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + + notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/mountfsd/mountwork.c b/src/mountfsd/mountwork.c new file mode 100644 index 0000000..1d218a6 --- /dev/null +++ b/src/mountfsd/mountwork.c @@ -0,0 +1,703 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-daemon.h" + +#include "argv-util.h" +#include "bus-polkit.h" +#include "chase.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "main-func.h" +#include "missing_loop.h" +#include "namespace-util.h" +#include "nsresource.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "user-util.h" +#include "varlink.h" +#include "varlink-io.systemd.MountFileSystem.h" + +#define ITERATIONS_MAX 64U +#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE) +#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC) +#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC) + +static const ImagePolicy image_policy_untrusted = { + .n_policies = 2, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT }, + { PARTITION_USR, PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +static int json_dispatch_image_policy(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + _cleanup_(image_policy_freep) ImagePolicy *q = NULL; + ImagePolicy **p = ASSERT_PTR(userdata); + int r; + + assert(p); + + if (json_variant_is_null(variant)) { + *p = image_policy_free(*p); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + r = image_policy_from_string(json_variant_string(variant), &q); + if (r < 0) + return json_log(variant, flags, r, "JSON field '%s' is not a valid image policy.", strna(name)); + + image_policy_free(*p); + *p = TAKE_PTR(q); + return 0; +} + +typedef struct MountImageParameters { + unsigned image_fd_idx; + unsigned userns_fd_idx; + int read_only; + int growfs; + char *password; + ImagePolicy *image_policy; +} MountImageParameters; + +static void mount_image_parameters_done(MountImageParameters *p) { + assert(p); + + p->password = erase_and_free(p->password); + p->image_policy = image_policy_free(p->image_policy); +} + +static int validate_image_fd(int fd, MountImageParameters *p) { + int r, fl; + + assert(fd >= 0); + assert(p); + + r = fd_verify_regular(fd); + if (r < 0) + return r; + + fl = fd_verify_safe_flags(fd); + if (fl < 0) + return log_debug_errno(fl, "Image file descriptor has unsafe flags set: %m"); + + switch (fl & O_ACCMODE) { + + case O_RDONLY: + p->read_only = true; + break; + + case O_RDWR: + break; + + default: + return -EBADF; + } + + return 0; +} + +static int verify_trusted_image_fd_by_path(int fd) { + _cleanup_free_ char *p = NULL; + struct stat sta; + int r; + + assert(fd >= 0); + + r = secure_getenv_bool("SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES"); + if (r == -ENXIO) { + if (!DEFAULT_MOUNTFSD_TRUSTED_DIRECTORIES) { + log_debug("Trusted directory mechanism disabled at compile time."); + return false; + } + } else if (r < 0) { + log_debug_errno(r, "Failed to parse $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable, not trusting any image."); + return false; + } else if (!r) { + log_debug("Trusted directory mechanism disabled via $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable."); + return false; + } + + r = fd_get_path(fd, &p); + if (r < 0) + return log_debug_errno(r, "Failed to get path of passed image file descriptor: %m"); + if (fstat(fd, &sta) < 0) + return log_debug_errno(errno, "Failed to stat() passed image file descriptor: %m"); + + log_debug("Checking if image '%s' is in trusted directories.", p); + + for (ImageClass c = 0; c < _IMAGE_CLASS_MAX; c++) + NULSTR_FOREACH(s, image_search_path[c]) { + _cleanup_close_ int dir_fd = -EBADF, inode_fd = -EBADF; + _cleanup_free_ char *q = NULL; + struct stat stb; + const char *e; + + r = chase(s, NULL, CHASE_SAFE, &q, &dir_fd); + if (r == -ENOENT) + continue; + if (r < 0) { + log_warning_errno(r, "Failed to resolve search path '%s', ignoring: %m", s); + continue; + } + + /* Check that the inode refers to a file immediately inside the image directory, + * i.e. not the image directory itself, and nothing further down the tree */ + e = path_startswith(p, q); + if (isempty(e)) + continue; + + e += strspn(e, "/"); + if (!filename_is_valid(e)) + continue; + + r = chaseat(dir_fd, e, CHASE_SAFE, NULL, &inode_fd); + if (r < 0) + return log_error_errno(r, "Couldn't verify that specified image '%s' is in search path '%s': %m", p, s); + + if (fstat(inode_fd, &stb) < 0) + return log_error_errno(errno, "Failed to stat image file '%s/%s': %m", q, e); + + if (stat_inode_same(&sta, &stb)) { + log_debug("Image '%s' is *in* trusted directories.", p); + return true; /* Yay */ + } + } + + log_debug("Image '%s' is *not* in trusted directories.", p); + return false; +} + +static int determine_image_policy( + int image_fd, + bool trusted, + ImagePolicy *client_policy, + ImagePolicy **ret) { + + _cleanup_(image_policy_freep) ImagePolicy *envvar_policy = NULL; + const ImagePolicy *default_policy; + const char *envvar, *e; + int r; + + assert(image_fd >= 0); + assert(ret); + + if (trusted) { + envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_TRUSTED"; + default_policy = &image_policy_allow; + } else { + envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_UNTRUSTED"; + default_policy = &image_policy_untrusted; + } + + e = secure_getenv(envvar); + if (e) { + r = image_policy_from_string(e, &envvar_policy); + if (r < 0) + return log_error_errno(r, "Failed to parse image policy supplied via $%s: %m", envvar); + + default_policy = envvar_policy; + } + + return image_policy_intersect(default_policy, client_policy, ret); +} + +static int validate_userns(Varlink *link, int *userns_fd) { + int r; + + assert(link); + assert(userns_fd); + + if (*userns_fd < 0) + return 0; + + r = fd_verify_safe_flags(*userns_fd); + if (r < 0) + return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m"); + + r = fd_is_ns(*userns_fd, CLONE_NEWUSER); + if (r < 0) + return r; + if (r == 0) + return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor"); + + /* Our own host user namespace? Then close the fd, and handle it as if none was specified. */ + r = is_our_namespace(*userns_fd, NAMESPACE_USER); + if (r < 0) + return log_debug_errno(r, "Failed to determine if user namespace provided by client is our own."); + if (r > 0) { + log_debug("User namespace provided by client is our own."); + *userns_fd = safe_close(*userns_fd); + } + + return 0; +} + +static int vl_method_mount_image( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "imageFileDescriptor", JSON_VARIANT_UNSIGNED, json_dispatch_uint, offsetof(MountImageParameters, image_fd_idx), JSON_MANDATORY }, + { "userNamespaceFileDescriptor", JSON_VARIANT_UNSIGNED, json_dispatch_uint, offsetof(MountImageParameters, userns_fd_idx), 0 }, + { "readOnly", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(MountImageParameters, read_only), 0 }, + { "growFileSystems", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(MountImageParameters, growfs), 0 }, + { "password", JSON_VARIANT_STRING, json_dispatch_string, offsetof(MountImageParameters, password), 0 }, + { "imagePolicy", JSON_VARIANT_STRING, json_dispatch_image_policy, offsetof(MountImageParameters, image_policy), 0 }, + VARLINK_DISPATCH_POLKIT_FIELD, + {} + }; + + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; + _cleanup_(mount_image_parameters_done) MountImageParameters p = { + .image_fd_idx = UINT_MAX, + .userns_fd_idx = UINT_MAX, + .read_only = -1, + .growfs = -1, + }; + _cleanup_(dissected_image_unrefp) DissectedImage *di = NULL; + _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *aj = NULL; + _cleanup_close_ int image_fd = -EBADF, userns_fd = -EBADF; + _cleanup_(image_policy_freep) ImagePolicy *use_policy = NULL; + Hashmap **polkit_registry = ASSERT_PTR(userdata); + _cleanup_free_ char *ps = NULL; + bool image_is_trusted = false; + uid_t peer_uid; + int r; + + assert(link); + assert(parameters); + + json_variant_sensitive(parameters); /* might contain passwords */ + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) + return log_debug_errno(r, "Failed to get client UID: %m"); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (p.image_fd_idx != UINT_MAX) { + image_fd = varlink_peek_dup_fd(link, p.image_fd_idx); + if (image_fd < 0) + return log_debug_errno(image_fd, "Failed to peek image fd from client: %m"); + } + + if (p.userns_fd_idx != UINT_MAX) { + userns_fd = varlink_peek_dup_fd(link, p.userns_fd_idx); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m"); + } + + r = validate_image_fd(image_fd, &p); + if (r < 0) + return r; + + r = validate_userns(link, &userns_fd); + if (r != 0) + return r; + + r = verify_trusted_image_fd_by_path(image_fd); + if (r < 0) + return r; + image_is_trusted = r; + + const char *polkit_details[] = { + "read_only", one_zero(p.read_only > 0), + NULL, + }; + + const char *polkit_action, *polkit_untrusted_action; + PolkitFlags polkit_flags; + if (userns_fd < 0) { + /* Mount into the host user namespace */ + polkit_action = "io.systemd.mount-file-system.mount-image"; + polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image"; + polkit_flags = 0; + } else { + /* Mount into a private user namespace */ + polkit_action = "io.systemd.mount-file-system.mount-image-privately"; + polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image-privately"; + + /* If polkit is not around, let's allow mounting authenticated images by default */ + polkit_flags = POLKIT_DEFAULT_ALLOW; + } + + /* Let's definitely acquire the regular action privilege, for mounting properly signed images */ + r = varlink_verify_polkit_async_full( + link, + /* bus= */ NULL, + polkit_action, + polkit_details, + /* good_user= */ UID_INVALID, + polkit_flags, + polkit_registry); + if (r <= 0) + return r; + + /* Generate the common dissection directory here. We are not going to use it, but the clients might, + * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it + * here, if it is missing. */ + r = get_common_dissect_directory(NULL); + if (r < 0) + return r; + + r = loop_device_make( + image_fd, + p.read_only == 0 ? O_RDONLY : O_RDWR, + 0, + UINT64_MAX, + UINT32_MAX, + LO_FLAGS_PARTSCAN, + LOCK_EX, + &loop); + if (r < 0) + return r; + + DissectImageFlags dissect_flags = + (p.read_only == 0 ? DISSECT_IMAGE_READ_ONLY : 0) | + (p.growfs != 0 ? DISSECT_IMAGE_GROWFS : 0) | + DISSECT_IMAGE_DISCARD_ANY | + DISSECT_IMAGE_FSCK | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES | + DISSECT_IMAGE_ALLOW_USERSPACE_VERITY; + + /* Let's see if we have acquired the privilege to mount untrusted images already */ + bool polkit_have_untrusted_action = + varlink_has_polkit_action(link, polkit_untrusted_action, polkit_details, polkit_registry); + + for (;;) { + use_policy = image_policy_free(use_policy); + ps = mfree(ps); + + /* We use the image policy for trusted images if either the path is below a trusted + * directory, or if we have already acquired a PK authentication that tells us that untrusted + * images are OK */ + bool use_trusted_policy = + image_is_trusted || + polkit_have_untrusted_action; + + r = determine_image_policy( + image_fd, + use_trusted_policy, + p.image_policy, + &use_policy); + if (r < 0) + return r; + + r = image_policy_to_string(use_policy, /* simplify= */ true, &ps); + if (r < 0) + return r; + + log_debug("Using image policy: %s", ps); + + r = dissect_loop_device( + loop, + &verity, + /* mount_options= */ NULL, + use_policy, + dissect_flags, + &di); + if (r == -ENOPKG) + return varlink_error(link, "io.systemd.MountFileSystem.IncompatibleImage", NULL); + if (r == -ENOTUNIQ) + return varlink_error(link, "io.systemd.MountFileSystem.MultipleRootPartitionsFound", NULL); + if (r == -ENXIO) + return varlink_error(link, "io.systemd.MountFileSystem.RootPartitionNotFound", NULL); + if (r == -ERFKILL) { + /* The image policy refused this, let's retry after trying to get PolicyKit */ + + if (!polkit_have_untrusted_action) { + log_debug("Denied by image policy. Trying a stronger polkit authentication before continuing."); + r = varlink_verify_polkit_async_full( + link, + /* bus= */ NULL, + polkit_untrusted_action, + polkit_details, + /* good_user= */ UID_INVALID, + /* flags= */ 0, /* NB: the image cannot be authenticated, hence unless PK is around to allow this anyway, fail! */ + polkit_registry); + if (r <= 0 && !ERRNO_IS_NEG_PRIVILEGE(r)) + return r; + if (r > 0) { + /* Try again, now that we know the client has enough privileges. */ + log_debug("Denied by image policy, retrying after polkit authentication."); + polkit_have_untrusted_action = true; + continue; + } + } + + return varlink_error(link, "io.systemd.MountFileSystem.DeniedByImagePolicy", NULL); + } + if (r < 0) + return r; + + /* Success */ + break; + } + + r = dissected_image_load_verity_sig_partition( + di, + loop->fd, + &verity); + if (r < 0) + return r; + + r = dissected_image_decrypt( + di, + p.password, + &verity, + dissect_flags); + if (r == -ENOKEY) /* new dm-verity userspace returns ENOKEY if the dm-verity signature key is not in + * key chain. That's great. */ + return varlink_error(link, "io.systemd.MountFileSystem.KeyNotFound", NULL); + if (r == -EBUSY) /* DM kernel subsystem is shit with returning useful errors hence we keep retrying + * under the assumption that some errors are transitional. Which the errors might + * not actually be. After all retries failed we return EBUSY. Let's turn that into a + * generic Verity error. It's not very helpful, could mean anything, but at least it + * gives client a clear idea that this has to do with Verity. */ + return varlink_error(link, "io.systemd.MountFileSystem.VerityFailure", NULL); + if (r < 0) + return r; + + r = dissected_image_mount( + di, + /* where= */ NULL, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + userns_fd, + dissect_flags); + if (r < 0) + return r; + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + _cleanup_(json_variant_unrefp) JsonVariant *pj = NULL; + DissectedPartition *pp = di->partitions + d; + int fd_idx; + + if (!pp->found) + continue; + + if (pp->fsmount_fd < 0) + continue; + + if (userns_fd >= 0) { + r = nsresource_add_mount(userns_fd, pp->fsmount_fd); + if (r < 0) + return r; + } + + fd_idx = varlink_push_fd(link, pp->fsmount_fd); + if (fd_idx < 0) + return fd_idx; + + TAKE_FD(pp->fsmount_fd); + + r = json_build(&pj, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("designator", JSON_BUILD_STRING(partition_designator_to_string(d))), + JSON_BUILD_PAIR("writable", JSON_BUILD_BOOLEAN(pp->rw)), + JSON_BUILD_PAIR("growFileSystem", JSON_BUILD_BOOLEAN(pp->growfs)), + JSON_BUILD_PAIR_CONDITION(pp->partno > 0, "partitionNumber", JSON_BUILD_INTEGER(pp->partno)), + JSON_BUILD_PAIR_CONDITION(pp->architecture > 0, "architecture", JSON_BUILD_STRING(architecture_to_string(pp->architecture))), + JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(pp->uuid), "partitionUuid", JSON_BUILD_UUID(pp->uuid)), + JSON_BUILD_PAIR("fileSystemType", JSON_BUILD_STRING(dissected_partition_fstype(pp))), + JSON_BUILD_PAIR_CONDITION(pp->label, "partitionLabel", JSON_BUILD_STRING(pp->label)), + JSON_BUILD_PAIR("size", JSON_BUILD_INTEGER(pp->size)), + JSON_BUILD_PAIR("offset", JSON_BUILD_INTEGER(pp->offset)), + JSON_BUILD_PAIR("mountFileDescriptor", JSON_BUILD_INTEGER(fd_idx)))); + if (r < 0) + return r; + + r = json_variant_append_array(&aj, pj); + if (r < 0) + return r; + } + + loop_device_relinquish(loop); + + r = varlink_replyb(link, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("partitions", JSON_BUILD_VARIANT(aj)), + JSON_BUILD_PAIR("imagePolicy", JSON_BUILD_STRING(ps)), + JSON_BUILD_PAIR("imageSize", JSON_BUILD_INTEGER(di->image_size)), + JSON_BUILD_PAIR("sectorSize", JSON_BUILD_INTEGER(di->sector_size)), + JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(di->image_uuid), "imageUuid", JSON_BUILD_UUID(di->image_uuid)))); + if (r < 0) + return r; + + return r; +} + +static int process_connection(VarlinkServer *server, int _fd) { + _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */ + _cleanup_(varlink_close_unrefp) Varlink *vl = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int r; + + r = sd_event_new(&event); + if (r < 0) + return r; + + r = varlink_server_attach_event(server, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach Varlink server to event loop: %m"); + + r = varlink_server_add_connection(server, fd, &vl); + if (r < 0) + return log_error_errno(r, "Failed to add connection: %m"); + + TAKE_FD(fd); + vl = varlink_ref(vl); + + r = varlink_set_allow_fd_passing_input(vl, true); + if (r < 0) + return log_error_errno(r, "Failed to enable fd passing for read: %m"); + + r = varlink_set_allow_fd_passing_output(vl, true); + if (r < 0) + return log_error_errno(r, "Failed to enable fd passing for write: %m"); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + r = varlink_server_detach_event(server); + if (r < 0) + return log_error_errno(r, "Failed to detach Varlink server from event loop: %m"); + + return 0; +} + +static int run(int argc, char *argv[]) { + usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY; + _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL; + _cleanup_(hashmap_freep) Hashmap *polkit_registry = NULL; + _cleanup_(pidref_done) PidRef parent = PIDREF_NULL; + unsigned n_iterations = 0; + int m, listen_fd, r; + + log_setup(); + + m = sd_listen_fds(false); + if (m < 0) + return log_error_errno(m, "Failed to determine number of listening fds: %m"); + if (m == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received."); + if (m > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time."); + + listen_fd = SD_LISTEN_FDS_START; + + r = fd_nonblock(listen_fd, false); + if (r < 0) + return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m"); + + r = varlink_server_new(&server, VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_error_errno(r, "Failed to allocate server: %m"); + + r = varlink_server_add_interface(server, &vl_interface_io_systemd_MountFileSystem); + if (r < 0) + return log_error_errno(r, "Failed to add MountFileSystem interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + server, + "io.systemd.MountFileSystem.MountImage", vl_method_mount_image); + if (r < 0) + return log_error_errno(r, "Failed to bind methods: %m"); + + varlink_server_set_userdata(server, &polkit_registry); + + r = varlink_server_set_exit_on_idle(server, true); + if (r < 0) + return log_error_errno(r, "Failed to enable exit-on-idle mode: %m"); + + r = getenv_bool("MOUNTFS_FIXED_WORKER"); + if (r < 0) + return log_error_errno(r, "Failed to parse MOUNTFSD_FIXED_WORKER: %m"); + listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC; + + r = pidref_set_parent(&parent); + if (r < 0) + return log_error_errno(r, "Failed to acquire pidfd of parent process: %m"); + + start_time = now(CLOCK_MONOTONIC); + + for (;;) { + _cleanup_close_ int fd = -EBADF; + usec_t n; + + /* Exit the worker in regular intervals, to flush out all memory use */ + if (n_iterations++ > ITERATIONS_MAX) { + log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations); + break; + } + + n = now(CLOCK_MONOTONIC); + if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) { + log_debug("Exiting worker, ran for %s, that's enough.", + FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0)); + break; + } + + if (last_busy_usec == USEC_INFINITY) + last_busy_usec = n; + else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) { + log_debug("Exiting worker, been idle for %s.", + FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0)); + break; + } + + (void) rename_process("systemd-mountwork: waiting..."); + fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC)); + (void) rename_process("systemd-mountwork: processing..."); + + if (fd == -EAGAIN) + continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected + * after a while, let's check if it's time to exit though. */ + if (fd == -EINTR) + continue; /* Might be that somebody attached via strace, let's just continue in that + * case */ + if (fd < 0) + return log_error_errno(fd, "Failed to accept() from listening socket: %m"); + + if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) { + /* We only slept a very short time? If so, let's see if there are more sockets + * pending, and if so, let's ask our parent for more workers */ + + r = fd_wait_for_event(listen_fd, POLLIN, 0); + if (r < 0) + return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m"); + + if (FLAGS_SET(r, POLLIN)) { + r = pidref_kill(&parent, SIGUSR2); + if (r == -ESRCH) + return log_error_errno(r, "Parent already died?"); + if (r < 0) + return log_error_errno(r, "Failed to send SIGUSR2 signal to parent. %m"); + } + } + + (void) process_connection(server, TAKE_FD(fd)); + last_busy_usec = USEC_INFINITY; + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); |