diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 13:00:47 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 13:00:47 +0000 |
commit | 2cb7e0aaedad73b076ea18c6900b0e86c5760d79 (patch) | |
tree | da68ca54bb79f4080079bf0828acda937593a4e1 /src/core | |
parent | Initial commit. (diff) | |
download | systemd-2cb7e0aaedad73b076ea18c6900b0e86c5760d79.tar.xz systemd-2cb7e0aaedad73b076ea18c6900b0e86c5760d79.zip |
Adding upstream version 247.3. (tags: upstream/247.3, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/core')
139 files changed, 81231 insertions, 0 deletions
diff --git a/src/core/all-units.h b/src/core/all-units.h new file mode 100644 index 0000000..fad814b --- /dev/null +++ b/src/core/all-units.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +#include "automount.h" +#include "device.h" +#include "path.h" +#include "scope.h" +#include "service.h" +#include "slice.h" +#include "socket.h" +#include "swap.h" +#include "target.h" +#include "timer.h" diff --git a/src/core/apparmor-setup.c b/src/core/apparmor-setup.c new file mode 100644 index 0000000..e856f5c --- /dev/null +++ b/src/core/apparmor-setup.c @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#if HAVE_APPARMOR +# include <sys/apparmor.h> +#endif +#include <unistd.h> + +#include "apparmor-setup.h" +#include "apparmor-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" + +#if HAVE_APPARMOR +DEFINE_TRIVIAL_CLEANUP_FUNC(aa_policy_cache *, aa_policy_cache_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(aa_features *, aa_features_unref); +#endif + +int mac_apparmor_setup(void) { +#if HAVE_APPARMOR + int r; + _cleanup_(aa_policy_cache_unrefp) aa_policy_cache *policy_cache = NULL; + _cleanup_(aa_features_unrefp) aa_features *features = NULL; + const char *current_file; + _cleanup_free_ char *current_profile = NULL, *cache_dir_path = NULL; + + if (!mac_apparmor_use()) { + log_debug("AppArmor either not supported by the kernel or disabled."); + return 0; + } + + /* To enable LSM stacking a patch to the kernel is proposed to create a + * per-LSM subdirectory to distinguish between the LSMs. Therefore, we + * read the file from the LSM specific directory first and only if that + * fails the one from the generic directory. 
+ */ + FOREACH_STRING(current_file, "/proc/self/attr/apparmor/current", "/proc/self/attr/current") { + r = read_one_line_file(current_file, &current_profile); + if (r == -ENOENT) + continue; + else if (r < 0) + log_warning_errno(r, "Failed to read current AppArmor profile from file %s, ignoring: %m", current_file); + else + break; + } + if (!current_profile) { + log_warning("Failed to get the current AppArmor profile of systemd from /proc/self/attr/apparmor/current or /proc/self/attr/current, ignoring."); + return 0; + } + if (!streq(current_profile, "unconfined")) { + log_debug("We are already confined in an AppArmor profile."); + return 0; + } + + r = aa_features_new_from_kernel(&features); + if (r < 0) { + log_warning_errno(errno, "Failed to get the AppArmor feature set from the kernel, ignoring: %m"); + return 0; + } + cache_dir_path = aa_policy_cache_dir_path_preview(features, AT_FDCWD, "/etc/apparmor/earlypolicy"); + if (!cache_dir_path) { + log_debug_errno(errno, "Failed to get the path of the early AppArmor policy cache directory."); + return 0; + } + + /* aa_policy_cache_new will internally use the same path as aa_policy_cache_dir_path_preview has returned. 
*/ + r = aa_policy_cache_new(&policy_cache, features, AT_FDCWD, "/etc/apparmor/earlypolicy", 0); + if (r < 0) { + if (errno == ENOENT) { + log_debug_errno(errno, "The early AppArmor policy cache directory %s does not exist.", cache_dir_path); + return 0; + } + log_warning_errno(errno, "Failed to create a new AppArmor policy cache, ignoring: %m"); + return 0; + } + r = aa_policy_cache_replace_all(policy_cache, NULL); + if (r < 0) { + log_warning_errno(errno, "Failed to load the profiles from the early AppArmor policy cache directory %s, ignoring: %m", cache_dir_path); + return 0; + } + + log_info("Successfully loaded all binary profiles from AppArmor early policy cache at %s.", cache_dir_path); + + r = aa_change_profile("systemd"); + if (r < 0) { + if (errno == ENOENT) + log_debug_errno(errno, "Failed to change to AppArmor profile 'systemd'. Please ensure that one of the binary profile files in policy cache directory %s contains a profile with that name.", cache_dir_path); + else + log_error_errno(errno, "Failed to change to AppArmor profile 'systemd': %m"); + return 0; + } + + log_info("Changed to AppArmor profile systemd."); +#endif + return 0; +} diff --git a/src/core/apparmor-setup.h b/src/core/apparmor-setup.h new file mode 100644 index 0000000..f3b7382 --- /dev/null +++ b/src/core/apparmor-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int mac_apparmor_setup(void); diff --git a/src/core/audit-fd.c b/src/core/audit-fd.c new file mode 100644 index 0000000..097bea3 --- /dev/null +++ b/src/core/audit-fd.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> + +#include "audit-fd.h" + +#if HAVE_AUDIT + +#include <libaudit.h> +#include <stdbool.h> + +#include "capability-util.h" +#include "fd-util.h" +#include "log.h" +#include "util.h" + +static bool initialized = false; +static int audit_fd; + +int get_audit_fd(void) { + + if (!initialized) { + if (have_effective_cap(CAP_AUDIT_WRITE) 
== 0) { + audit_fd = -EPERM; + initialized = true; + + return audit_fd; + } + + audit_fd = audit_open(); + + if (audit_fd < 0) { + if (!IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT)) + log_error_errno(errno, "Failed to connect to audit log: %m"); + + audit_fd = errno ? -errno : -EINVAL; + } + + initialized = true; + } + + return audit_fd; +} + +void close_audit_fd(void) { + + if (initialized && audit_fd >= 0) + safe_close(audit_fd); + + initialized = true; + audit_fd = -ECONNRESET; +} + +#else + +int get_audit_fd(void) { + return -EAFNOSUPPORT; +} + +void close_audit_fd(void) { +} + +#endif diff --git a/src/core/audit-fd.h b/src/core/audit-fd.h new file mode 100644 index 0000000..5cdf61e --- /dev/null +++ b/src/core/audit-fd.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int get_audit_fd(void); +void close_audit_fd(void); diff --git a/src/core/automount.c b/src/core/automount.c new file mode 100644 index 0000000..a84cddb --- /dev/null +++ b/src/core/automount.c @@ -0,0 +1,1135 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/auto_dev-ioctl.h> +#include <linux/auto_fs4.h> +#include <sys/epoll.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "async.h" +#include "automount.h" +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-unit.h" +#include "fd-util.h" +#include "format-util.h" +#include "io-util.h" +#include "label.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = { + 
[AUTOMOUNT_DEAD] = UNIT_INACTIVE, + [AUTOMOUNT_WAITING] = UNIT_ACTIVE, + [AUTOMOUNT_RUNNING] = UNIT_ACTIVE, + [AUTOMOUNT_FAILED] = UNIT_FAILED +}; + +struct expire_data { + int dev_autofs_fd; + int ioctl_fd; +}; + +static void expire_data_free(struct expire_data *data) { + if (!data) + return; + + safe_close(data->dev_autofs_fd); + safe_close(data->ioctl_fd); + free(data); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free); + +static int open_dev_autofs(Manager *m); +static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata); +static int automount_start_expire(Automount *a); +static void automount_stop_expire(Automount *a); +static int automount_send_ready(Automount *a, Set *tokens, int status); + +static void automount_init(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + a->pipe_fd = -1; + a->directory_mode = 0755; + UNIT(a)->ignore_on_isolate = true; +} + +static void unmount_autofs(Automount *a) { + int r; + + assert(a); + + if (a->pipe_fd < 0) + return; + + a->pipe_event_source = sd_event_source_unref(a->pipe_event_source); + a->pipe_fd = safe_close(a->pipe_fd); + + /* If we reload/reexecute things we keep the mount point around */ + if (!IN_SET(UNIT(a)->manager->objective, MANAGER_RELOAD, MANAGER_REEXECUTE)) { + + automount_send_ready(a, a->tokens, -EHOSTDOWN); + automount_send_ready(a, a->expire_tokens, -EHOSTDOWN); + + if (a->where) { + r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + log_error_errno(r, "Failed to unmount: %m"); + } + } +} + +static void automount_done(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + unmount_autofs(a); + + a->where = mfree(a->where); + + a->tokens = set_free(a->tokens); + a->expire_tokens = set_free(a->expire_tokens); + + a->expire_event_source = sd_event_source_unref(a->expire_event_source); +} + +static int automount_add_trigger_dependencies(Automount *a) { + Unit *x; + int r; + + 
assert(a); + + r = unit_load_related_unit(UNIT(a), ".mount", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(a), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int automount_add_mount_dependencies(Automount *a) { + _cleanup_free_ char *parent = NULL; + + assert(a); + + parent = dirname_malloc(a->where); + if (!parent) + return -ENOMEM; + + return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT); +} + +static int automount_add_default_dependencies(Automount *a) { + int r; + + assert(a); + + if (!UNIT(a)->default_dependencies) + return 0; + + if (!MANAGER_IS_SYSTEM(UNIT(a)->manager)) + return 0; + + r = unit_add_dependency_by_name(UNIT(a), UNIT_BEFORE, SPECIAL_LOCAL_FS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + r = unit_add_dependency_by_name(UNIT(a), UNIT_AFTER, SPECIAL_LOCAL_FS_PRE_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(UNIT(a), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int automount_verify(Automount *a) { + _cleanup_free_ char *e = NULL; + int r; + + assert(a); + assert(UNIT(a)->load_state == UNIT_LOADED); + + if (path_equal(a->where, "/")) { + log_unit_error(UNIT(a), "Cannot have an automount unit for the root directory. Refusing."); + return -ENOEXEC; + } + + r = unit_name_from_path(a->where, ".automount", &e); + if (r < 0) + return log_unit_error_errno(UNIT(a), r, "Failed to generate unit name from path: %m"); + + if (!unit_has_name(UNIT(a), e)) { + log_unit_error(UNIT(a), "Where= setting doesn't match unit name. 
Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int automount_set_where(Automount *a) { + int r; + + assert(a); + + if (a->where) + return 0; + + r = unit_name_to_path(UNIT(a)->id, &a->where); + if (r < 0) + return r; + + path_simplify(a->where, false); + return 1; +} + +static int automount_add_extras(Automount *a) { + int r; + + r = automount_set_where(a); + if (r < 0) + return r; + + r = automount_add_trigger_dependencies(a); + if (r < 0) + return r; + + r = automount_add_mount_dependencies(a); + if (r < 0) + return r; + + return automount_add_default_dependencies(a); +} + +static int automount_load(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + /* Load a .automount file */ + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + r = automount_add_extras(a); + if (r < 0) + return r; + + return automount_verify(a); +} + +static void automount_set_state(Automount *a, AutomountState state) { + AutomountState old_state; + assert(a); + + if (a->state != state) + bus_unit_send_pending_change_signal(UNIT(a), false); + + old_state = a->state; + a->state = state; + + if (state != AUTOMOUNT_RUNNING) + automount_stop_expire(a); + + if (!IN_SET(state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) + unmount_autofs(a); + + if (state != old_state) + log_unit_debug(UNIT(a), "Changed %s -> %s", automount_state_to_string(old_state), automount_state_to_string(state)); + + unit_notify(UNIT(a), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int automount_coldplug(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(a->state == AUTOMOUNT_DEAD); + + if (a->deserialized_state == a->state) + return 0; + + if (IN_SET(a->deserialized_state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) { + + r = automount_set_where(a); + if (r < 0) + return r; + + r = open_dev_autofs(u->manager); + if (r < 0) + return 
r; + + assert(a->pipe_fd >= 0); + + r = sd_event_add_io(u->manager->event, &a->pipe_event_source, a->pipe_fd, EPOLLIN, automount_dispatch_io, u); + if (r < 0) + return r; + + (void) sd_event_source_set_description(a->pipe_event_source, "automount-io"); + if (a->deserialized_state == AUTOMOUNT_RUNNING) { + r = automount_start_expire(a); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m"); + } + + automount_set_state(a, a->deserialized_state); + } + + return 0; +} + +static void automount_dump(Unit *u, FILE *f, const char *prefix) { + char time_string[FORMAT_TIMESPAN_MAX]; + Automount *a = AUTOMOUNT(u); + + assert(a); + + fprintf(f, + "%sAutomount State: %s\n" + "%sResult: %s\n" + "%sWhere: %s\n" + "%sDirectoryMode: %04o\n" + "%sTimeoutIdleUSec: %s\n", + prefix, automount_state_to_string(a->state), + prefix, automount_result_to_string(a->result), + prefix, a->where, + prefix, a->directory_mode, + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, a->timeout_idle_usec, USEC_PER_SEC)); +} + +static void automount_enter_dead(Automount *a, AutomountResult f) { + assert(a); + + if (a->result == AUTOMOUNT_SUCCESS) + a->result = f; + + unit_log_result(UNIT(a), a->result == AUTOMOUNT_SUCCESS, automount_result_to_string(a->result)); + automount_set_state(a, a->result != AUTOMOUNT_SUCCESS ? 
AUTOMOUNT_FAILED : AUTOMOUNT_DEAD); +} + +static int open_dev_autofs(Manager *m) { + struct autofs_dev_ioctl param; + + assert(m); + + if (m->dev_autofs_fd >= 0) + return m->dev_autofs_fd; + + (void) label_fix("/dev/autofs", 0); + + m->dev_autofs_fd = open("/dev/autofs", O_CLOEXEC|O_RDONLY); + if (m->dev_autofs_fd < 0) + return log_error_errno(errno, "Failed to open /dev/autofs: %m"); + + init_autofs_dev_ioctl(&param); + if (ioctl(m->dev_autofs_fd, AUTOFS_DEV_IOCTL_VERSION, &param) < 0) { + m->dev_autofs_fd = safe_close(m->dev_autofs_fd); + return -errno; + } + + log_debug("Autofs kernel version %i.%i", param.ver_major, param.ver_minor); + + return m->dev_autofs_fd; +} + +static int open_ioctl_fd(int dev_autofs_fd, const char *where, dev_t devid) { + struct autofs_dev_ioctl *param; + size_t l; + + assert(dev_autofs_fd >= 0); + assert(where); + + l = sizeof(struct autofs_dev_ioctl) + strlen(where) + 1; + param = alloca(l); + + init_autofs_dev_ioctl(param); + param->size = l; + param->ioctlfd = -1; + param->openmount.devid = devid; + strcpy(param->path, where); + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0) + return -errno; + + if (param->ioctlfd < 0) + return -EIO; + + (void) fd_cloexec(param->ioctlfd, true); + return param->ioctlfd; +} + +static int autofs_protocol(int dev_autofs_fd, int ioctl_fd) { + uint32_t major, minor; + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(&param); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOVER, &param) < 0) + return -errno; + + major = param.protover.version; + + init_autofs_dev_ioctl(&param); + param.ioctlfd = ioctl_fd; + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOSUBVER, &param) < 0) + return -errno; + + minor = param.protosubver.sub_version; + + log_debug("Autofs protocol version %i.%i", major, minor); + return 0; +} + +static int autofs_set_timeout(int dev_autofs_fd, int ioctl_fd, usec_t usec) { + struct autofs_dev_ioctl 
param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(&param); + param.ioctlfd = ioctl_fd; + + if (usec == USEC_INFINITY) + param.timeout.timeout = 0; + else + /* Convert to seconds, rounding up. */ + param.timeout.timeout = DIV_ROUND_UP(usec, USEC_PER_SEC); + + if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_TIMEOUT, &param) < 0) + return -errno; + + return 0; +} + +static int autofs_send_ready(int dev_autofs_fd, int ioctl_fd, uint32_t token, int status) { + struct autofs_dev_ioctl param; + + assert(dev_autofs_fd >= 0); + assert(ioctl_fd >= 0); + + init_autofs_dev_ioctl(&param); + param.ioctlfd = ioctl_fd; + + if (status != 0) { + param.fail.token = token; + param.fail.status = status; + } else + param.ready.token = token; + + if (ioctl(dev_autofs_fd, status ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, &param) < 0) + return -errno; + + return 0; +} + +static int automount_send_ready(Automount *a, Set *tokens, int status) { + _cleanup_close_ int ioctl_fd = -1; + unsigned token; + int r; + + assert(a); + assert(status <= 0); + + if (set_isempty(tokens)) + return 0; + + ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (ioctl_fd < 0) + return ioctl_fd; + + if (status != 0) + log_unit_debug_errno(UNIT(a), status, "Sending failure: %m"); + else + log_unit_debug(UNIT(a), "Sending success."); + + r = 0; + + /* Autofs thankfully does not hand out 0 as a token */ + while ((token = PTR_TO_UINT(set_steal_first(tokens)))) { + int k; + + /* Autofs fun fact: + * + * if you pass a positive status code here, kernels + * prior to 4.12 will freeze! Yay! 
*/ + + k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd, + ioctl_fd, + token, + status); + if (k < 0) + r = k; + } + + return r; +} + +static void automount_trigger_notify(Unit *u, Unit *other) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(other); + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + assert(other->type == UNIT_MOUNT); + + /* Don't propagate state changes from the mount if we are already down */ + if (!IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + automount_enter_dead(a, AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + /* The mount is successfully established */ + if (IN_SET(MOUNT(other)->state, MOUNT_MOUNTED, MOUNT_REMOUNTING)) { + (void) automount_send_ready(a, a->tokens, 0); + + r = automount_start_expire(a); + if (r < 0) + log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m"); + + automount_set_state(a, AUTOMOUNT_RUNNING); + } + + if (IN_SET(MOUNT(other)->state, + MOUNT_MOUNTING, MOUNT_MOUNTING_DONE, + MOUNT_MOUNTED, MOUNT_REMOUNTING, + MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL, + MOUNT_FAILED)) + (void) automount_send_ready(a, a->expire_tokens, -ENODEV); + + if (MOUNT(other)->state == MOUNT_DEAD) + (void) automount_send_ready(a, a->expire_tokens, 0); + + /* The mount is in some unhappy state now, let's unfreeze any waiting clients */ + if (IN_SET(MOUNT(other)->state, + MOUNT_DEAD, MOUNT_UNMOUNTING, + MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL, + MOUNT_FAILED)) { + + (void) automount_send_ready(a, a->tokens, -ENODEV); + + automount_set_state(a, AUTOMOUNT_WAITING); + } +} + +static void automount_enter_waiting(Automount *a) 
{ + _cleanup_close_ int ioctl_fd = -1; + int p[2] = { -1, -1 }; + char name[STRLEN("systemd-") + DECIMAL_STR_MAX(pid_t) + 1]; + char options[STRLEN("fd=,pgrp=,minproto=5,maxproto=5,direct") + + DECIMAL_STR_MAX(int) + DECIMAL_STR_MAX(gid_t) + 1]; + bool mounted = false; + int r, dev_autofs_fd; + struct stat st; + + assert(a); + assert(a->pipe_fd < 0); + assert(a->where); + + set_clear(a->tokens); + + r = unit_fail_if_noncanonical(UNIT(a), a->where); + if (r < 0) + goto fail; + + (void) mkdir_p_label(a->where, a->directory_mode); + + unit_warn_if_dir_nonempty(UNIT(a), a->where); + + dev_autofs_fd = open_dev_autofs(UNIT(a)->manager); + if (dev_autofs_fd < 0) { + r = dev_autofs_fd; + goto fail; + } + + if (pipe2(p, O_CLOEXEC) < 0) { + r = -errno; + goto fail; + } + r = fd_nonblock(p[0], true); + if (r < 0) + goto fail; + + xsprintf(options, "fd=%i,pgrp="PID_FMT",minproto=5,maxproto=5,direct", p[1], getpgrp()); + xsprintf(name, "systemd-"PID_FMT, getpid_cached()); + r = mount_nofollow(name, a->where, "autofs", 0, options); + if (r < 0) + goto fail; + + mounted = true; + + p[1] = safe_close(p[1]); + + if (stat(a->where, &st) < 0) { + r = -errno; + goto fail; + } + + ioctl_fd = open_ioctl_fd(dev_autofs_fd, a->where, st.st_dev); + if (ioctl_fd < 0) { + r = ioctl_fd; + goto fail; + } + + r = autofs_protocol(dev_autofs_fd, ioctl_fd); + if (r < 0) + goto fail; + + r = autofs_set_timeout(dev_autofs_fd, ioctl_fd, a->timeout_idle_usec); + if (r < 0) + goto fail; + + r = sd_event_add_io(UNIT(a)->manager->event, &a->pipe_event_source, p[0], EPOLLIN, automount_dispatch_io, a); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(a->pipe_event_source, "automount-io"); + + a->pipe_fd = p[0]; + a->dev_id = st.st_dev; + + automount_set_state(a, AUTOMOUNT_WAITING); + + return; + +fail: + log_unit_error_errno(UNIT(a), r, "Failed to initialize automounter: %m"); + + safe_close_pair(p); + + if (mounted) { + r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 
0) + log_error_errno(r, "Failed to unmount, ignoring: %m"); + } + + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); +} + +static void *expire_thread(void *p) { + struct autofs_dev_ioctl param; + _cleanup_(expire_data_freep) struct expire_data *data = (struct expire_data*)p; + int r; + + assert(data->dev_autofs_fd >= 0); + assert(data->ioctl_fd >= 0); + + init_autofs_dev_ioctl(&param); + param.ioctlfd = data->ioctl_fd; + + do { + r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param); + } while (r >= 0); + + if (errno != EAGAIN) + log_warning_errno(errno, "Failed to expire automount, ignoring: %m"); + + return NULL; +} + +static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) { + Automount *a = AUTOMOUNT(userdata); + _cleanup_(expire_data_freep) struct expire_data *data = NULL; + int r; + + assert(a); + assert(source == a->expire_event_source); + + data = new0(struct expire_data, 1); + if (!data) + return log_oom(); + + data->ioctl_fd = -1; + + data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3); + if (data->dev_autofs_fd < 0) + return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m"); + + data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); + if (data->ioctl_fd < 0) + return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m"); + + r = asynchronous_job(expire_thread, data); + if (r < 0) + return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m"); + + data = NULL; + + return automount_start_expire(a); +} + +static int automount_start_expire(Automount *a) { + usec_t timeout; + int r; + + assert(a); + + if (a->timeout_idle_usec == 0) + return 0; + + timeout = MAX(a->timeout_idle_usec/3, USEC_PER_SEC); + + if (a->expire_event_source) { + r = sd_event_source_set_time_relative(a->expire_event_source, timeout); + if (r < 0) + return r; + + return sd_event_source_set_enabled(a->expire_event_source, 
SD_EVENT_ONESHOT); + } + + r = sd_event_add_time_relative( + UNIT(a)->manager->event, + &a->expire_event_source, + CLOCK_MONOTONIC, timeout, 0, + automount_dispatch_expire, a); + if (r < 0) + return r; + + (void) sd_event_source_set_description(a->expire_event_source, "automount-expire"); + + return 0; +} + +static void automount_stop_expire(Automount *a) { + assert(a); + + if (!a->expire_event_source) + return; + + (void) sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_OFF); +} + +static void automount_enter_running(Automount *a) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + struct stat st; + int r; + + assert(a); + + /* If the user masked our unit in the meantime, fail */ + if (UNIT(a)->load_state != UNIT_LOADED) { + log_unit_error(UNIT(a), "Suppressing automount event since unit is no longer loaded."); + goto fail; + } + + /* We don't take mount requests anymore if we are supposed to + * shut down anyway */ + if (unit_stop_pending(UNIT(a))) { + log_unit_debug(UNIT(a), "Suppressing automount request since unit stop is scheduled."); + automount_send_ready(a, a->tokens, -EHOSTDOWN); + automount_send_ready(a, a->expire_tokens, -EHOSTDOWN); + return; + } + + (void) mkdir_p_label(a->where, a->directory_mode); + + /* Before we do anything, let's see if somebody is playing games with us? */ + if (lstat(a->where, &st) < 0) { + log_unit_warning_errno(UNIT(a), errno, "Failed to stat automount point: %m"); + goto fail; + } + + /* The mount unit may have been explicitly started before we got the + * autofs request. Ack it to unblock anything waiting on the mount point. 
*/ + if (!S_ISDIR(st.st_mode) || st.st_dev != a->dev_id) { + log_unit_info(UNIT(a), "Automount point already active?"); + automount_send_ready(a, a->tokens, 0); + return; + } + + trigger = UNIT_TRIGGER(UNIT(a)); + if (!trigger) { + log_unit_error(UNIT(a), "Unit to trigger vanished."); + goto fail; + } + + r = manager_add_job(UNIT(a)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(a), "Failed to queue mount startup job: %s", bus_error_message(&error, r)); + goto fail; + } + + automount_set_state(a, AUTOMOUNT_RUNNING); + return; + +fail: + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); +} + +static int automount_start(Unit *u) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED)); + + if (path_is_mount_point(a->where, NULL, 0) > 0) { + log_unit_error(u, "Path %s is already a mount point, refusing start.", a->where); + return -EEXIST; + } + + r = unit_test_trigger_loaded(u); + if (r < 0) + return r; + + r = unit_test_start_limit(u); + if (r < 0) { + automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + a->result = AUTOMOUNT_SUCCESS; + automount_enter_waiting(a); + return 1; +} + +static int automount_stop(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)); + + automount_enter_dead(a, AUTOMOUNT_SUCCESS); + return 1; +} + +static int automount_serialize(Unit *u, FILE *f, FDSet *fds) { + Automount *a = AUTOMOUNT(u); + void *p; + int r; + + assert(a); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", automount_state_to_string(a->state)); + (void) serialize_item(f, "result", automount_result_to_string(a->result)); + (void) serialize_item_format(f, "dev-id", "%lu", (unsigned long) a->dev_id); + + SET_FOREACH(p, a->tokens) + (void) serialize_item_format(f, "token", "%u", 
PTR_TO_UINT(p)); + SET_FOREACH(p, a->expire_tokens) + (void) serialize_item_format(f, "expire-token", "%u", PTR_TO_UINT(p)); + + r = serialize_fd(f, fds, "pipe-fd", a->pipe_fd); + if (r < 0) + return r; + + return 0; +} + +static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Automount *a = AUTOMOUNT(u); + int r; + + assert(a); + assert(fds); + + if (streq(key, "state")) { + AutomountState state; + + state = automount_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + a->deserialized_state = state; + } else if (streq(key, "result")) { + AutomountResult f; + + f = automount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != AUTOMOUNT_SUCCESS) + a->result = f; + + } else if (streq(key, "dev-id")) { + unsigned long d; + + if (safe_atolu(value, &d) < 0) + log_unit_debug(u, "Failed to parse dev-id value: %s", value); + else + a->dev_id = (dev_t) d; + + } else if (streq(key, "token")) { + unsigned token; + + if (safe_atou(value, &token) < 0) + log_unit_debug(u, "Failed to parse token value: %s", value); + else { + r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(token)); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add token to set: %m"); + } + } else if (streq(key, "expire-token")) { + unsigned token; + + if (safe_atou(value, &token) < 0) + log_unit_debug(u, "Failed to parse token value: %s", value); + else { + r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(token)); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add expire token to set: %m"); + } + } else if (streq(key, "pipe-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse pipe-fd value: %s", value); + else { + safe_close(a->pipe_fd); + a->pipe_fd = fdset_remove(fds, fd); + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + 
+ return 0; +} + +static UnitActiveState automount_active_state(Unit *u) { + assert(u); + + return state_translation_table[AUTOMOUNT(u)->state]; +} + +static const char *automount_sub_state_to_string(Unit *u) { + assert(u); + + return automount_state_to_string(AUTOMOUNT(u)->state); +} + +static bool automount_may_gc(Unit *u) { + Unit *t; + + assert(u); + + t = UNIT_TRIGGER(u); + if (!t) + return true; + + return UNIT_VTABLE(t)->may_gc(t); +} + +static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + union autofs_v5_packet_union packet; + Automount *a = AUTOMOUNT(userdata); + Unit *trigger; + int r; + + assert(a); + assert(fd == a->pipe_fd); + + if (events & (EPOLLHUP|EPOLLERR)) { + log_unit_error(UNIT(a), "Got hangup/error on autofs pipe from kernel. Likely our automount point has been unmounted by someone or something else?"); + automount_enter_dead(a, AUTOMOUNT_FAILURE_UNMOUNTED); + return 0; + } + + if (events != EPOLLIN) { + log_unit_error(UNIT(a), "Got invalid poll event %"PRIu32" on pipe (fd=%d)", events, fd); + goto fail; + } + + r = loop_read_exact(a->pipe_fd, &packet, sizeof(packet), true); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Invalid read from pipe: %m"); + goto fail; + } + + switch (packet.hdr.type) { + + case autofs_ptype_missing_direct: + + if (packet.v5_packet.pid > 0) { + _cleanup_free_ char *p = NULL; + + (void) get_process_comm(packet.v5_packet.pid, &p); + log_unit_info(UNIT(a), "Got automount request for %s, triggered by %"PRIu32" (%s)", a->where, packet.v5_packet.pid, strna(p)); + } else + log_unit_debug(UNIT(a), "Got direct mount request on %s", a->where); + + r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token)); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m"); + goto fail; + } + + automount_enter_running(a); + break; + + case autofs_ptype_expire_direct: + 
log_unit_debug(UNIT(a), "Got direct umount request on %s", a->where); + + automount_stop_expire(a); + + r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token)); + if (r < 0) { + log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m"); + goto fail; + } + + trigger = UNIT_TRIGGER(UNIT(a)); + if (!trigger) { + log_unit_error(UNIT(a), "Unit to trigger vanished."); + goto fail; + } + + r = manager_add_job(UNIT(a)->manager, JOB_STOP, trigger, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) { + log_unit_warning(UNIT(a), "Failed to queue umount startup job: %s", bus_error_message(&error, r)); + goto fail; + } + break; + + default: + log_unit_error(UNIT(a), "Received unknown automount request %i", packet.hdr.type); + break; + } + + return 0; + +fail: + automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); + return 0; +} + +static void automount_shutdown(Manager *m) { + assert(m); + + m->dev_autofs_fd = safe_close(m->dev_autofs_fd); +} + +static void automount_reset_failed(Unit *u) { + Automount *a = AUTOMOUNT(u); + + assert(a); + + if (a->state == AUTOMOUNT_FAILED) + automount_set_state(a, AUTOMOUNT_DEAD); + + a->result = AUTOMOUNT_SUCCESS; +} + +static bool automount_supported(void) { + static int supported = -1; + + if (supported < 0) + supported = access("/dev/autofs", F_OK) >= 0; + + return supported; +} + +static const char* const automount_result_table[_AUTOMOUNT_RESULT_MAX] = { + [AUTOMOUNT_SUCCESS] = "success", + [AUTOMOUNT_FAILURE_RESOURCES] = "resources", + [AUTOMOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT] = "mount-start-limit-hit", + [AUTOMOUNT_FAILURE_UNMOUNTED] = "unmounted", +}; + +DEFINE_STRING_TABLE_LOOKUP(automount_result, AutomountResult); + +const UnitVTable automount_vtable = { + .object_size = sizeof(Automount), + + .sections = + "Unit\0" + "Automount\0" + "Install\0", + .private_section = "Automount", + + .can_transient = true, + .can_fail = true, + 
.can_trigger = true, + + .init = automount_init, + .load = automount_load, + .done = automount_done, + + .coldplug = automount_coldplug, + + .dump = automount_dump, + + .start = automount_start, + .stop = automount_stop, + + .serialize = automount_serialize, + .deserialize_item = automount_deserialize_item, + + .active_state = automount_active_state, + .sub_state_to_string = automount_sub_state_to_string, + + .may_gc = automount_may_gc, + + .trigger_notify = automount_trigger_notify, + + .reset_failed = automount_reset_failed, + + .bus_set_property = bus_automount_set_property, + + .shutdown = automount_shutdown, + .supported = automount_supported, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Set up automount %s.", + [JOB_FAILED] = "Failed to set up automount %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Unset automount %s.", + [JOB_FAILED] = "Failed to unset automount %s.", + }, + }, +}; diff --git a/src/core/automount.h b/src/core/automount.h new file mode 100644 index 0000000..fe668d9 --- /dev/null +++ b/src/core/automount.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Automount Automount; + +#include "unit.h" + +typedef enum AutomountResult { + AUTOMOUNT_SUCCESS, + AUTOMOUNT_FAILURE_RESOURCES, + AUTOMOUNT_FAILURE_UNMOUNTED, + AUTOMOUNT_FAILURE_START_LIMIT_HIT, + AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT, + _AUTOMOUNT_RESULT_MAX, + _AUTOMOUNT_RESULT_INVALID = -1 +} AutomountResult; + +struct Automount { + Unit meta; + + AutomountState state, deserialized_state; + + char *where; + usec_t timeout_idle_usec; + + int pipe_fd; + sd_event_source *pipe_event_source; + mode_t directory_mode; + dev_t dev_id; + + Set *tokens; + Set *expire_tokens; + + sd_event_source *expire_event_source; + + AutomountResult result; +}; + +extern const UnitVTable automount_vtable; + +const char* automount_result_to_string(AutomountResult i) _const_; +AutomountResult automount_result_from_string(const 
/* Translates an access string made up of the letters 'r', 'w' and 'm' into the matching
 * BPF_DEVCG_ACC_READ / BPF_DEVCG_ACC_WRITE / BPF_DEVCG_ACC_MKNOD bit mask.
 *
 * Returns the combined mask (0 for an empty string), or -EINVAL as soon as any other character
 * is encountered. */
static int bpf_access_type(const char *acc) {
        int mask = 0;

        assert(acc);

        while (*acc != '\0') {
                const char c = *acc++;

                if (c == 'r')
                        mask |= BPF_DEVCG_ACC_READ;
                else if (c == 'w')
                        mask |= BPF_DEVCG_ACC_WRITE;
                else if (c == 'm')
                        mask |= BPF_DEVCG_ACC_MKNOD;
                else
                        return -EINVAL;
        }

        return mask;
}
/* Appends to prog a rule that lets every device of the given type ('b' or 'c') and major number
 * pass for the access kinds named in acc ("r", "w", "m" in any combination).
 *
 * Returns the result of bpf_program_add_instructions(), or -EINVAL if acc is empty or contains an
 * unknown letter. A failure to extend the program is logged here before being returned. */
static int bpf_prog_allow_list_major(
                BPFProgram *prog,
                char type,
                int major,
                const char *acc) {

        int r, access;

        assert(prog);
        assert(acc);

        log_trace("%s: %c %d:* %s", __func__, type, major, acc);

        /* Both "invalid letter" (negative) and "no access at all" (zero) are rejected. */
        access = bpf_access_type(acc);
        if (access <= 0)
                return -EINVAL;

        assert(IN_SET(type, 'b', 'c'));
        const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;

        /* Register use follows the prologue emitted by bpf_devices_cgroup_init(): r2 = device type,
         * r3 = requested access, r4 = major. Every failed comparison jumps just past this rule (the
         * offsets 3, 2, 1 count the instructions remaining in the array). */
        const struct bpf_insn insn[] = {
                /* r1 = r3 & access; if that differs from r3 the request asks for more than we allow */
                BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
                BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access),
                BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */

                BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 2),  /* compare device type */
                BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1),     /* compare major */
                /* PASS_JUMP_OFF is a placeholder offset; bpf_devices_apply_policy() patches it */
                BPF_JMP_A(PASS_JUMP_OFF),                      /* jump to PASS */
        };

        /* If all three access bits are allowed the access comparison can never fail, so the first
         * three instructions are left out. */
        if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD))
                r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
        else
                r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
        if (r < 0)
                log_error_errno(r, "Extending device control BPF program failed: %m");

        return r;
}
BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 1), /* compare device type */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +int bpf_devices_cgroup_init( + BPFProgram **ret, + CGroupDevicePolicy policy, + bool allow_list) { + + const struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + int r; + + assert(ret); + + if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list) + return 0; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &prog); + if (r < 0) + return log_error_errno(r, "Loading device control BPF program failed: %m"); + + if (policy == CGROUP_DEVICE_POLICY_CLOSED || allow_list) { + r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + } + + *ret = 
TAKE_PTR(prog); + + return 0; +} + +int bpf_devices_apply_policy( + BPFProgram *prog, + CGroupDevicePolicy policy, + bool allow_list, + const char *cgroup_path, + BPFProgram **prog_installed) { + + _cleanup_free_ char *controller_path = NULL; + int r; + + /* This will assign *keep_program if everything goes well. */ + + if (!prog) + goto finish; + + const bool deny_everything = policy == CGROUP_DEVICE_POLICY_STRICT && !allow_list; + + const struct bpf_insn post_insn[] = { + /* return DENY */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + }; + + const struct bpf_insn exit_insn[] = { + /* finally return DENY if deny_everything else ALLOW */ + BPF_MOV64_IMM(BPF_REG_0, deny_everything ? 0 : 1), + BPF_EXIT_INSN() + }; + + if (!deny_everything) { + r = bpf_program_add_instructions(prog, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + /* Fixup PASS_JUMP_OFF jump offsets. */ + for (size_t off = 0; off < prog->n_instructions; off++) { + struct bpf_insn *ins = &prog->instructions[off]; + + if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF) + ins->off = prog->n_instructions - off - 1; + } + } + + r = bpf_program_add_instructions(prog, exit_insn, ELEMENTSOF(exit_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &controller_path); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup path: %m"); + + r = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE, controller_path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", + cgroup_path); + + finish: + /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. 
*/ + if (prog_installed) { + bpf_program_unref(*prog_installed); + *prog_installed = bpf_program_ref(prog); + } + return 0; +} + +int bpf_devices_supported(void) { + const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL; + static int supported = -1; + int r; + + /* Checks whether BPF device controller is supported. For this, we check five things: + * + * a) whether we are privileged + * b) whether the unified hierarchy is being used + * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require + */ + + if (supported >= 0) + return supported; + + if (geteuid() != 0) { + log_debug("Not enough privileges, BPF device control is not supported."); + return supported = 0; + } + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + log_debug("Not running with unified cgroups, BPF device control is not supported."); + return supported = 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &program); + if (r < 0) { + log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + return supported = 1; +} + +static int allow_list_device_pattern( + BPFProgram *prog, + const char *path, + char type, + const unsigned *maj, + const unsigned *min, + const char *acc) { + + assert(IN_SET(type, 'b', 
'c')); + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + if (maj && min) + return bpf_prog_allow_list_device(prog, type, *maj, *min, acc); + else if (maj) + return bpf_prog_allow_list_major(prog, type, *maj, acc); + else + return bpf_prog_allow_list_class(prog, type, acc); + + } else { + char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4]; + int r; + + if (maj && min) + xsprintf(buf, "%c %u:%u %s", type, *maj, *min, acc); + else if (maj) + xsprintf(buf, "%c %u:* %s", type, *maj, acc); + else + xsprintf(buf, "%c *:* %s", type, acc); + + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore + * EINVAL here. */ + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to set devices.allow on %s: %m", path); + + return r; + } +} + +int bpf_devices_allow_list_device( + BPFProgram *prog, + const char *path, + const char *node, + const char *acc) { + + mode_t mode; + dev_t rdev; + int r; + + assert(path); + assert(acc); + assert(strlen(acc) <= 3); + + log_trace("%s: %s %s", __func__, node, acc); + + /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and + * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. 
This + * means clients can use these path without the device node actually around */ + r = device_path_parse_major_minor(node, &mode, &rdev); + if (r < 0) { + if (r != -ENODEV) + return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); + + struct stat st; + if (stat(node, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device %s: %m", node); + + if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) + return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node); + + mode = st.st_mode; + rdev = (dev_t) st.st_rdev; + } + + unsigned maj = major(rdev), min = minor(rdev); + return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, acc); +} + +int bpf_devices_allow_list_major( + BPFProgram *prog, + const char *path, + const char *name, + char type, + const char *acc) { + + unsigned maj; + int r; + + assert(path); + assert(acc); + assert(IN_SET(type, 'b', 'c')); + + if (streq(name, "*")) + /* If the name is a wildcard, then apply this list to all devices of this type */ + return allow_list_device_pattern(prog, path, type, NULL, NULL, acc); + + if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) + /* The name is numeric and suitable as major. In that case, let's take its major, and create + * the entry directly. 
*/ + return allow_list_device_pattern(prog, path, type, &maj, NULL, acc); + + _cleanup_fclose_ FILE *f = NULL; + bool good = false, any = false; + + f = fopen("/proc/devices", "re"); + if (!f) + return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s: %m", name); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *w, *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/devices: %m"); + if (r == 0) + break; + + if (type == 'c' && streq(line, "Character devices:")) { + good = true; + continue; + } + + if (type == 'b' && streq(line, "Block devices:")) { + good = true; + continue; + } + + if (isempty(line)) { + good = false; + continue; + } + + if (!good) + continue; + + p = strstrip(line); + + w = strpbrk(p, WHITESPACE); + if (!w) + continue; + *w = 0; + + r = safe_atou(p, &maj); + if (r < 0) + continue; + if (maj <= 0) + continue; + + w++; + w += strspn(w, WHITESPACE); + + if (fnmatch(name, w, 0) != 0) + continue; + + any = true; + (void) allow_list_device_pattern(prog, path, type, &maj, NULL, acc); + } + + if (!any) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "Device allow list pattern \"%s\" did not match anything.", name); + + return 0; +} + +int bpf_devices_allow_list_static( + BPFProgram *prog, + const char *path) { + + static const char auto_devices[] = + "/dev/null\0" "rwm\0" + "/dev/zero\0" "rwm\0" + "/dev/full\0" "rwm\0" + "/dev/random\0" "rwm\0" + "/dev/urandom\0" "rwm\0" + "/dev/tty\0" "rwm\0" + "/dev/ptmx\0" "rwm\0" + /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */ + "/run/systemd/inaccessible/chr\0" "rwm\0" + "/run/systemd/inaccessible/blk\0" "rwm\0"; + int r = 0, k; + + const char *node, *acc; + NULSTR_FOREACH_PAIR(node, acc, auto_devices) { + k = bpf_devices_allow_list_device(prog, path, node, acc); + if (r >= 0 && k < 0) + r = k; + } + + /* PTS (/dev/pts) devices may not be duplicated, but accessed */ + k = 
bpf_devices_allow_list_major(prog, path, "pts", 'c', "rw"); + if (r >= 0 && k < 0) + r = k; + + return r; +} diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h new file mode 100644 index 0000000..19b4d39 --- /dev/null +++ b/src/core/bpf-devices.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> + +#include "cgroup.h" + +typedef struct BPFProgram BPFProgram; + +int bpf_devices_cgroup_init(BPFProgram **ret, CGroupDevicePolicy policy, bool allow_list); +int bpf_devices_apply_policy( + BPFProgram *prog, + CGroupDevicePolicy policy, + bool allow_list, + const char *cgroup_path, + BPFProgram **prog_installed); + +int bpf_devices_supported(void); +int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, const char *acc); +int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc); +int bpf_devices_allow_list_static(BPFProgram *prog, const char *path); diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c new file mode 100644 index 0000000..99783ac --- /dev/null +++ b/src/core/bpf-firewall.c @@ -0,0 +1,911 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/bpf_insn.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "fd-util.h" +#include "ip-address-access.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "unit.h" +#include "strv.h" +#include "virt.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +enum { + ACCESS_ALLOWED = 1, + ACCESS_DENIED = 2, +}; + +/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. 
/* Appends to p the instructions that check one address of the packet against one LPM trie map.
 *
 * map_fd      fd of the map to look the address up in
 * protocol    ETH_P_IP or ETH_P_IPV6; anything else yields -EAFNOSUPPORT
 * is_ingress  selects the source address (true) or the destination address (false)
 * verdict     bit OR-ed into R8 when the lookup finds an entry
 *
 * Returns 0 on success or the error of bpf_program_add_instructions(). */
static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        /* Pick the size of the address and its offset within the IP header. */
        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* The same sequence serves both address families; only addr_size and addr_offset differ. */
                struct bpf_insn insn[] = {
                        /* If the protocol cached in R7 (see bpf_firewall_compile_bpf()) is not the
                         * one this block is for, skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb (kept in R6)
                         * R2: Data offset (addr_offset)
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * The key is assembled on the stack: a 32bit prefix length (addr_size * 8) is
                         * stored directly below the address loaded above, and R2 points at it.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        /* R0 == 0: nothing found, skip the instruction that records the verdict */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup: make the protocol check at index 0 jump past the end of the block */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}
+ */ + BPF_MOV32_IMM(BPF_REG_8, 0), + }; + + /* + * The access checkers compiled for the configured allowance and denial lists + * write to R8 at runtime. The following code prepares for an early exit that + * skip the accounting if the packet is denied. + * + * R0 = 1 + * if (R8 == ACCESS_DENIED) + * R0 = 0 + * + * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet + * is allowed to pass. + */ + const struct bpf_insn post_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; + int accounting_map_fd, r; + bool access_enabled; + + assert(u); + assert(ret); + + accounting_map_fd = is_ingress ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + + access_enabled = + u->ipv4_allow_map_fd >= 0 || + u->ipv6_allow_map_fd >= 0 || + u->ipv4_deny_map_fd >= 0 || + u->ipv6_deny_map_fd >= 0 || + ip_allow_any || + ip_deny_any; + + if (accounting_map_fd < 0 && !access_enabled) { + *ret = NULL; + return 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p); + if (r < 0) + return r; + + r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return r; + + if (access_enabled) { + /* + * The simple rule this function translates into eBPF instructions is: + * + * - Access will be granted when an address matches an entry in @list_allow + * - Otherwise, access will be denied when an address matches an entry in @list_deny + * - Otherwise, access will be granted + */ + + if (u->ipv4_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv6_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv4_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, 
is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (u->ipv6_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (ip_allow_any) { + r = add_instructions_for_ip_any(p, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (ip_deny_any) { + r = add_instructions_for_ip_any(p, ACCESS_DENIED); + if (r < 0) + return r; + } + } + + r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return r; + + if (accounting_map_fd >= 0) { + struct bpf_insn insn[] = { + /* + * If R0 == 0, the packet will be denied; skip the accounting instructions in this case. + * The jump label will be fixed up later. + */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0), + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Allow the packet to pass */ 
+ BPF_MOV64_IMM(BPF_REG_0, 1), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } + + do { + /* + * Exit from the eBPF program, R0 contains the verdict. + * 0 means the packet is denied, 1 means the packet may pass. + */ + const struct bpf_insn insn[] = { + BPF_EXIT_INSN() + }; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } while (false); + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) { + IPAddressAccessItem *a; + + assert(n_ipv4); + assert(n_ipv6); + + LIST_FOREACH(items, a, list) { + switch (a->family) { + + case AF_INET: + (*n_ipv4)++; + break; + + case AF_INET6: + (*n_ipv6)++; + break; + + default: + return -EAFNOSUPPORT; + } + } + + return 0; +} + +static int bpf_firewall_add_access_items( + IPAddressAccessItem *list, + int ipv4_map_fd, + int ipv6_map_fd, + int verdict) { + + struct bpf_lpm_trie_key *key_ipv4, *key_ipv6; + uint64_t value = verdict; + IPAddressAccessItem *a; + int r; + + key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)); + key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4); + + LIST_FOREACH(items, a, list) { + switch (a->family) { + + case AF_INET: + key_ipv4->prefixlen = a->prefixlen; + memcpy(key_ipv4->data, &a->address, sizeof(uint32_t)); + + r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value); + if (r < 0) + return r; + + break; + + case AF_INET6: + key_ipv6->prefixlen = a->prefixlen; + memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t)); + + r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value); + if (r < 0) + return r; + + break; + + default: + return -EAFNOSUPPORT; + } + } + + return 0; +} + +static int bpf_firewall_prepare_access_maps( + Unit *u, + int verdict, + int *ret_ipv4_map_fd, + int 
*ret_ipv6_map_fd, + bool *ret_has_any) { + + _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1; + size_t n_ipv4 = 0, n_ipv6 = 0; + IPAddressAccessItem *list; + Unit *p; + int r; + + assert(ret_ipv4_map_fd); + assert(ret_ipv6_map_fd); + assert(ret_has_any); + + for (p = u; p; p = UNIT_DEREF(p->slice)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny; + + bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6); + + /* Skip making the LPM trie map in cases where we are using "any" in order to hack around + * needing CAP_SYS_ADMIN for allocating LPM trie map. */ + if (ip_address_access_item_is_any(list)) { + *ret_has_any = true; + return 0; + } + } + + if (n_ipv4 > 0) { + ipv4_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t), + sizeof(uint64_t), + n_ipv4, + BPF_F_NO_PREALLOC); + if (ipv4_map_fd < 0) + return ipv4_map_fd; + } + + if (n_ipv6 > 0) { + ipv6_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4, + sizeof(uint64_t), + n_ipv6, + BPF_F_NO_PREALLOC); + if (ipv6_map_fd < 0) + return ipv6_map_fd; + } + + for (p = u; p; p = UNIT_DEREF(p->slice)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? 
cc->ip_address_allow : cc->ip_address_deny, + ipv4_map_fd, ipv6_map_fd, verdict); + if (r < 0) + return r; + } + + *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd); + *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd); + *ret_has_any = false; + return 0; +} + +static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) { + int r; + + assert(u); + assert(fd_ingress); + assert(fd_egress); + + if (enabled) { + if (*fd_ingress < 0) { + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_ingress = r; + } + + if (*fd_egress < 0) { + + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_egress = r; + } + + } else { + *fd_ingress = safe_close(*fd_ingress); + *fd_egress = safe_close(*fd_egress); + + zero(u->ip_accounting_extra); + } + + return 0; +} + +int bpf_firewall_compile(Unit *u) { + CGroupContext *cc; + int r, supported; + bool ip_allow_any = false, ip_deny_any = false; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "BPF firewalling not supported on this manager, proceeding without."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) + /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice + * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption + * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is + * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at + * all, either. 
*/ + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units."); + + /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves, + * but we reuse the accounting maps. That way the firewall in effect always maps to the actual + * configuration, but we don't flush out the accounting unnecessarily */ + + u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress); + u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + if (u->type != UNIT_SLICE) { + /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf + * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that + * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this + * means that all configure IP access rules *will* take effect on processes, even though we never + * compile them for inner nodes. 
*/ + + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any); + if (r < 0) + return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m"); + + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m"); + } + + r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); + if (r < 0) + return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m"); + + r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m"); + + r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m"); + + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref); + +static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) { + char **bpf_fs_path; + + set_clear(*set); + + STRV_FOREACH(bpf_fs_path, filter_paths) { + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + int r; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog); + if (r < 0) + return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m"); + + r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path); + if (r < 0) + return log_unit_error_errno(u, r, "Loading of ingress BPF program %s failed: %m", *bpf_fs_path); + + r = set_ensure_consume(set, &filter_prog_hash_ops, TAKE_PTR(prog)); + if (r < 0) + return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m"); + } + + return 0; +} + +int 
bpf_firewall_load_custom(Unit *u) { + CGroupContext *cc; + int r, supported; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + if (!(cc->ip_filters_ingress || cc->ip_filters_egress)) + return 0; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs."); + + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress); + if (r < 0) + return r; + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress); + if (r < 0) + return r; + + return 0; +} + +static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) { + BPFProgram *prog; + int r; + + assert(u); + + set_clear(*set_installed); + + SET_FOREACH(prog, *set) { + r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_unit_error_errno(u, r, "Attaching custom egress BPF program to cgroup %s failed: %m", path); + + /* Remember that these BPF programs are installed now. 
*/ + r = set_ensure_put(set_installed, &filter_prog_hash_ops, prog); + if (r < 0) + return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m"); + bpf_program_ref(prog); + } + + return 0; +} + +int bpf_firewall_install(Unit *u) { + _cleanup_free_ char *path = NULL; + CGroupContext *cc; + int r, supported; + uint32_t flags; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + if (!u->cgroup_path) + return -EINVAL; + if (!u->cgroup_realized) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) { + log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without."); + return -EOPNOTSUPP; + } + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) { + log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units."); + return -EOPNOTSUPP; + } + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && + (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress))) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs."); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m"); + + flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI && + (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0; + + /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to + * minimize the time window when we don't account for IP traffic. 
*/ + u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed); + u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed); + + if (u->ip_bpf_egress) { + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, + flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI)); + if (r < 0) + return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path); + + /* Remember that this BPF program is installed now. */ + u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress); + } + + if (u->ip_bpf_ingress) { + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, + flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI)); + if (r < 0) + return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path); + + u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress); + } + + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed); + if (r < 0) + return r; + + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed); + if (r < 0) + return r; + + return 0; +} + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) { + uint64_t key, packets; + int r; + + if (map_fd < 0) + return -EBADF; + + if (ret_packets) { + key = MAP_KEY_PACKETS; + r = bpf_map_lookup_element(map_fd, &key, &packets); + if (r < 0) + return r; + } + + if (ret_bytes) { + key = MAP_KEY_BYTES; + r = bpf_map_lookup_element(map_fd, &key, ret_bytes); + if (r < 0) + return r; + } + + if (ret_packets) + *ret_packets = packets; + + return 0; +} + +int bpf_firewall_reset_accounting(int map_fd) { + uint64_t key, value = 0; + int r; + + if (map_fd < 0) + return -EBADF; + + key = MAP_KEY_PACKETS; + r = bpf_map_update_element(map_fd, &key, &value); + if (r < 0) + return 
r; + + key = MAP_KEY_BYTES; + return bpf_map_update_element(map_fd, &key, &value); +} + +static int bpf_firewall_unsupported_reason = 0; + +int bpf_firewall_supported(void) { + const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL; + static int supported = -1; + union bpf_attr attr; + int r; + + /* Checks whether BPF firewalling is supported. For this, we check the following things: + * + * - whether the unified hierarchy is being used + * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require + * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require + */ + if (supported >= 0) + return supported; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), + "Not running with unified cgroups, BPF firewalling is not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* 
Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF + * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF + * program if we can't do a thing with it later? + * + * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if + * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the + * parameters are validated however, and that'll fail with EBADF then. */ + + // FIXME: Clang doesn't 0-pad with structured initialization, causing + // the kernel to reject the bpf_attr as invalid. See: + // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65 + // Ideally it should behave like GCC, so that we can remove these workarounds. + zero(attr); + attr.attach_type = BPF_CGROUP_INET_EGRESS; + attr.target_fd = -1; + attr.attach_bpf_fd = -1; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) { + if (errno != EBADF) { + bpf_firewall_unsupported_reason = + log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* YAY! */ + } else { + log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported + * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH + * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll + * get EINVAL if it's not supported, and EBADF as before if it is available. 
*/ + + zero(attr); + attr.attach_type = BPF_CGROUP_INET_EGRESS; + attr.target_fd = -1; + attr.attach_bpf_fd = -1; + attr.attach_flags = BPF_F_ALLOW_MULTI; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) { + if (errno == EBADF) { + log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!"); + return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI; + } + + if (errno == EINVAL) + log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported."); + else + log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m"); + + return supported = BPF_FIREWALL_SUPPORTED; + } else { + log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } +} + +void emit_bpf_firewall_warning(Unit *u) { + static bool warned = false; + + if (!warned) { + bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container() > 0; + + log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason, + "unit configures an IP firewall, but %s.\n" + "(This warning is only shown for the first unit using IP firewalling.)", + getuid() != 0 ? 
"not running as root" : + "the local system does not support BPF/cgroup firewalling"); + warned = true; + } +} diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h new file mode 100644 index 0000000..08d7742 --- /dev/null +++ b/src/core/bpf-firewall.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> + +#include "unit.h" + +enum { + BPF_FIREWALL_UNSUPPORTED = 0, + BPF_FIREWALL_SUPPORTED = 1, + BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2, +}; + +int bpf_firewall_supported(void); + +int bpf_firewall_compile(Unit *u); +int bpf_firewall_install(Unit *u); +int bpf_firewall_load_custom(Unit *u); + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets); +int bpf_firewall_reset_accounting(int map_fd); + +void emit_bpf_firewall_warning(Unit *u); diff --git a/src/core/cgroup.c b/src/core/cgroup.c new file mode 100644 index 0000000..7dc6c20 --- /dev/null +++ b/src/core/cgroup.c @@ -0,0 +1,3778 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "bpf-devices.h" +#include "bpf-firewall.h" +#include "btrfs-util.h" +#include "bus-error.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "limits-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "special.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "virt.h" + +#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC) + +/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access + * problems we downgrade to LOG_DEBUG. 
This is supposed to be nice to container managers and kernels which want to mask + * out specific attributes from us. */ +#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING) + +uint64_t tasks_max_resolve(const TasksMax *tasks_max) { + if (tasks_max->scale == 0) + return tasks_max->value; + + return system_tasks_max_scale(tasks_max->value, tasks_max->scale); +} + +bool manager_owns_host_root_cgroup(Manager *m) { + assert(m); + + /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the + * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's + * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if + * we run in any kind of container virtualization. */ + + if (MANAGER_IS_USER(m)) + return false; + + if (detect_container() > 0) + return false; + + return empty_or_root(m->cgroup_root); +} + +bool unit_has_host_root_cgroup(Unit *u) { + assert(u); + + /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and + * the manager manages the root cgroup. */ + + if (!manager_owns_host_root_cgroup(u->manager)) + return false; + + return unit_has_name(u, SPECIAL_ROOT_SLICE); +} + +static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) { + int r; + + r = cg_set_attribute(controller, u->cgroup_path, attribute, value); + if (r < 0) + log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m", + strna(attribute), isempty(u->cgroup_path) ? 
"/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value); + + return r; +} + +static void cgroup_compat_warn(void) { + static bool cgroup_compat_warned = false; + + if (cgroup_compat_warned) + return; + + log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. " + "See cgroup-compat debug messages for details."); + + cgroup_compat_warned = true; +} + +#define log_cgroup_compat(unit, fmt, ...) do { \ + cgroup_compat_warn(); \ + log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \ + } while (false) + +void cgroup_context_init(CGroupContext *c) { + assert(c); + + /* Initialize everything to the kernel defaults. */ + + *c = (CGroupContext) { + .cpu_weight = CGROUP_WEIGHT_INVALID, + .startup_cpu_weight = CGROUP_WEIGHT_INVALID, + .cpu_quota_per_sec_usec = USEC_INFINITY, + .cpu_quota_period_usec = USEC_INFINITY, + + .cpu_shares = CGROUP_CPU_SHARES_INVALID, + .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID, + + .memory_high = CGROUP_LIMIT_MAX, + .memory_max = CGROUP_LIMIT_MAX, + .memory_swap_max = CGROUP_LIMIT_MAX, + + .memory_limit = CGROUP_LIMIT_MAX, + + .io_weight = CGROUP_WEIGHT_INVALID, + .startup_io_weight = CGROUP_WEIGHT_INVALID, + + .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + + .tasks_max = TASKS_MAX_UNSET, + + .moom_swap = MANAGED_OOM_AUTO, + .moom_mem_pressure = MANAGED_OOM_AUTO, + }; +} + +void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) { + assert(c); + assert(a); + + LIST_REMOVE(device_allow, c->device_allow, a); + free(a->path); + free(a); +} + +void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) { + assert(c); + assert(w); + + LIST_REMOVE(device_weights, c->io_device_weights, w); + free(w->path); + free(w); +} + +void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) { + assert(c); + assert(l); + + LIST_REMOVE(device_latencies, 
c->io_device_latencies, l); + free(l->path); + free(l); +} + +void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) { + assert(c); + assert(l); + + LIST_REMOVE(device_limits, c->io_device_limits, l); + free(l->path); + free(l); +} + +void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) { + assert(c); + assert(w); + + LIST_REMOVE(device_weights, c->blockio_device_weights, w); + free(w->path); + free(w); +} + +void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) { + assert(c); + assert(b); + + LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b); + free(b->path); + free(b); +} + +void cgroup_context_done(CGroupContext *c) { + assert(c); + + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + while (c->io_device_limits) + cgroup_context_free_io_device_limit(c, c->io_device_limits); + + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + while (c->blockio_device_bandwidths) + cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths); + + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow); + c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny); + + c->ip_filters_ingress = strv_free(c->ip_filters_ingress); + c->ip_filters_egress = strv_free(c->ip_filters_egress); + + cpu_set_reset(&c->cpuset_cpus); + cpu_set_reset(&c->cpuset_mems); +} + +static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { + assert(u); + + if (!u->cgroup_realized) + return -EOWNERDEAD; + + return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret); +} + +static int 
unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) { + CGroupContext *c; + CGroupMask m; + const char *file; + uint64_t unit_value; + int r; + + /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will + * return -ENODATA) on cgroup v1. + * + * Returns: + * + * <0: On error. + * 0: If the kernel memory setting doesn't match our configuration. + * >0: If the kernel memory setting matches our configuration. + * + * The following values are only guaranteed to be populated on return >=0: + * + * - ret_unit_value will contain our internal expected value for the unit, page-aligned. + * - ret_kernel_value will contain the actual value presented by the kernel. */ + + assert(u); + + r = cg_all_unified(); + if (r < 0) + return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m"); + + /* Unsupported on v1. + * + * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has + * silently masked the controller. */ + if (r == 0) + return -ENODATA; + + /* The root slice doesn't have any controller files, so we can't compare anything. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return -ENODATA; + + /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled, + * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To + * avoid specious errors in these scenarios, check that we even expect the memory controller to be + * enabled at all. 
*/ + m = unit_get_target_mask(u); + if (!FLAGS_SET(m, CGROUP_MASK_MEMORY)) + return -ENODATA; + + c = unit_get_cgroup_context(u); + assert(c); + + if (streq(property_name, "MemoryLow")) { + unit_value = unit_get_ancestor_memory_low(u); + file = "memory.low"; + } else if (streq(property_name, "MemoryMin")) { + unit_value = unit_get_ancestor_memory_min(u); + file = "memory.min"; + } else if (streq(property_name, "MemoryHigh")) { + unit_value = c->memory_high; + file = "memory.high"; + } else if (streq(property_name, "MemoryMax")) { + unit_value = c->memory_max; + file = "memory.max"; + } else if (streq(property_name, "MemorySwapMax")) { + unit_value = c->memory_swap_max; + file = "memory.swap.max"; + } else + return -EINVAL; + + r = unit_get_kernel_memory_limit(u, file, ret_kernel_value); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file); + + /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page + * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from + * our internal page-counting. To support those future kernels, just check the value itself first + * without any page-alignment. */ + if (*ret_kernel_value == unit_value) { + *ret_unit_value = unit_value; + return 1; + } + + /* The current kernel behaviour, by comparison, is that even if you write a particular number of + * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel + * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is + * cricket. 
*/ + if (unit_value != CGROUP_LIMIT_MAX) + unit_value = PAGE_ALIGN_DOWN(unit_value); + + *ret_unit_value = unit_value; + + return *ret_kernel_value == *ret_unit_value; +} + +#define FORMAT_CGROUP_DIFF_MAX 128 + +static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) { + uint64_t kval, sval; + int r; + + assert(u); + assert(buf); + assert(l > 0); + + r = unit_compare_memory_limit(u, property_name, &sval, &kval); + + /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1). + * In the absence of reliably being able to detect whether memcg swap support is available or not, + * only complain if the error is not ENOENT. */ + if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) || + (r == -ENOENT && streq(property_name, "MemorySwapMax"))) { + buf[0] = 0; + return buf; + } + + if (r < 0) { + snprintf(buf, l, " (error getting kernel value: %s)", strerror_safe(r)); + return buf; + } + + snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval); + + return buf; +} + +void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + _cleanup_free_ char *disable_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL; + CGroupIODeviceLimit *il; + CGroupIODeviceWeight *iw; + CGroupIODeviceLatency *l; + CGroupBlockIODeviceBandwidth *b; + CGroupBlockIODeviceWeight *w; + CGroupDeviceAllow *a; + CGroupContext *c; + IPAddressAccessItem *iaai; + char **path; + char q[FORMAT_TIMESPAN_MAX]; + char v[FORMAT_TIMESPAN_MAX]; + + char cda[FORMAT_CGROUP_DIFF_MAX]; + char cdb[FORMAT_CGROUP_DIFF_MAX]; + char cdc[FORMAT_CGROUP_DIFF_MAX]; + char cdd[FORMAT_CGROUP_DIFF_MAX]; + char cde[FORMAT_CGROUP_DIFF_MAX]; + + assert(u); + assert(f); + + c = unit_get_cgroup_context(u); + assert(c); + + prefix = strempty(prefix); + + (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str); + + cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus); + cpuset_mems = 
cpu_set_to_range_string(&c->cpuset_mems); + + fprintf(f, + "%sCPUAccounting: %s\n" + "%sIOAccounting: %s\n" + "%sBlockIOAccounting: %s\n" + "%sMemoryAccounting: %s\n" + "%sTasksAccounting: %s\n" + "%sIPAccounting: %s\n" + "%sCPUWeight: %" PRIu64 "\n" + "%sStartupCPUWeight: %" PRIu64 "\n" + "%sCPUShares: %" PRIu64 "\n" + "%sStartupCPUShares: %" PRIu64 "\n" + "%sCPUQuotaPerSecSec: %s\n" + "%sCPUQuotaPeriodSec: %s\n" + "%sAllowedCPUs: %s\n" + "%sAllowedMemoryNodes: %s\n" + "%sIOWeight: %" PRIu64 "\n" + "%sStartupIOWeight: %" PRIu64 "\n" + "%sBlockIOWeight: %" PRIu64 "\n" + "%sStartupBlockIOWeight: %" PRIu64 "\n" + "%sDefaultMemoryMin: %" PRIu64 "\n" + "%sDefaultMemoryLow: %" PRIu64 "\n" + "%sMemoryMin: %" PRIu64 "%s\n" + "%sMemoryLow: %" PRIu64 "%s\n" + "%sMemoryHigh: %" PRIu64 "%s\n" + "%sMemoryMax: %" PRIu64 "%s\n" + "%sMemorySwapMax: %" PRIu64 "%s\n" + "%sMemoryLimit: %" PRIu64 "\n" + "%sTasksMax: %" PRIu64 "\n" + "%sDevicePolicy: %s\n" + "%sDisableControllers: %s\n" + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" + "%sManagedOOMMemoryPressureLimitPercent: %d%%\n", + prefix, yes_no(c->cpu_accounting), + prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), + prefix, yes_no(c->memory_accounting), + prefix, yes_no(c->tasks_accounting), + prefix, yes_no(c->ip_accounting), + prefix, c->cpu_weight, + prefix, c->startup_cpu_weight, + prefix, c->cpu_shares, + prefix, c->startup_cpu_shares, + prefix, format_timespan(q, sizeof(q), c->cpu_quota_per_sec_usec, 1), + prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1), + prefix, strempty(cpuset_cpus), + prefix, strempty(cpuset_mems), + prefix, c->io_weight, + prefix, c->startup_io_weight, + prefix, c->blockio_weight, + prefix, c->startup_blockio_weight, + prefix, c->default_memory_min, + prefix, c->default_memory_low, + prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"), + prefix, c->memory_low, 
format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"), + prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "MemoryHigh"), + prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryMax"), + prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "MemorySwapMax"), + prefix, c->memory_limit, + prefix, tasks_max_resolve(&c->tasks_max), + prefix, cgroup_device_policy_to_string(c->device_policy), + prefix, strempty(disable_controllers_str), + prefix, yes_no(c->delegate), + prefix, managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), + prefix, c->moom_mem_pressure_limit); + + if (c->delegate) { + _cleanup_free_ char *t = NULL; + + (void) cg_mask_to_string(c->delegate_controllers, &t); + + fprintf(f, "%sDelegateControllers: %s\n", + prefix, + strempty(t)); + } + + LIST_FOREACH(device_allow, a, c->device_allow) + fprintf(f, + "%sDeviceAllow: %s %s%s%s\n", + prefix, + a->path, + a->r ? "r" : "", a->w ? "w" : "", a->m ? 
"m" : ""); + + LIST_FOREACH(device_weights, iw, c->io_device_weights) + fprintf(f, + "%sIODeviceWeight: %s %" PRIu64 "\n", + prefix, + iw->path, + iw->weight); + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) + fprintf(f, + "%sIODeviceLatencyTargetSec: %s %s\n", + prefix, + l->path, + format_timespan(q, sizeof(q), l->target_usec, 1)); + + LIST_FOREACH(device_limits, il, c->io_device_limits) { + char buf[FORMAT_BYTES_MAX]; + CGroupIOLimitType type; + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + if (il->limits[type] != cgroup_io_limit_defaults[type]) + fprintf(f, + "%s%s: %s %s\n", + prefix, + cgroup_io_limit_type_to_string(type), + il->path, + format_bytes(buf, sizeof(buf), il->limits[type])); + } + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) + fprintf(f, + "%sBlockIODeviceWeight: %s %" PRIu64, + prefix, + w->path, + w->weight); + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + char buf[FORMAT_BYTES_MAX]; + + if (b->rbps != CGROUP_LIMIT_MAX) + fprintf(f, + "%sBlockIOReadBandwidth: %s %s\n", + prefix, + b->path, + format_bytes(buf, sizeof(buf), b->rbps)); + if (b->wbps != CGROUP_LIMIT_MAX) + fprintf(f, + "%sBlockIOWriteBandwidth: %s %s\n", + prefix, + b->path, + format_bytes(buf, sizeof(buf), b->wbps)); + } + + LIST_FOREACH(items, iaai, c->ip_address_allow) { + _cleanup_free_ char *k = NULL; + + (void) in_addr_to_string(iaai->family, &iaai->address, &k); + fprintf(f, "%sIPAddressAllow: %s/%u\n", prefix, strnull(k), iaai->prefixlen); + } + + LIST_FOREACH(items, iaai, c->ip_address_deny) { + _cleanup_free_ char *k = NULL; + + (void) in_addr_to_string(iaai->family, &iaai->address, &k); + fprintf(f, "%sIPAddressDeny: %s/%u\n", prefix, strnull(k), iaai->prefixlen); + } + + STRV_FOREACH(path, c->ip_filters_ingress) + fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path); + + STRV_FOREACH(path, c->ip_filters_egress) + fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path); +} + +int 
cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) { + _cleanup_free_ CGroupDeviceAllow *a = NULL; + _cleanup_free_ char *d = NULL; + + assert(c); + assert(dev); + assert(isempty(mode) || in_charset(mode, "rwm")); + + a = new(CGroupDeviceAllow, 1); + if (!a) + return -ENOMEM; + + d = strdup(dev); + if (!d) + return -ENOMEM; + + *a = (CGroupDeviceAllow) { + .path = TAKE_PTR(d), + .r = isempty(mode) || strchr(mode, 'r'), + .w = isempty(mode) || strchr(mode, 'w'), + .m = isempty(mode) || strchr(mode, 'm'), + }; + + LIST_PREPEND(device_allow, c->device_allow, a); + TAKE_PTR(a); + + return 0; +} + +#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \ + uint64_t unit_get_ancestor_##entry(Unit *u) { \ + CGroupContext *c; \ + \ + /* 1. Is entry set in this unit? If so, use that. \ + * 2. Is the default for this entry set in any \ + * ancestor? If so, use that. \ + * 3. Otherwise, return CGROUP_LIMIT_MIN. */ \ + \ + assert(u); \ + \ + c = unit_get_cgroup_context(u); \ + if (c && c->entry##_set) \ + return c->entry; \ + \ + while ((u = UNIT_DEREF(u->slice))) { \ + c = unit_get_cgroup_context(u); \ + if (c && c->default_##entry##_set) \ + return c->default_##entry; \ + } \ + \ + /* We've reached the root, but nobody had default for \ + * this entry set, so set it to the kernel default. 
*/ \ + return CGROUP_LIMIT_MIN; \ +} + +UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low); +UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min); + +static void cgroup_xattr_apply(Unit *u) { + char ids[SD_ID128_STRING_MAX]; + int r; + + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (!sd_id128_is_null(u->invocation_id)) { + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + "trusted.invocation_id", + sd_id128_to_string(u->invocation_id, ids), 32, + 0); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path); + } + + if (unit_cgroup_delegate(u)) { + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + "trusted.delegate", + "1", 1, + 0); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set delegate flag on control group %s, ignoring: %m", u->cgroup_path); + } else { + r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "trusted.delegate"); + if (r != -ENODATA) + log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", u->cgroup_path); + } +} + +static int lookup_block_device(const char *p, dev_t *ret) { + dev_t rdev, dev = 0; + mode_t mode; + int r; + + assert(p); + assert(ret); + + r = device_path_parse_major_minor(p, &mode, &rdev); + if (r == -ENODEV) { /* not a parsable device node, need to go to disk */ + struct stat st; + + if (stat(p, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device '%s': %m", p); + + mode = st.st_mode; + rdev = st.st_rdev; + dev = st.st_dev; + } else if (r < 0) + return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p); + + if (S_ISCHR(mode)) + return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK), + "Device node '%s' is a character device, but block device needed.", p); + if (S_ISBLK(mode)) + *ret = rdev; + else if (major(dev) != 0) + *ret = dev; /* If this is not a device node then use the block device this file is stored on */ + else { + /* If this is 
btrfs, getting the backing block device is a bit harder */ + r = btrfs_get_block_device(p, ret); + if (r == -ENOTTY) + return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), + "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p); + if (r < 0) + return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p); + } + + /* If this is a LUKS/DM device, recursively try to get the originating block device */ + while (block_get_originating(*ret, ret) > 0); + + /* If this is a partition, try to get the originating block device */ + (void) block_get_whole_disk(*ret, ret); + return 0; +} + +static bool cgroup_context_has_cpu_weight(CGroupContext *c) { + return c->cpu_weight != CGROUP_WEIGHT_INVALID || + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID; +} + +static bool cgroup_context_has_cpu_shares(CGroupContext *c) { + return c->cpu_shares != CGROUP_CPU_SHARES_INVALID || + c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID; +} + +static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_cpu_weight != CGROUP_WEIGHT_INVALID) + return c->startup_cpu_weight; + else if (c->cpu_weight != CGROUP_WEIGHT_INVALID) + return c->cpu_weight; + else + return CGROUP_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID) + return c->startup_cpu_shares; + else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID) + return c->cpu_shares; + else + return CGROUP_CPU_SHARES_DEFAULT; +} + +usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) { + /* kernel uses a minimum resolution of 1ms, so both period and (quota * period) + * need to be higher than that boundary. quota is specified in USecPerSec. 
+ * Additionally, period must be at most max_period. */ + assert(quota > 0); + + return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period); +} + +static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) { + usec_t new_period; + + if (quota == USEC_INFINITY) + /* Always use default period for infinity quota. */ + return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC; + + if (period == USEC_INFINITY) + /* Default period was requested. */ + period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC; + + /* Clamp to interval [1ms, 1s] */ + new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC); + + if (new_period != period) { + char v[FORMAT_TIMESPAN_MAX]; + log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, + "Clamping CPU interval for cpu.max: period is now %s", + format_timespan(v, sizeof(v), new_period, 1)); + u->warned_clamping_cpu_quota_period = true; + } + + return new_period; +} + +static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf); +} + +static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) { + char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1]; + + period = cgroup_cpu_adjust_period_and_log(u, period, quota); + if (quota != USEC_INFINITY) + xsprintf(buf, USEC_FMT " " USEC_FMT "\n", + MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period); + else + xsprintf(buf, "max " USEC_FMT "\n", period); + (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf); +} + +static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + + xsprintf(buf, "%" PRIu64 "\n", shares); + (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf); +} + +static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) { + char 
buf[DECIMAL_STR_MAX(usec_t) + 2]; + + period = cgroup_cpu_adjust_period_and_log(u, period, quota); + + xsprintf(buf, USEC_FMT "\n", period); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf); + + if (quota != USEC_INFINITY) { + xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC)); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf); + } else + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n"); +} + +static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) { + return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT, + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) { + return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT, + CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX); +} + +static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) { + _cleanup_free_ char *buf = NULL; + + buf = cpu_set_to_range_string(cpus); + if (!buf) { + log_oom(); + return; + } + + (void) set_attribute_and_warn(u, "cpuset", name, buf); +} + +static bool cgroup_context_has_io_config(CGroupContext *c) { + return c->io_accounting || + c->io_weight != CGROUP_WEIGHT_INVALID || + c->startup_io_weight != CGROUP_WEIGHT_INVALID || + c->io_device_weights || + c->io_device_latencies || + c->io_device_limits; +} + +static bool cgroup_context_has_blockio_config(CGroupContext *c) { + return c->blockio_accounting || + c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID || + c->blockio_device_weights || + c->blockio_device_bandwidths; +} + +static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_io_weight != CGROUP_WEIGHT_INVALID) + return c->startup_io_weight; + else if (c->io_weight != CGROUP_WEIGHT_INVALID) + return c->io_weight; + else + return 
CGROUP_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) { + if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && + c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) + return c->startup_blockio_weight; + else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) + return c->blockio_weight; + else + return CGROUP_BLKIO_WEIGHT_DEFAULT; +} + +static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) { + return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT, + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) { + return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT, + CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX); +} + +static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); +} + +static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf); +} + +static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + if (target != USEC_INFINITY) + xsprintf(buf, "%u:%u target=%" PRIu64 "\n", major(dev), minor(dev), target); + else + 
xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev)); + + (void) set_attribute_and_warn(u, "io", "io.latency", buf); +} + +static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) { + char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)]; + char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4]; + CGroupIOLimitType type; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + if (limits[type] != cgroup_io_limit_defaults[type]) + xsprintf(limit_bufs[type], "%" PRIu64, limits[type]); + else + xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0"); + + xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev), + limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX], + limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]); + (void) set_attribute_and_warn(u, "io", "io.max", buf); +} + +/* Writes the read and write bandwidth limits for the device that lookup_block_device() resolves + * dev_path to into the "blkio" throttle attributes of the unit's cgroup. Does nothing if the device + * cannot be resolved; write failures are left to set_attribute_and_warn(). */ +static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) { + char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1]; + dev_t dev; + int r; + + r = lookup_block_device(dev_path, &dev); + if (r < 0) + return; + + /* Format with xsprintf() like the other attribute writers in this file, rather than raw sprintf(). */ + xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf); + + xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf); +} + +static bool unit_has_unified_memory_config(Unit *u) { + CGroupContext *c; + + assert(u); + + c = unit_get_cgroup_context(u); + assert(c); + + return unit_get_ancestor_memory_min(u) > 0 || unit_get_ancestor_memory_low(u) > 0 || + c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || + c->memory_swap_max != CGROUP_LIMIT_MAX; +} + +static void
cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n"; + + if (v != CGROUP_LIMIT_MAX) + xsprintf(buf, "%" PRIu64 "\n", v); + + (void) set_attribute_and_warn(u, "memory", file, buf); +} + +static void cgroup_apply_firewall(Unit *u) { + assert(u); + + /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */ + + if (bpf_firewall_compile(u) < 0) + return; + + (void) bpf_firewall_load_custom(u); + (void) bpf_firewall_install(u); +} + +static int cgroup_apply_devices(Unit *u) { + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + const char *path; + CGroupContext *c; + CGroupDeviceAllow *a; + CGroupDevicePolicy policy; + int r; + + assert_se(c = unit_get_cgroup_context(u)); + assert_se(path = u->cgroup_path); + + policy = c->device_policy; + + if (cg_all_unified() > 0) { + r = bpf_devices_cgroup_init(&prog, policy, c->device_allow); + if (r < 0) + return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m"); + + } else { + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore + * EINVAL here. */ + + if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO) + r = cg_set_attribute("devices", path, "devices.deny", "a"); + else + r = cg_set_attribute("devices", path, "devices.allow", "a"); + if (r < 0) + log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to reset devices.allow/devices.deny: %m"); + } + + bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED || + (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow); + if (allow_list_static) + (void) bpf_devices_allow_list_static(prog, path); + + bool any = allow_list_static; + LIST_FOREACH(device_allow, a, c->device_allow) { + char acc[4], *val; + unsigned k = 0; + + if (a->r) + acc[k++] = 'r'; + if (a->w) + acc[k++] = 'w'; + if (a->m) + acc[k++] = 'm'; + if (k == 0) + continue; + acc[k++] = 0; + + if (path_startswith(a->path, "/dev/")) + r = bpf_devices_allow_list_device(prog, path, a->path, acc); + else if ((val = startswith(a->path, "block-"))) + r = bpf_devices_allow_list_major(prog, path, val, 'b', acc); + else if ((val = startswith(a->path, "char-"))) + r = bpf_devices_allow_list_major(prog, path, val, 'c', acc); + else { + log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path); + continue; + } + + if (r >= 0) + any = true; + } + + if (prog && !any) { + log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter."); + + /* The kernel verifier would reject a program we would build with the normal intro and outro + but no allow-listing rules (outro would contain an unreachable instruction for successful + return). */ + policy = CGROUP_DEVICE_POLICY_STRICT; + } + + r = bpf_devices_apply_policy(prog, policy, any, path, &u->bpf_device_control_installed); + if (r < 0) { + static bool warned = false; + + log_full_errno(warned ? 
LOG_DEBUG : LOG_WARNING, r, + "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n" + "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n" + "(This warning is only shown for the first loaded unit using device ACL.)", u->id); + + warned = true; + } + return r; +} + +static void cgroup_context_apply( + Unit *u, + CGroupMask apply_mask, + ManagerState state) { + + const char *path; + CGroupContext *c; + bool is_host_root, is_local_root; + int r; + + assert(u); + + /* Nothing to do? Exit early! */ + if (apply_mask == 0) + return; + + /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other + * attributes should only be managed for cgroups further down the tree. */ + is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE); + is_host_root = unit_has_host_root_cgroup(u); + + assert_se(c = unit_get_cgroup_context(u)); + assert_se(path = u->cgroup_path); + + if (is_local_root) /* Make sure we don't try to display messages with an empty path. */ + path = "/"; + + /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container + * then), and missing cgroups, i.e. EROFS and ENOENT. */ + + /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but + * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this + * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of + * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used + * we couldn't even write to them if we wanted to). 
*/ + if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) { + + if (cg_all_unified() > 0) { + uint64_t weight; + + if (cgroup_context_has_cpu_weight(c)) + weight = cgroup_context_cpu_weight(c, state); + else if (cgroup_context_has_cpu_shares(c)) { + uint64_t shares; + + shares = cgroup_context_cpu_shares(c, state); + weight = cgroup_cpu_shares_to_weight(shares); + + log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s", + shares, weight, path); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + cgroup_apply_unified_cpu_weight(u, weight); + cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec); + + } else { + uint64_t shares; + + if (cgroup_context_has_cpu_weight(c)) { + uint64_t weight; + + weight = cgroup_context_cpu_weight(c, state); + shares = cgroup_cpu_weight_to_shares(weight); + + log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s", + weight, shares, path); + } else if (cgroup_context_has_cpu_shares(c)) + shares = cgroup_context_cpu_shares(c, state); + else + shares = CGROUP_CPU_SHARES_DEFAULT; + + cgroup_apply_legacy_cpu_shares(u, shares); + cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec); + } + } + + if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) { + cgroup_apply_unified_cpuset(u, &c->cpuset_cpus, "cpuset.cpus"); + cgroup_apply_unified_cpuset(u, &c->cpuset_mems, "cpuset.mems"); + } + + /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2 + * controller), and in case of containers we want to leave control of these attributes to the container manager + * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). 
*/ + if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) { + char buf[8+DECIMAL_STR_MAX(uint64_t)+1]; + bool has_io, has_blockio; + uint64_t weight; + + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + if (has_io) + weight = cgroup_context_io_weight(c, state); + else if (has_blockio) { + uint64_t blkio_weight; + + blkio_weight = cgroup_context_blkio_weight(c, state); + weight = cgroup_weight_blkio_to_io(blkio_weight); + + log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64, + blkio_weight, weight); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + xsprintf(buf, "default %" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); + + /* FIXME: drop this when distro kernels properly support BFQ through "io.weight" + * See also: https://github.com/systemd/systemd/pull/13335 */ + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "io", "io.bfq.weight", buf); + + if (has_io) { + CGroupIODeviceLatency *latency; + CGroupIODeviceLimit *limit; + CGroupIODeviceWeight *w; + + LIST_FOREACH(device_weights, w, c->io_device_weights) + cgroup_apply_io_device_weight(u, w->path, w->weight); + + LIST_FOREACH(device_limits, limit, c->io_device_limits) + cgroup_apply_io_device_limit(u, limit->path, limit->limits); + + LIST_FOREACH(device_latencies, latency, c->io_device_latencies) + cgroup_apply_io_device_latency(u, latency->path, latency->target_usec); + + } else if (has_blockio) { + CGroupBlockIODeviceWeight *w; + CGroupBlockIODeviceBandwidth *b; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + weight = cgroup_weight_blkio_to_io(w->weight); + + log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_io_device_weight(u, w->path, weight); + } + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + uint64_t 
limits[_CGROUP_IO_LIMIT_TYPE_MAX]; + CGroupIOLimitType type; + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + limits[type] = cgroup_io_limit_defaults[type]; + + limits[CGROUP_IO_RBPS_MAX] = b->rbps; + limits[CGROUP_IO_WBPS_MAX] = b->wbps; + + log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s", + b->rbps, b->wbps, b->path); + + cgroup_apply_io_device_limit(u, b->path, limits); + } + } + } + + if (apply_mask & CGROUP_MASK_BLKIO) { + bool has_io, has_blockio; + + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be + * left to our container manager, too. */ + if (!is_local_root) { + char buf[DECIMAL_STR_MAX(uint64_t)+1]; + uint64_t weight; + + if (has_io) { + uint64_t io_weight; + + io_weight = cgroup_context_io_weight(c, state); + weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state)); + + log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64, + io_weight, weight); + } else if (has_blockio) + weight = cgroup_context_blkio_weight(c, state); + else + weight = CGROUP_BLKIO_WEIGHT_DEFAULT; + + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf); + + /* FIXME: drop this when distro kernels properly support BFQ through "blkio.weight" + * See also: https://github.com/systemd/systemd/pull/13335 */ + xsprintf(buf, "%" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "blkio", "blkio.bfq.weight", buf); + + if (has_io) { + CGroupIODeviceWeight *w; + + LIST_FOREACH(device_weights, w, c->io_device_weights) { + weight = cgroup_weight_io_to_blkio(w->weight); + + log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_blkio_device_weight(u, 
w->path, weight); + } + } else if (has_blockio) { + CGroupBlockIODeviceWeight *w; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) + cgroup_apply_blkio_device_weight(u, w->path, w->weight); + } + } + + /* The bandwidth limits are something that make sense to be applied to the host's root but not container + * roots, as there we want the container manager to handle it */ + if (is_host_root || !is_local_root) { + if (has_io) { + CGroupIODeviceLimit *l; + + LIST_FOREACH(device_limits, l, c->io_device_limits) { + log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s", + l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path); + + cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]); + } + } else if (has_blockio) { + CGroupBlockIODeviceBandwidth *b; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps); + } + } + } + + /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes' + * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we + * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even + * write to this if we wanted to.) 
*/ + if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) { + + if (cg_all_unified() > 0) { + uint64_t max, swap_max = CGROUP_LIMIT_MAX; + + if (unit_has_unified_memory_config(u)) { + max = c->memory_max; + swap_max = c->memory_swap_max; + } else { + max = c->memory_limit; + + if (max != CGROUP_LIMIT_MAX) + log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max); + } + + cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u)); + cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u)); + cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high); + cgroup_apply_unified_memory_limit(u, "memory.max", max); + cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max); + + (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group)); + + } else { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t val; + + if (unit_has_unified_memory_config(u)) { + val = c->memory_max; + log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val); + } else + val = c->memory_limit; + + if (val == CGROUP_LIMIT_MAX) + strncpy(buf, "-1\n", sizeof(buf)); + else + xsprintf(buf, "%" PRIu64 "\n", val); + + (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf); + } + } + + /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of + * containers, where we leave this to the manager */ + if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && + (is_host_root || cg_all_unified() > 0 || !is_local_root)) + (void) cgroup_apply_devices(u); + + if (apply_mask & CGROUP_MASK_PIDS) { + + if (is_host_root) { + /* So, the "pids" controller does not expose anything on the root cgroup, in order not to + * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when + * the knobs of the root cgroup are modified propagate this to the relevant sysctls. 
There's a + * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take + * exclusive ownership of the sysctls, but we still want to honour things if the user sets + * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit + * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded) + * it also counts. But if the user never set a limit through us (i.e. we are the default of + * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on + * the first time we set a limit. Note that this boolean is flushed out on manager reload, + * which is desirable so that there's an official way to release control of the sysctl from + * systemd: set the limit to unbounded and reload. */ + + if (tasks_max_isset(&c->tasks_max)) { + u->manager->sysctl_pid_max_changed = true; + r = procfs_tasks_set_limit(tasks_max_resolve(&c->tasks_max)); + } else if (u->manager->sysctl_pid_max_changed) + r = procfs_tasks_set_limit(TASKS_MAX); + else + r = 0; + if (r < 0) + log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, + "Failed to write to tasks limit sysctls: %m"); + } + + /* The attribute itself is not available on the host root cgroup, and in the container case we want to + * leave it for the container manager. 
*/ + if (!is_local_root) { + if (tasks_max_isset(&c->tasks_max)) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + + xsprintf(buf, "%" PRIu64 "\n", tasks_max_resolve(&c->tasks_max)); + (void) set_attribute_and_warn(u, "pids", "pids.max", buf); + } else + (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n"); + } + } + + if (apply_mask & CGROUP_MASK_BPF_FIREWALL) + cgroup_apply_firewall(u); +} + +static bool unit_get_needs_bpf_firewall(Unit *u) { + CGroupContext *c; + Unit *p; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + if (c->ip_accounting || + c->ip_address_allow || + c->ip_address_deny || + c->ip_filters_ingress || + c->ip_filters_egress) + return true; + + /* If any parent slice has an IP access list defined, it applies too */ + for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) { + c = unit_get_cgroup_context(p); + if (!c) + return false; + + if (c->ip_address_allow || + c->ip_address_deny) + return true; + } + + return false; +} + +static CGroupMask unit_get_cgroup_mask(Unit *u) { + CGroupMask mask = 0; + CGroupContext *c; + + assert(u); + + c = unit_get_cgroup_context(u); + + assert(c); + + /* Figure out which controllers we need, based on the cgroup context object */ + + if (c->cpu_accounting) + mask |= get_cpu_accounting_mask(); + + if (cgroup_context_has_cpu_weight(c) || + cgroup_context_has_cpu_shares(c) || + c->cpu_quota_per_sec_usec != USEC_INFINITY) + mask |= CGROUP_MASK_CPU; + + if (c->cpuset_cpus.set || c->cpuset_mems.set) + mask |= CGROUP_MASK_CPUSET; + + if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c)) + mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + + if (c->memory_accounting || + c->memory_limit != CGROUP_LIMIT_MAX || + unit_has_unified_memory_config(u)) + mask |= CGROUP_MASK_MEMORY; + + if (c->device_allow || + c->device_policy != CGROUP_DEVICE_POLICY_AUTO) + mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES; + + if (c->tasks_accounting || + 
tasks_max_isset(&c->tasks_max)) + mask |= CGROUP_MASK_PIDS; + + return CGROUP_MASK_EXTEND_JOINED(mask); +} + +static CGroupMask unit_get_bpf_mask(Unit *u) { + CGroupMask mask = 0; + + /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children + * too. */ + + if (unit_get_needs_bpf_firewall(u)) + mask |= CGROUP_MASK_BPF_FIREWALL; + + return mask; +} + +CGroupMask unit_get_own_mask(Unit *u) { + CGroupContext *c; + + /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty + * mask, as we shouldn't reflect it in the cgroup hierarchy then. */ + + if (u->load_state != UNIT_LOADED) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u); +} + +CGroupMask unit_get_delegate_mask(Unit *u) { + CGroupContext *c; + + /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the + * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers. + * + * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */ + + if (!unit_cgroup_delegate(u)) + return 0; + + if (cg_all_unified() <= 0) { + ExecContext *e; + + e = unit_get_exec_context(u); + if (e && !exec_context_maintains_privileges(e)) + return 0; + } + + assert_se(c = unit_get_cgroup_context(u)); + return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers); +} + +static CGroupMask unit_get_subtree_mask(Unit *u) { + + /* Returns the mask of this subtree, meaning of the group + * itself and its children. 
*/ + + return unit_get_own_mask(u) | unit_get_members_mask(u); +} + +CGroupMask unit_get_members_mask(Unit *u) { + assert(u); + + /* Returns the mask of controllers all of the unit's children require, merged */ + + if (u->cgroup_members_mask_valid) + return u->cgroup_members_mask; /* Use cached value if possible */ + + u->cgroup_members_mask = 0; + + if (u->type == UNIT_SLICE) { + void *v; + Unit *member; + + HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE]) + if (UNIT_DEREF(member->slice) == u) + u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */ + } + + u->cgroup_members_mask_valid = true; + return u->cgroup_members_mask; +} + +CGroupMask unit_get_siblings_mask(Unit *u) { + assert(u); + + /* Returns the mask of controllers all of the unit's siblings + * require, i.e. the members mask of the unit's parent slice + * if there is one. */ + + if (UNIT_ISSET(u->slice)) + return unit_get_members_mask(UNIT_DEREF(u->slice)); + + return unit_get_subtree_mask(u); /* we are the top-level slice */ +} + +static CGroupMask unit_get_disable_mask(Unit *u) { + CGroupContext *c; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + return c->disable_controllers; +} + +CGroupMask unit_get_ancestor_disable_mask(Unit *u) { + CGroupMask mask; + + assert(u); + mask = unit_get_disable_mask(u); + + /* Returns the mask of controllers which are marked as forcibly + * disabled in any ancestor unit or the unit in question. */ + + if (UNIT_ISSET(u->slice)) + mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice)); + + return mask; +} + +CGroupMask unit_get_target_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for a specific cgroup, i.e. everything it needs itself, + * plus all that its children need, plus all that its siblings + * need. 
This is primarily useful on the legacy cgroup + * hierarchy, where we need to duplicate each cgroup in each + * hierarchy that shall be enabled for it. */ + + mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u); + + if (mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported) + emit_bpf_firewall_warning(u); + + mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); + + return mask; +} + +CGroupMask unit_get_enable_mask(Unit *u) { + CGroupMask mask; + + /* This returns the cgroup mask of all controllers to enable + * for the children of a specific cgroup. This is primarily + * useful for the unified cgroup hierarchy, where each cgroup + * controls which controllers are enabled for its children. */ + + mask = unit_get_members_mask(u); + mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); + + return mask; +} + +void unit_invalidate_cgroup_members_masks(Unit *u) { + assert(u); + + /* Recurse invalidate the member masks cache all the way up the tree */ + u->cgroup_members_mask_valid = false; + + if (UNIT_ISSET(u->slice)) + unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice)); +} + +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { + + /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */ + + while (u) { + + if (u->cgroup_path && + u->cgroup_realized && + FLAGS_SET(u->cgroup_realized_mask, mask)) + return u->cgroup_path; + + u = UNIT_DEREF(u->slice); + } + + return NULL; +} + +static const char *migrate_callback(CGroupMask mask, void *userdata) { + /* If not realized at all, migrate to root (""). + * It may happen if we're upgrading from older version that didn't clean up. 
+ */ + return strempty(unit_get_realized_cgroup_path(userdata, mask)); +} + +char *unit_default_cgroup_path(const Unit *u) { + _cleanup_free_ char *escaped = NULL, *slice = NULL; + int r; + + assert(u); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return strdup(u->manager->cgroup_root); + + if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) { + r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice); + if (r < 0) + return NULL; + } + + escaped = cg_escape(u->id); + if (!escaped) + return NULL; + + return path_join(empty_to_root(u->manager->cgroup_root), slice, escaped); +} + +int unit_set_cgroup_path(Unit *u, const char *path) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + if (streq_ptr(u->cgroup_path, path)) + return 0; + + if (path) { + p = strdup(path); + if (!p) + return -ENOMEM; + } + + if (p) { + r = hashmap_put(u->manager->cgroup_unit, p, u); + if (r < 0) + return r; + } + + unit_release_cgroup(u); + u->cgroup_path = TAKE_PTR(p); + + return 1; +} + +int unit_watch_cgroup(Unit *u) { + _cleanup_free_ char *events = NULL; + int r; + + assert(u); + + /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if + * cgroupv2 is available. */ + + if (!u->cgroup_path) + return 0; + + if (u->cgroup_control_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m"); + if (r == 0) + return 0; + + /* No point in watch the top-level slice, it's never going to run empty. 
*/ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events); + if (r < 0) + return log_oom(); + + u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_control_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this + * is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path); + } + + r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m"); + + return 0; +} + +int unit_watch_cgroup_memory(Unit *u) { + _cleanup_free_ char *events = NULL; + CGroupContext *c; + int r; + + assert(u); + + /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if + * cgroupv2 is available. */ + + if (!u->cgroup_path) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie + * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after + * all. */ + if (!c->memory_accounting) + return 0; + + /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and + * we also don't want to generate a log message for each parent cgroup of a process. 
*/ + if (u->type == UNIT_SLICE) + return 0; + + if (u->cgroup_memory_inotify_wd >= 0) + return 0; + + /* Only applies to the unified hierarchy */ + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m"); + if (r == 0) + return 0; + + r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops); + if (r < 0) + return log_oom(); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events); + if (r < 0) + return log_oom(); + + u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (u->cgroup_memory_inotify_wd < 0) { + + if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this + * is not an error */ + return 0; + + return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path); + } + + r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m"); + + return 0; +} + +int unit_pick_cgroup_path(Unit *u) { + _cleanup_free_ char *path = NULL; + int r; + + assert(u); + + if (u->cgroup_path) + return 0; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + path = unit_default_cgroup_path(u); + if (!path) + return log_oom(); + + r = unit_set_cgroup_path(u, path); + if (r == -EEXIST) + return log_unit_error_errno(u, r, "Control group %s exists already.", path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path); + + return 0; +} + +static int cg_v1_errno_to_log_level(int r) { + return r == -EROFS ? 
LOG_DEBUG : LOG_WARNING; +} + +static int unit_update_cgroup( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask, + ManagerState state) { + + bool created, is_root_slice; + CGroupMask migrate_mask = 0; + int r; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* Figure out our cgroup path */ + r = unit_pick_cgroup_path(u); + if (r < 0) + return r; + + /* First, create our own group */ + r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path); + created = r; + + /* Start watching it */ + (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); + + + /* For v2 we preserve enabled controllers in delegated units, adjust others, + * for v1 we figure out which controller hierarchies need migration. */ + if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) { + CGroupMask result_mask = 0; + + /* Enable all controllers we need */ + r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path); + + /* Remember what's actually enabled now */ + u->cgroup_enabled_mask = result_mask; + + migrate_mask = u->cgroup_realized_mask ^ target_mask; + } + + /* Keep track that this is now realized */ + u->cgroup_realized = true; + u->cgroup_realized_mask = target_mask; + + /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling). + * + * Unnecessary controller cgroups are trimmed (after being emptied by upward migration). + * We perform migration also with whole slices for cases when users don't care about leaf + * granularity. Since delegated_mask is a subset of the target mask, we won't trim slice subtree containing + * delegated units.
+ * + * If we're in an nspawn container and using legacy cgroups, the controller hierarchies are mounted + * read-only into the container. We skip migration/trim in this scenario since it would fail + * regardless with noisy "Read-only filesystem" warnings. + */ + if (cg_all_unified() == 0) { + r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u); + if (r < 0) + log_unit_full_errno( + u, + cg_v1_errno_to_log_level(r), + r, + "Failed to migrate controller cgroups from %s, ignoring: %m", + u->cgroup_path); + + is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); + r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice); + if (r < 0) + log_unit_full_errno( + u, + cg_v1_errno_to_log_level(r), + r, + "Failed to delete controller cgroups %s, ignoring: %m", + u->cgroup_path); + } + + /* Set attributes */ + cgroup_context_apply(u, target_mask, state); + cgroup_xattr_apply(u); + + return 0; +} + +static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + char *pp; + int r; + + assert(u); + + if (MANAGER_IS_SYSTEM(u->manager)) + return -EINVAL; + + if (!u->manager->system_bus) + return -EIO; + + if (!u->cgroup_path) + return -EINVAL; + + /* Determine this unit's cgroup path relative to our cgroup root */ + pp = path_startswith(u->cgroup_path, u->manager->cgroup_root); + if (!pp) + return -EINVAL; + + pp = strjoina("/", pp, suffix_path); + path_simplify(pp, false); + + r = sd_bus_call_method(u->manager->system_bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "AttachProcessesToUnit", + &error, NULL, + "ssau", + NULL /* empty unit name means client's unit, i.e. 
us */, pp, 1, (uint32_t) pid); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r)); + + return 0; +} + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { + CGroupMask delegated_mask; + const char *p; + void *pidp; + int r, q; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + if (set_isempty(pids)) + return 0; + + /* Load any custom firewall BPF programs here once to test if they exist and are actually loadable. + * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */ + r = bpf_firewall_load_custom(u); + if (r < 0) + return r; + + r = unit_realize_cgroup(u); + if (r < 0) + return r; + + if (isempty(suffix_path)) + p = u->cgroup_path; + else + p = prefix_roota(u->cgroup_path, suffix_path); + + delegated_mask = unit_get_delegate_mask(u); + + r = 0; + SET_FOREACH(pidp, pids) { + pid_t pid = PTR_TO_PID(pidp); + CGroupController c; + + /* First, attach the PID to the main cgroup hierarchy */ + q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid); + if (q < 0) { + log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p); + + if (MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(q)) { + int z; + + /* If we are in a user instance, and we can't move the process ourselves due to + * permission problems, let's ask the system instance about it instead. Since it's more + * privileged it might be able to move the process across the leaves of a subtree whose + * top node is not owned by us. */ + + z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path); + if (z < 0) + log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p); + else + continue; /* When attaching via the bus worked we are fully done for this PID.
*/ + } + + if (r >= 0) + r = q; /* Remember first error */ + + continue; + } + + q = cg_all_unified(); + if (q < 0) + return q; + if (q > 0) + continue; + + /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the + * innermost realized one */ + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *realized; + + if (!(u->manager->cgroup_supported & bit)) + continue; + + /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */ + if (delegated_mask & u->cgroup_realized_mask & bit) { + q = cg_attach(cgroup_controller_to_string(c), p, pid); + if (q >= 0) + continue; /* Success! */ + + log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m", + pid, p, cgroup_controller_to_string(c)); + } + + /* So this controller is either not delegated or not realized, or something else weird happened. In + * that case let's attach the PID at least to the closest cgroup up the tree that is + * realized. */ + realized = unit_get_realized_cgroup_path(u, bit); + if (!realized) + continue; /* Not even realized in the root slice? Then let's not bother */ + + q = cg_attach(cgroup_controller_to_string(c), realized, pid); + if (q < 0) + log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m", + pid, realized, cgroup_controller_to_string(c)); + } + } + + return r; +} + +static bool unit_has_mask_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if this unit is fully realized. We check four things: + * + * 1. Whether the cgroup was created at all + * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1) + * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2) + * 4.
Whether the invalidation mask is currently zero + * + * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note + * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for + * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask + * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they + * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are + * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they + * simply don't matter.) */ + + return u->cgroup_realized && + ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && + ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && + u->cgroup_invalidated_mask == 0; +} + +static bool unit_has_mask_disables_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if all controllers which should be disabled are indeed disabled. + * + * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is + * already removed. */ + + return !u->cgroup_realized || + (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && + FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); +} + +static bool unit_has_mask_enables_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask) { + + assert(u); + + /* Returns true if all controllers which should be enabled are indeed enabled. + * + * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything + * we want to add is already added.
*/ + + return u->cgroup_realized && + ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) && + ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2); +} + +static void unit_add_to_cgroup_realize_queue(Unit *u) { + assert(u); + + if (u->in_cgroup_realize_queue) + return; + + LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + u->in_cgroup_realize_queue = true; +} + +static void unit_remove_from_cgroup_realize_queue(Unit *u) { + assert(u); + + if (!u->in_cgroup_realize_queue) + return; + + LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + u->in_cgroup_realize_queue = false; +} + +/* Controllers can only be enabled breadth-first, from the root of the + * hierarchy downwards to the unit in question. */ +static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) { + CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; + int r; + + assert(u); + + /* First go deal with this unit's parent, or we won't be able to enable + * any new controllers at this layer. */ + if (UNIT_ISSET(u->slice)) { + r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state); + if (r < 0) + return r; + } + + target_mask = unit_get_target_mask(u); + enable_mask = unit_get_enable_mask(u); + + /* We can only enable in this direction, don't try to disable anything. + */ + if (unit_has_mask_enables_realized(u, target_mask, enable_mask)) + return 0; + + new_target_mask = u->cgroup_realized_mask | target_mask; + new_enable_mask = u->cgroup_enabled_mask | enable_mask; + + return unit_update_cgroup(u, new_target_mask, new_enable_mask, state); +} + +/* Controllers can only be disabled depth-first, from the leaves of the + * hierarchy upwards to the unit in question. 
*/ +static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { + Unit *m; + void *v; + + assert(u); + + if (u->type != UNIT_SLICE) + return 0; + + HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE]) { + CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; + int r; + + if (UNIT_DEREF(m->slice) != u) + continue; + + /* The cgroup for this unit might not actually be fully + * realised yet, in which case it isn't holding any controllers + * open anyway. */ + if (!m->cgroup_realized) + continue; + + /* We must disable those below us first in order to release the + * controller. */ + if (m->type == UNIT_SLICE) + (void) unit_realize_cgroup_now_disable(m, state); + + target_mask = unit_get_target_mask(m); + enable_mask = unit_get_enable_mask(m); + + /* We can only disable in this direction, don't try to enable + * anything. */ + if (unit_has_mask_disables_realized(m, target_mask, enable_mask)) + continue; + + new_target_mask = m->cgroup_realized_mask & target_mask; + new_enable_mask = m->cgroup_enabled_mask & enable_mask; + + r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state); + if (r < 0) + return r; + } + + return 0; +} + +/* Check if necessary controllers and attributes for a unit are in place. + * + * - If so, do nothing. + * - If not, create paths, move processes over, and set attributes. + * + * Controllers can only be *enabled* in a breadth-first way, and *disabled* in + * a depth-first way. As such the process looks like this: + * + * Suppose we have a cgroup hierarchy which looks like this: + * + * root + * / \ + * / \ + * / \ + * a b + * / \ / \ + * / \ / \ + * c d e f + * / \ / \ / \ / \ + * h i j k l m n o + * + * 1. We want to realise cgroup "d" now. + * 2. cgroup "a" has DisableControllers=cpu in the associated unit. + * 3. cgroup "k" just started requesting the memory controller. + * + * To make this work we must do the following in order: + * + * 1. Disable CPU controller in k, j + * 2. 
Disable CPU controller in d + * 3. Enable memory controller in root + * 4. Enable memory controller in a + * 5. Enable memory controller in d + * 6. Enable memory controller in k + * + * Notice that we need to touch j in one direction, but not the other. We also + * don't go beyond d when disabling -- it's up to "a" to get realized if it + * wants to disable further. The basic rules are therefore: + * + * - If you're disabling something, you need to realise all of the cgroups from + * your recursive descendants to the root. This starts from the leaves. + * - If you're enabling something, you need to realise from the root cgroup + * downwards, but you don't need to iterate your recursive descendants. + * + * Returns 0 on success and < 0 on failure. */ +static int unit_realize_cgroup_now(Unit *u, ManagerState state) { + CGroupMask target_mask, enable_mask; + int r; + + assert(u); + + unit_remove_from_cgroup_realize_queue(u); + + target_mask = unit_get_target_mask(u); + enable_mask = unit_get_enable_mask(u); + + if (unit_has_mask_realized(u, target_mask, enable_mask)) + return 0; + + /* Disable controllers below us, if there are any */ + r = unit_realize_cgroup_now_disable(u, state); + if (r < 0) + return r; + + /* Enable controllers above us, if there are any */ + if (UNIT_ISSET(u->slice)) { + r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state); + if (r < 0) + return r; + } + + /* Now actually deal with the cgroup we were trying to realise and set attributes */ + r = unit_update_cgroup(u, target_mask, enable_mask, state); + if (r < 0) + return r; + + /* Now, reset the invalidation mask */ + u->cgroup_invalidated_mask = 0; + return 0; +} + +unsigned manager_dispatch_cgroup_realize_queue(Manager *m) { + ManagerState state; + unsigned n = 0; + Unit *i; + int r; + + assert(m); + + state = manager_state(m); + + while ((i = m->cgroup_realize_queue)) { + assert(i->in_cgroup_realize_queue); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) { + /* Maybe 
things changed, and the unit is not actually active anymore? */ + unit_remove_from_cgroup_realize_queue(i); + continue; + } + + r = unit_realize_cgroup_now(i, state); + if (r < 0) + log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id); + + n++; + } + + return n; +} + +void unit_add_family_to_cgroup_realize_queue(Unit *u) { + assert(u); + assert(u->type == UNIT_SLICE); + + /* The family of a unit is defined as the (immediate) children of the unit and the immediate children of all + * its ancestors. + * + * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes + * very weird if two units that own processes reside in the same slice, but one is realized in the + * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does + * not), because that means individual processes need to be scheduled against whole cgroups. Let's + * avoid this asymmetry by ensuring that siblings of a unit are always realized in their v1 + * controller hierarchies too (if the unit requires the controller to be realized). + * + * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date + * masks. */ + + do { + Unit *m; + void *v; + + /* Children of u likely changed when we're called */ + u->cgroup_members_mask_valid = false; + + HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE]) { + /* Skip units that have a dependency on the slice but aren't actually in it. */ + if (UNIT_DEREF(m->slice) != u) + continue; + + /* No point in doing cgroup application for units without active processes. */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m))) + continue; + + /* We only enqueue siblings if they were realized once at least, in the main + * hierarchy. */ + if (!m->cgroup_realized) + continue; + + /* If the unit doesn't need any new controllers and has current ones realized, it + * doesn't need any changes.
*/ + if (unit_has_mask_realized(m, + unit_get_target_mask(m), + unit_get_enable_mask(m))) + continue; + + unit_add_to_cgroup_realize_queue(m); + } + + /* Parent comes after children */ + unit_add_to_cgroup_realize_queue(u); + } while ((u = UNIT_DEREF(u->slice))); +} + +int unit_realize_cgroup(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all + * parents, but there's more actually: for the weight-based controllers we also need to make sure + * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the + * other hand, when a controller is removed from realized set, it may become unnecessary in siblings + * and ancestors and they should be (de)realized too. + * + * This call will defer work on the siblings and derealized ancestors to the next event loop + * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */ + + if (UNIT_ISSET(u->slice)) + unit_add_family_to_cgroup_realize_queue(UNIT_DEREF(u->slice)); + + /* And realize this one now (and apply the values) */ + return unit_realize_cgroup_now(u, manager_state(u->manager)); +} + +void unit_release_cgroup(Unit *u) { + assert(u); + + /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call + * when we close down everything for reexecution, where we really want to leave the cgroup in place. 
*/ + + if (u->cgroup_path) { + (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); + u->cgroup_path = mfree(u->cgroup_path); + } + + if (u->cgroup_control_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd)); + u->cgroup_control_inotify_wd = -1; + } + + if (u->cgroup_memory_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id); + + (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd)); + u->cgroup_memory_inotify_wd = -1; + } +} + +bool unit_maybe_release_cgroup(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return true; + + /* Don't release the cgroup if there are still processes under it. If we get notified later when all the + * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed) + * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned + * up later. */ + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) + log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m"); + else if (r == 1) { + unit_release_cgroup(u); + return true; + } + + return false; +} + +void unit_prune_cgroup(Unit *u) { + int r; + bool is_root_slice; + + assert(u); + + /* Removes the cgroup, if empty and possible, and stops watching it. 
*/ + + if (!u->cgroup_path) + return; + + (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */ + + is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); + + r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice); + if (r < 0) + /* One reason we could have failed here is, that the cgroup still contains a process. + * However, if the cgroup becomes removable at a later time, it might be removed when + * the containing slice is stopped. So even if we failed now, this unit shouldn't assume + * that the cgroup is still realized the next time it is started. Do not return early + * on error, continue cleanup. */ + log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path); + + if (is_root_slice) + return; + + if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */ + return; + + u->cgroup_realized = false; + u->cgroup_realized_mask = 0; + u->cgroup_enabled_mask = 0; + + u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed); +} + +int unit_search_main_pid(Unit *u, pid_t *ret) { + _cleanup_fclose_ FILE *f = NULL; + pid_t pid = 0, npid; + int r; + + assert(u); + assert(ret); + + if (!u->cgroup_path) + return -ENXIO; + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f); + if (r < 0) + return r; + + while (cg_read_pid(f, &npid) > 0) { + + if (npid == pid) + continue; + + if (pid_is_my_child(npid) == 0) + continue; + + if (pid != 0) + /* Dang, there's more than one daemonized PID + in this group, so we don't know what process + is the main process. 
*/ + + return -ENODATA; + + pid = npid; + } + + *ret = pid; + return 0; +} + +static int unit_watch_pids_in_path(Unit *u, const char *path) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int ret = 0, r; + + assert(u); + assert(path); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f); + if (r < 0) + ret = r; + else { + pid_t pid; + + while ((r = cg_read_pid(f, &pid)) > 0) { + r = unit_watch_pid(u, pid, false); + if (r < 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) { + if (ret >= 0) + ret = r; + } else { + char *fn; + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = path_join(empty_to_root(path), fn); + free(fn); + + if (!p) + return -ENOMEM; + + r = unit_watch_pids_in_path(u, p); + if (r < 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + } + + return ret; +} + +int unit_synthesize_cgroup_empty_event(Unit *u) { + int r; + + assert(u); + + /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility + * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can + * get as notification source as soon as we stopped having any useful PIDs to watch for. */ + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we have reliable notifications, and don't need this */ + return 0; + + if (!set_isempty(u->pids)) + return 0; + + unit_add_to_cgroup_empty_queue(u); + return 0; +} + +int unit_watch_all_pids(Unit *u) { + int r; + + assert(u); + + /* Adds all PIDs from our cgroup to the set of PIDs we + * watch. This is a fallback logic for cases where we do not + * get reliable cgroup empty notifications: we try to use + * SIGCHLD as replacement. 
*/ + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + return unit_watch_pids_in_path(u, u->cgroup_path); +} + +static int on_cgroup_empty_event(sd_event_source *s, void *userdata) { + Manager *m = userdata; + Unit *u; + int r; + + assert(s); + assert(m); + + u = m->cgroup_empty_queue; + if (!u) + return 0; + + assert(u->in_cgroup_empty_queue); + u->in_cgroup_empty_queue = false; + LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u); + + if (m->cgroup_empty_queue) { + /* More stuff queued, let's make sure we remain enabled */ + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m"); + } + + unit_add_to_gc_queue(u); + + if (UNIT_VTABLE(u)->notify_cgroup_empty) + UNIT_VTABLE(u)->notify_cgroup_empty(u); + + return 0; +} + +void unit_add_to_cgroup_empty_queue(Unit *u) { + int r; + + assert(u); + + /* Note that there are four different ways how cgroup empty events reach us: + * + * 1. On the unified hierarchy we get an inotify event on the cgroup + * + * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket + * + * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus + * + * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as + * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications. + * + * Regardless which way we got the notification, we'll verify it here, and then add it to a separate + * queue. 
This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use + * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending + * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the + * case for scope units). */ + + if (u->in_cgroup_empty_queue) + return; + + /* Let's verify that the cgroup is really empty */ + if (!u->cgroup_path) + return; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path); + return; + } + if (r == 0) + return; + + LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + u->in_cgroup_empty_queue = true; + + /* Trigger the defer event */ + r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to enable cgroup empty event source: %m"); +} + +static void unit_remove_from_cgroup_empty_queue(Unit *u) { + assert(u); + + if (!u->in_cgroup_empty_queue) + return; + + LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + u->in_cgroup_empty_queue = false; +} + +int unit_check_oomd_kill(Unit *u) { + _cleanup_free_ char *value = NULL; + bool increased; + uint64_t n = 0; + int r; + + if (!u->cgroup_path) + return 0; + + r = cg_all_unified(); + if (r < 0) + return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m"); + else if (r == 0) + return 0; + + r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value); + if (r < 0 && r != -ENODATA) + return r; + + if (!isempty(value)) { + r = safe_atou64(value, &n); + if (r < 0) + return r; + } + + increased = n > u->managed_oom_kill_last; + u->managed_oom_kill_last = n; + + if (!increased) + return 0; + + if (n > 0) + log_struct(LOG_NOTICE, + "MESSAGE_ID=" 
SD_MESSAGE_UNIT_OOMD_KILL_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n)); + + return 1; +} + +int unit_check_oom(Unit *u) { + _cleanup_free_ char *oom_kill = NULL; + bool increased; + uint64_t c; + int r; + + if (!u->cgroup_path) + return 0; + + r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m"); + + r = safe_atou64(oom_kill, &c); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m"); + + increased = c > u->oom_kill_last; + u->oom_kill_last = c; + + if (!increased) + return 0; + + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer.")); + + if (UNIT_VTABLE(u)->notify_cgroup_oom) + UNIT_VTABLE(u)->notify_cgroup_oom(u); + + return 1; +} + +static int on_cgroup_oom_event(sd_event_source *s, void *userdata) { + Manager *m = userdata; + Unit *u; + int r; + + assert(s); + assert(m); + + u = m->cgroup_oom_queue; + if (!u) + return 0; + + assert(u->in_cgroup_oom_queue); + u->in_cgroup_oom_queue = false; + LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u); + + if (m->cgroup_oom_queue) { + /* More stuff queued, let's make sure we remain enabled */ + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m"); + } + + (void) unit_check_oom(u); + return 0; +} + +static void unit_add_to_cgroup_oom_queue(Unit *u) { + int r; + + assert(u); + + if (u->in_cgroup_oom_queue) + return; + if (!u->cgroup_path) + return; + + LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u); + u->in_cgroup_oom_queue = true; + + /* Trigger the defer event */ + if 
(!u->manager->cgroup_oom_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager); + if (r < 0) { + log_error_errno(r, "Failed to create cgroup oom event source: %m"); + return; + } + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) { + log_error_errno(r, "Failed to set priority of cgroup oom event source: %m"); + return; + } + + (void) sd_event_source_set_description(s, "cgroup-oom"); + u->manager->cgroup_oom_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_error_errno(r, "Failed to enable cgroup oom event source: %m"); +} + +static int unit_check_cgroup_events(Unit *u) { + char *values[2] = {}; + int r; + + assert(u); + + r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", + STRV_MAKE("populated", "frozen"), values); + if (r < 0) + return r; + + /* The cgroup.events notifications can be merged together so act as we saw the given state for the + * first time. The functions we call to handle given state are idempotent, which makes them + * effectively remember the previous state. 
*/ + if (values[0]) { + if (streq(values[0], "1")) + unit_remove_from_cgroup_empty_queue(u); + else + unit_add_to_cgroup_empty_queue(u); + } + + /* Disregard freezer state changes due to operations not initiated by us */ + if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) { + if (streq(values[1], "0")) + unit_thawed(u); + else + unit_frozen(u); + } + + free(values[0]); + free(values[1]); + + return 0; +} + +static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(s); + assert(fd >= 0); + assert(m); + + for (;;) { + union inotify_event_buffer buffer; + struct inotify_event *e; + ssize_t l; + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + return log_error_errno(errno, "Failed to read control group inotify events: %m"); + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + Unit *u; + + if (e->wd < 0) + /* Queue overflow has no watch descriptor */ + continue; + + if (e->mask & IN_IGNORED) + /* The watch was just removed */ + continue; + + /* Note that inotify might deliver events for a watch even after it was removed, + * because it was queued before the removal. Let's ignore this here safely. 
*/ + + u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (u) + unit_check_cgroup_events(u); + + u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd)); + if (u) + unit_add_to_cgroup_oom_queue(u); + } + } +} + +static int cg_bpf_mask_supported(CGroupMask *ret) { + CGroupMask mask = 0; + int r; + + /* BPF-based firewall */ + r = bpf_firewall_supported(); + if (r > 0) + mask |= CGROUP_MASK_BPF_FIREWALL; + + /* BPF-based device access control */ + r = bpf_devices_supported(); + if (r > 0) + mask |= CGROUP_MASK_BPF_DEVICES; + + *ret = mask; + return 0; +} + +int manager_setup_cgroup(Manager *m) { + _cleanup_free_ char *path = NULL; + const char *scope_path; + CGroupController c; + int r, all_unified; + CGroupMask mask; + char *e; + + assert(m); + + /* 1. Determine hierarchy */ + m->cgroup_root = mfree(m->cgroup_root); + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root); + if (r < 0) + return log_error_errno(r, "Cannot determine cgroup we are running in: %m"); + + /* Chop off the init scope, if we are already located in it */ + e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + + /* LEGACY: Also chop off the system slice if we are in + * it. This is to support live upgrades from older systemd + * versions where PID 1 was moved there. Also see + * cg_get_root_path(). */ + if (!e && MANAGER_IS_SYSTEM(m)) { + e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE); + if (!e) + e = endswith(m->cgroup_root, "/system"); /* even more legacy */ + } + if (e) + *e = 0; + + /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can + * easily prepend it everywhere. */ + delete_trailing_chars(m->cgroup_root, "/"); + + /* 2. 
Show data */ + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path); + if (r < 0) + return log_error_errno(r, "Cannot find cgroup mount point: %m"); + + r = cg_unified(); + if (r < 0) + return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m"); + + all_unified = cg_all_unified(); + if (all_unified < 0) + return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m"); + if (all_unified > 0) + log_debug("Unified cgroup hierarchy is located at %s.", path); + else { + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m"); + if (r > 0) + log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path); + else + log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path); + } + + /* 3. Allocate cgroup empty defer event source */ + m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source); + r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m); + if (r < 0) + return log_error_errno(r, "Failed to create cgroup empty event source: %m"); + + /* Schedule cgroup empty checks early, but after having processed service notification messages or + * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of + * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. 
*/ + r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m"); + + r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to disable cgroup empty event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty"); + + /* 4. Install notifier inotify object, or agent */ + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { + + /* In the unified hierarchy we can get cgroup empty notifications via inotify. */ + + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + safe_close(m->cgroup_inotify_fd); + + m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->cgroup_inotify_fd < 0) + return log_error_errno(errno, "Failed to create control group inotify object: %m"); + + r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m); + if (r < 0) + return log_error_errno(r, "Failed to watch control group inotify object: %m"); + + /* Process cgroup empty notifications early. Note that when this event is dispatched it'll + * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see + * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9); + if (r < 0) + return log_error_errno(r, "Failed to set priority of inotify event source: %m"); + + (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); + + } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) { + + /* On the legacy hierarchy we only get notifications via cgroup agents. 
(Which isn't really reliable, + * since it does not generate events when control groups with children run empty. */ + + r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH); + if (r < 0) + log_warning_errno(r, "Failed to install release agent, ignoring: %m"); + else if (r > 0) + log_debug("Installed release agent."); + else if (r == 0) + log_debug("Release agent already installed."); + } + + /* 5. Make sure we are in the special "init.scope" unit in the root slice. */ + scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r >= 0) { + /* Also, move all other userspace processes remaining in the root cgroup into that scope. */ + r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r < 0) + log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m"); + + /* 6. And pin it, so that it cannot be unmounted */ + safe_close(m->pin_cgroupfs_fd); + m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK); + if (m->pin_cgroupfs_fd < 0) + return log_error_errno(errno, "Failed to open pin file: %m"); + + } else if (!MANAGER_IS_TEST_RUN(m)) + return log_error_errno(r, "Failed to create %s control group: %m", scope_path); + + /* 7. Always enable hierarchical support if it exists... */ + if (!all_unified && !MANAGER_IS_TEST_RUN(m)) + (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); + + /* 8. Figure out which controllers are supported */ + r = cg_mask_supported(&m->cgroup_supported); + if (r < 0) + return log_error_errno(r, "Failed to determine supported controllers: %m"); + + /* 9. Figure out which bpf-based pseudo-controllers are supported */ + r = cg_bpf_mask_supported(&mask); + if (r < 0) + return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m"); + m->cgroup_supported |= mask; + + /* 10. 
Log which controllers are supported */ + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) + log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c))); + + return 0; +} + +void manager_shutdown_cgroup(Manager *m, bool delete) { + assert(m); + + /* We can't really delete the group, since we are in it. But + * let's trim it. */ + if (delete && m->cgroup_root && m->test_run_flags != MANAGER_TEST_RUN_MINIMAL) + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false); + + m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source); + + m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit); + m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit); + + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd); + + m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd); + + m->cgroup_root = mfree(m->cgroup_root); +} + +Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { + char *p; + Unit *u; + + assert(m); + assert(cgroup); + + u = hashmap_get(m->cgroup_unit, cgroup); + if (u) + return u; + + p = strdupa(cgroup); + for (;;) { + char *e; + + e = strrchr(p, '/'); + if (!e || e == p) + return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE); + + *e = 0; + + u = hashmap_get(m->cgroup_unit, p); + if (u) + return u; + } +} + +Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) { + _cleanup_free_ char *cgroup = NULL; + + assert(m); + + if (!pid_is_valid(pid)) + return NULL; + + if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0) + return NULL; + + return manager_get_unit_by_cgroup(m, cgroup); +} + +Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) { + Unit *u, **array; + + assert(m); + + /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most + 
* cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most + * relevant one as children of the process will be assigned to that one, too, before all else. */ + + if (!pid_is_valid(pid)) + return NULL; + + if (pid == getpid_cached()) + return hashmap_get(m->units, SPECIAL_INIT_SCOPE); + + u = manager_get_unit_by_pid_cgroup(m, pid); + if (u) + return u; + + u = hashmap_get(m->watch_pids, PID_TO_PTR(pid)); + if (u) + return u; + + array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid)); + if (array) + return array[0]; + + return NULL; +} + +int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { + Unit *u; + + assert(m); + assert(cgroup); + + /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process + * or from the --system instance */ + + log_debug("Got cgroup empty notification for: %s", cgroup); + + u = manager_get_unit_by_cgroup(m, cgroup); + if (!u) + return 0; + + unit_add_to_cgroup_empty_queue(u); + return 1; +} + +int unit_get_memory_current(Unit *u, uint64_t *ret) { + int r; + + assert(u); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, memory_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_memory_get_used(ret); + + if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + + return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? 
"memory.current" : "memory.usage_in_bytes", ret); +} + +int unit_get_tasks_current(Unit *u, uint64_t *ret) { + assert(u); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, tasks_accounting)) + return -ENODATA; + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_tasks_get_current(ret); + + if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0) + return -ENODATA; + + return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret); +} + +static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { + uint64_t ns; + int r; + + assert(u); + assert(ret); + + if (!u->cgroup_path) + return -ENODATA; + + /* The root cgroup doesn't expose this information, let's get it from /proc instead */ + if (unit_has_host_root_cgroup(u)) + return procfs_cpu_get_usage(ret); + + /* Requisite controllers for CPU accounting are not enabled */ + if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) { + _cleanup_free_ char *val = NULL; + uint64_t us; + + r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val); + if (IN_SET(r, -ENOENT, -ENXIO)) + return -ENODATA; + if (r < 0) + return r; + + r = safe_atou64(val, &us); + if (r < 0) + return r; + + ns = us * NSEC_PER_USEC; + } else + return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret); + + *ret = ns; + return 0; +} + +int unit_get_cpu_usage(Unit *u, nsec_t *ret) { + nsec_t ns; + int r; + + assert(u); + + /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was + * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply + * call this function with a NULL return value. 
*/ + + if (!UNIT_CGROUP_BOOL(u, cpu_accounting)) + return -ENODATA; + + r = unit_get_cpu_usage_raw(u, &ns); + if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) { + /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our + * cached value. */ + + if (ret) + *ret = u->cpu_usage_last; + return 0; + } + if (r < 0) + return r; + + if (ns > u->cpu_usage_base) + ns -= u->cpu_usage_base; + else + ns = 0; + + u->cpu_usage_last = ns; + if (ret) + *ret = ns; + + return 0; +} + +int unit_get_ip_accounting( + Unit *u, + CGroupIPAccountingMetric metric, + uint64_t *ret) { + + uint64_t value; + int fd, r; + + assert(u); + assert(metric >= 0); + assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX); + assert(ret); + + if (!UNIT_CGROUP_BOOL(u, ip_accounting)) + return -ENODATA; + + fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + if (fd < 0) + return -ENODATA; + + if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) + r = bpf_firewall_read_accounting(fd, &value, NULL); + else + r = bpf_firewall_read_accounting(fd, NULL, &value); + if (r < 0) + return r; + + /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile + * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the + * ip_accounting_extra[] field, and add them in here transparently. 
*/ + + *ret = value + u->ip_accounting_extra[metric]; + + return r; +} + +static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) { + static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "rbytes=", + [CGROUP_IO_WRITE_BYTES] = "wbytes=", + [CGROUP_IO_READ_OPERATIONS] = "rios=", + [CGROUP_IO_WRITE_OPERATIONS] = "wios=", + }; + uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {}; + _cleanup_free_ char *path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(u); + + if (!u->cgroup_path) + return -ENODATA; + + if (unit_has_host_root_cgroup(u)) + return -ENODATA; /* TODO: return useful data for the top-level cgroup */ + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) /* TODO: support cgroupv1 */ + return -ENODATA; + + if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO)) + return -ENODATA; + + r = cg_get_path("io", u->cgroup_path, "io.stat", &path); + if (r < 0) + return r; + + f = fopen(path, "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + p = line; + p += strcspn(p, WHITESPACE); /* Skip over device major/minor */ + p += strspn(p, WHITESPACE); /* Skip over following whitespace */ + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + if (r == 0) + break; + + for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) { + const char *x; + + x = startswith(word, field_names[i]); + if (x) { + uint64_t w; + + r = safe_atou64(x, &w); + if (r < 0) + return r; + + /* Sum up the stats of all devices */ + acc[i] += w; + break; + } + } + } + } + + memcpy(ret, acc, sizeof(acc)); + return 0; +} + +int unit_get_io_accounting( + Unit *u, + CGroupIOAccountingMetric metric, + bool allow_cache, + uint64_t 
*ret) { + + uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; + int r; + + /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */ + + if (!UNIT_CGROUP_BOOL(u, io_accounting)) + return -ENODATA; + + if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX) + goto done; + + r = unit_get_io_accounting_raw(u, raw); + if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX) + goto done; + if (r < 0) + return r; + + for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) { + /* Saturated subtraction */ + if (raw[i] > u->io_accounting_base[i]) + u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i]; + else + u->io_accounting_last[i] = 0; + } + +done: + if (ret) + *ret = u->io_accounting_last[metric]; + + return 0; +} + +int unit_reset_cpu_accounting(Unit *u) { + int r; + + assert(u); + + u->cpu_usage_last = NSEC_INFINITY; + + r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base); + if (r < 0) { + u->cpu_usage_base = 0; + return r; + } + + return 0; +} + +int unit_reset_ip_accounting(Unit *u) { + int r = 0, q = 0; + + assert(u); + + if (u->ip_accounting_ingress_map_fd >= 0) + r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd); + + if (u->ip_accounting_egress_map_fd >= 0) + q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd); + + zero(u->ip_accounting_extra); + + return r < 0 ? r : q; +} + +int unit_reset_io_accounting(Unit *u) { + int r; + + assert(u); + + for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) + u->io_accounting_last[i] = UINT64_MAX; + + r = unit_get_io_accounting_raw(u, u->io_accounting_base); + if (r < 0) { + zero(u->io_accounting_base); + return r; + } + + return 0; +} + +int unit_reset_accounting(Unit *u) { + int r, q, v; + + assert(u); + + r = unit_reset_cpu_accounting(u); + q = unit_reset_io_accounting(u); + v = unit_reset_ip_accounting(u); + + return r < 0 ? r : q < 0 ? 
q : v; +} + +void unit_invalidate_cgroup(Unit *u, CGroupMask m) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (m == 0) + return; + + /* always invalidate compat pairs together */ + if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)) + m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + + if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT)) + m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT; + + if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */ + return; + + u->cgroup_invalidated_mask |= m; + unit_add_to_cgroup_realize_queue(u); +} + +void unit_invalidate_cgroup_bpf(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */ + return; + + u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + unit_add_to_cgroup_realize_queue(u); + + /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access + * list of our children includes our own. */ + if (u->type == UNIT_SLICE) { + Unit *member; + void *v; + + HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE]) + if (UNIT_DEREF(member->slice) == u) + unit_invalidate_cgroup_bpf(member); + } +} + +bool unit_cgroup_delegate(Unit *u) { + CGroupContext *c; + + assert(u); + + if (!UNIT_VTABLE(u)->can_delegate) + return false; + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return c->delegate; +} + +void manager_invalidate_startup_units(Manager *m) { + Unit *u; + + assert(m); + + SET_FOREACH(u, m->startup_units) + unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO); +} + +static int unit_get_nice(Unit *u) { + ExecContext *ec; + + ec = unit_get_exec_context(u); + return ec ? ec->nice : 0; +} + +static uint64_t unit_get_cpu_weight(Unit *u) { + ManagerState state = manager_state(u->manager); + CGroupContext *cc; + + cc = unit_get_cgroup_context(u); + return cc ? 
cgroup_context_cpu_weight(cc, state) : CGROUP_WEIGHT_DEFAULT; +} + +int compare_job_priority(const void *a, const void *b) { + const Job *x = a, *y = b; + int nice_x, nice_y; + uint64_t weight_x, weight_y; + int ret; + + if ((ret = CMP(x->unit->type, y->unit->type)) != 0) + return -ret; + + weight_x = unit_get_cpu_weight(x->unit); + weight_y = unit_get_cpu_weight(y->unit); + + if ((ret = CMP(weight_x, weight_y)) != 0) + return -ret; + + nice_x = unit_get_nice(x->unit); + nice_y = unit_get_nice(y->unit); + + if ((ret = CMP(nice_x, nice_y)) != 0) + return ret; + + return strcmp(x->unit->id, y->unit->id); +} + +int unit_cgroup_freezer_action(Unit *u, FreezerAction action) { + _cleanup_free_ char *path = NULL; + FreezerState target, kernel = _FREEZER_STATE_INVALID; + int r; + + assert(u); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + if (!cg_freezer_supported()) + return 0; + + if (!u->cgroup_realized) + return -EBUSY; + + target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING; + + r = unit_freezer_state_kernel(u, &kernel); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m"); + + if (target == kernel) { + u->freezer_state = target; + return 0; + } + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path); + if (r < 0) + return r; + + log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? 
"Freezing" : "Thawing"); + + if (action == FREEZER_FREEZE) + u->freezer_state = FREEZER_FREEZING; + else + u->freezer_state = FREEZER_THAWING; + + r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + return 1; +} + +static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { + [CGROUP_DEVICE_POLICY_AUTO] = "auto", + [CGROUP_DEVICE_POLICY_CLOSED] = "closed", + [CGROUP_DEVICE_POLICY_STRICT] = "strict", +}; + +int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + _cleanup_free_ char *v = NULL; + int r; + + assert(u); + assert(cpus); + + if (!u->cgroup_path) + return -ENODATA; + + if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + + r = cg_get_attribute("cpuset", u->cgroup_path, name, &v); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL); +} + +DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); + +static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { + [FREEZER_FREEZE] = "freeze", + [FREEZER_THAW] = "thaw", +}; + +DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); diff --git a/src/core/cgroup.h b/src/core/cgroup.h new file mode 100644 index 0000000..66f3a63 --- /dev/null +++ b/src/core/cgroup.h @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "cgroup-util.h" +#include "cpu-set-util.h" +#include "ip-address-access.h" +#include "list.h" +#include "time-util.h" + +typedef struct TasksMax { + /* If scale == 0, just use value; otherwise, value / scale. + * See tasks_max_resolve(). 
*/ + uint64_t value; + uint64_t scale; +} TasksMax; + +#define TASKS_MAX_UNSET ((TasksMax) { .value = UINT64_MAX, .scale = 0 }) + +static inline bool tasks_max_isset(const TasksMax *tasks_max) { + return tasks_max->value != UINT64_MAX || tasks_max->scale != 0; +} + +uint64_t tasks_max_resolve(const TasksMax *tasks_max); + +typedef struct CGroupContext CGroupContext; +typedef struct CGroupDeviceAllow CGroupDeviceAllow; +typedef struct CGroupIODeviceWeight CGroupIODeviceWeight; +typedef struct CGroupIODeviceLimit CGroupIODeviceLimit; +typedef struct CGroupIODeviceLatency CGroupIODeviceLatency; +typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight; +typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth; + +typedef enum CGroupDevicePolicy { + /* When devices listed, will allow those, plus built-in ones, if none are listed will allow + * everything. */ + CGROUP_DEVICE_POLICY_AUTO, + + /* Everything forbidden, except built-in ones and listed ones. */ + CGROUP_DEVICE_POLICY_CLOSED, + + /* Everything forbidden, except for the listed devices */ + CGROUP_DEVICE_POLICY_STRICT, + + _CGROUP_DEVICE_POLICY_MAX, + _CGROUP_DEVICE_POLICY_INVALID = -1 +} CGroupDevicePolicy; + +typedef enum FreezerAction { + FREEZER_FREEZE, + FREEZER_THAW, + + _FREEZER_ACTION_MAX, + _FREEZER_ACTION_INVALID = -1, +} FreezerAction; + +struct CGroupDeviceAllow { + LIST_FIELDS(CGroupDeviceAllow, device_allow); + char *path; + bool r:1; + bool w:1; + bool m:1; +}; + +struct CGroupIODeviceWeight { + LIST_FIELDS(CGroupIODeviceWeight, device_weights); + char *path; + uint64_t weight; +}; + +struct CGroupIODeviceLimit { + LIST_FIELDS(CGroupIODeviceLimit, device_limits); + char *path; + uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX]; +}; + +struct CGroupIODeviceLatency { + LIST_FIELDS(CGroupIODeviceLatency, device_latencies); + char *path; + usec_t target_usec; +}; + +struct CGroupBlockIODeviceWeight { + LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights); + char *path; + 
uint64_t weight; +}; + +struct CGroupBlockIODeviceBandwidth { + LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths); + char *path; + uint64_t rbps; + uint64_t wbps; +}; + +struct CGroupContext { + bool cpu_accounting; + bool io_accounting; + bool blockio_accounting; + bool memory_accounting; + bool tasks_accounting; + bool ip_accounting; + + /* Configures the memory.oom.group attribute (on unified) */ + bool memory_oom_group; + + bool delegate; + CGroupMask delegate_controllers; + CGroupMask disable_controllers; + + /* For unified hierarchy */ + uint64_t cpu_weight; + uint64_t startup_cpu_weight; + usec_t cpu_quota_per_sec_usec; + usec_t cpu_quota_period_usec; + + CPUSet cpuset_cpus; + CPUSet cpuset_mems; + + uint64_t io_weight; + uint64_t startup_io_weight; + LIST_HEAD(CGroupIODeviceWeight, io_device_weights); + LIST_HEAD(CGroupIODeviceLimit, io_device_limits); + LIST_HEAD(CGroupIODeviceLatency, io_device_latencies); + + uint64_t default_memory_min; + uint64_t default_memory_low; + uint64_t memory_min; + uint64_t memory_low; + uint64_t memory_high; + uint64_t memory_max; + uint64_t memory_swap_max; + + bool default_memory_min_set; + bool default_memory_low_set; + bool memory_min_set; + bool memory_low_set; + + LIST_HEAD(IPAddressAccessItem, ip_address_allow); + LIST_HEAD(IPAddressAccessItem, ip_address_deny); + + char **ip_filters_ingress; + char **ip_filters_egress; + + /* For legacy hierarchies */ + uint64_t cpu_shares; + uint64_t startup_cpu_shares; + + uint64_t blockio_weight; + uint64_t startup_blockio_weight; + LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights); + LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths); + + uint64_t memory_limit; + + CGroupDevicePolicy device_policy; + LIST_HEAD(CGroupDeviceAllow, device_allow); + + /* Common */ + TasksMax tasks_max; + + /* Settings for systemd-oomd */ + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; + int moom_mem_pressure_limit; +}; + +/* Used when querying IP 
accounting data */ +typedef enum CGroupIPAccountingMetric { + CGROUP_IP_INGRESS_BYTES, + CGROUP_IP_INGRESS_PACKETS, + CGROUP_IP_EGRESS_BYTES, + CGROUP_IP_EGRESS_PACKETS, + _CGROUP_IP_ACCOUNTING_METRIC_MAX, + _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -1, +} CGroupIPAccountingMetric; + +/* Used when querying IO accounting data */ +typedef enum CGroupIOAccountingMetric { + CGROUP_IO_READ_BYTES, + CGROUP_IO_WRITE_BYTES, + CGROUP_IO_READ_OPERATIONS, + CGROUP_IO_WRITE_OPERATIONS, + _CGROUP_IO_ACCOUNTING_METRIC_MAX, + _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -1, +} CGroupIOAccountingMetric; + +typedef struct Unit Unit; +typedef struct Manager Manager; + +usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period); + +void cgroup_context_init(CGroupContext *c); +void cgroup_context_done(CGroupContext *c); +void cgroup_context_dump(Unit *u, FILE* f, const char *prefix); + +void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a); +void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w); +void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l); +void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l); +void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w); +void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b); + +int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); + +CGroupMask unit_get_own_mask(Unit *u); +CGroupMask unit_get_delegate_mask(Unit *u); +CGroupMask unit_get_members_mask(Unit *u); +CGroupMask unit_get_siblings_mask(Unit *u); +CGroupMask unit_get_ancestor_disable_mask(Unit *u); + +CGroupMask unit_get_target_mask(Unit *u); +CGroupMask unit_get_enable_mask(Unit *u); + +void unit_invalidate_cgroup_members_masks(Unit *u); + +void unit_add_family_to_cgroup_realize_queue(Unit *u); + +const char 
*unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); +char *unit_default_cgroup_path(const Unit *u); +int unit_set_cgroup_path(Unit *u, const char *path); +int unit_pick_cgroup_path(Unit *u); + +int unit_realize_cgroup(Unit *u); +void unit_prune_cgroup(Unit *u); +int unit_watch_cgroup(Unit *u); +int unit_watch_cgroup_memory(Unit *u); + +void unit_release_cgroup(Unit *u); +/* Releases the cgroup only if it is recursively empty. + * Returns true if the cgroup was released, false otherwise. */ +bool unit_maybe_release_cgroup(Unit *u); + +void unit_add_to_cgroup_empty_queue(Unit *u); +int unit_check_oomd_kill(Unit *u); +int unit_check_oom(Unit *u); + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path); + +int manager_setup_cgroup(Manager *m); +void manager_shutdown_cgroup(Manager *m, bool delete); + +unsigned manager_dispatch_cgroup_realize_queue(Manager *m); + +Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup); +Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid); +Unit* manager_get_unit_by_pid(Manager *m, pid_t pid); + +uint64_t unit_get_ancestor_memory_min(Unit *u); +uint64_t unit_get_ancestor_memory_low(Unit *u); + +int unit_search_main_pid(Unit *u, pid_t *ret); +int unit_watch_all_pids(Unit *u); + +int unit_synthesize_cgroup_empty_event(Unit *u); + +int unit_get_memory_current(Unit *u, uint64_t *ret); +int unit_get_tasks_current(Unit *u, uint64_t *ret); +int unit_get_cpu_usage(Unit *u, nsec_t *ret); +int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret); +int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); + +int unit_reset_cpu_accounting(Unit *u); +int unit_reset_ip_accounting(Unit *u); +int unit_reset_io_accounting(Unit *u); +int unit_reset_accounting(Unit *u); + +#define UNIT_CGROUP_BOOL(u, name) \ + ({ \ + CGroupContext *cc = unit_get_cgroup_context(u); \ + cc ? 
cc->name : false; \ + }) + +bool manager_owns_host_root_cgroup(Manager *m); +bool unit_has_host_root_cgroup(Unit *u); + +int manager_notify_cgroup_empty(Manager *m, const char *group); + +void unit_invalidate_cgroup(Unit *u, CGroupMask m); +void unit_invalidate_cgroup_bpf(Unit *u); + +void manager_invalidate_startup_units(Manager *m); + +const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_; +CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_; + +bool unit_cgroup_delegate(Unit *u); + +int compare_job_priority(const void *a, const void *b); + +int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name); +int unit_cgroup_freezer_action(Unit *u, FreezerAction action); + +const char* freezer_action_to_string(FreezerAction a) _const_; +FreezerAction freezer_action_from_string(const char *s) _pure_; diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c new file mode 100644 index 0000000..dd6c11a --- /dev/null +++ b/src/core/core-varlink.c @@ -0,0 +1,482 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "core-varlink.h" +#include "mkdir.h" +#include "strv.h" +#include "user-util.h" +#include "varlink.h" + +typedef struct LookupParameters { + const char *user_name; + const char *group_name; + union { + uid_t uid; + gid_t gid; + }; + const char *service; +} LookupParameters; + +static const char* const managed_oom_mode_properties[] = { + "ManagedOOMSwap", + "ManagedOOMMemoryPressure", +}; + +static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) { + assert(user_name); + assert(uid_is_valid(uid)); + assert(ret); + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(uid)), + JSON_BUILD_PAIR("realName", JSON_BUILD_STRING("Dynamic User")), + JSON_BUILD_PAIR("homeDirectory", 
JSON_BUILD_STRING("/")), + JSON_BUILD_PAIR("shell", JSON_BUILD_STRING(NOLOGIN)), + JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("service", JSON_BUILD_STRING("io.systemd.DynamicUser")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING("dynamic")))))); +} + +static bool user_match_lookup_parameters(LookupParameters *p, const char *name, uid_t uid) { + assert(p); + + if (p->user_name && !streq(name, p->user_name)) + return false; + + if (uid_is_valid(p->uid) && uid != p->uid) + return false; + + return true; +} + +static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) { + bool use_limit = false; + CGroupContext *c; + const char *mode; + + assert(u); + assert(property); + assert(ret_v); + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return -EOPNOTSUPP; + + c = unit_get_cgroup_context(u); + if (!c) + return -EINVAL; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + /* systemd-oomd should always treat inactive units as though they didn't enable any action since they + * should not have a valid cgroup */ + mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO); + else if (streq(property, "ManagedOOMSwap")) + mode = managed_oom_mode_to_string(c->moom_swap); + else if (streq(property, "ManagedOOMMemoryPressure")) { + mode = managed_oom_mode_to_string(c->moom_mem_pressure); + use_limit = true; + } else + return -EINVAL; + + return json_build(ret_v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)), + JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)), + JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)), + JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)))); +} + +int manager_varlink_send_managed_oom_update(Unit *u) { + _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL; + CGroupContext *c; + int r; + + assert(u); + + if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || 
!u->manager->managed_oom_varlink_request || !u->cgroup_path) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY); + if (r < 0) + return r; + + for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e); + if (r < 0) + return r; + + r = json_variant_append_array(&arr, e); + if (r < 0) + return r; + } + + r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr)))); + if (r < 0) + return r; + + return varlink_notify(u->manager->managed_oom_varlink_request, v); +} + +static int vl_method_subscribe_managed_oom_cgroups( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + static const UnitType supported_unit_types[] = { UNIT_SLICE, UNIT_SERVICE, UNIT_SCOPE }; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *arr = NULL; + Manager *m = userdata; + int r; + + assert(link); + assert(m); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + /* We only take one subscriber for this method so return an error if there's already an existing one. + * This shouldn't happen since systemd-oomd is the only client of this method. 
*/ + if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink_request) + return varlink_error(link, VARLINK_ERROR_SUBSCRIPTION_TAKEN, NULL); /* reject the new caller on its own link; the established subscriber is left untouched */ + + r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY); + if (r < 0) + return r; + + for (size_t i = 0; i < ELEMENTSOF(supported_unit_types); i++) { + Unit *u; + + LIST_FOREACH(units_by_type, u, m->units_by_type[supported_unit_types[i]]) { + CGroupContext *c; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + continue; + + c = unit_get_cgroup_context(u); + if (!c) + continue; + + for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + /* For the initial varlink call we only care about units that enabled (i.e. mode is not + * set to "auto") oomd properties. */ + if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) && + !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL)) + continue; + + r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e); + if (r < 0) + return r; + + r = json_variant_append_array(&arr, e); + if (r < 0) + return r; + } + } + } + + r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr)))); + if (r < 0) + return r; + + if (!FLAGS_SET(flags, VARLINK_METHOD_MORE)) + return varlink_reply(link, v); + + m->managed_oom_varlink_request = varlink_ref(link); + return varlink_notify(m->managed_oom_varlink_request, v); +} + +static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 }, + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, 
json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .uid = UID_INVALID, + }; + _cleanup_free_ char *found_name = NULL; + uid_t found_uid = UID_INVALID, uid; + Manager *m = userdata; + const char *un; + int r; + + assert(parameters); + assert(m); + + r = json_dispatch(parameters, dispatch_table, NULL, 0, &p); + if (r < 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.DynamicUser")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (uid_is_valid(p.uid)) + r = dynamic_user_lookup_uid(m, p.uid, &found_name); + else if (p.user_name) + r = dynamic_user_lookup_name(m, p.user_name, &found_uid); + else { + DynamicUser *d; + + HASHMAP_FOREACH(d, m->dynamic_users) { + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) /* not realized yet? */ + continue; + if (r < 0) + return r; + + if (!user_match_lookup_parameters(&p, d->name, uid)) + continue; + + if (v) { + r = varlink_notify(link, v); + if (r < 0) + return r; + + v = json_variant_unref(v); + } + + r = build_user_json(d->name, uid, &v); + if (r < 0) + return r; + } + + if (!v) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + + return varlink_reply(link, v); + } + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + uid = uid_is_valid(found_uid) ? 
found_uid : p.uid; + un = found_name ?: p.user_name; + + if (!user_match_lookup_parameters(&p, un, uid)) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_user_json(un, uid, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret) { + assert(group_name); + assert(gid_is_valid(gid)); + assert(ret); + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)), + JSON_BUILD_PAIR("description", JSON_BUILD_STRING("Dynamic Group")), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)), + JSON_BUILD_PAIR("service", JSON_BUILD_STRING("io.systemd.DynamicUser")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING("dynamic")))))); + } + +static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) { + assert(p); + + if (p->group_name && !streq(name, p->group_name)) + return false; + + if (gid_is_valid(p->gid) && gid != p->gid) + return false; + + return true; +} + +static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .gid = GID_INVALID, + }; + _cleanup_free_ char *found_name = NULL; + uid_t found_gid = GID_INVALID, gid; + Manager *m = userdata; + const char *gn; + int r; + + assert(parameters); + assert(m); + + r = json_dispatch(parameters, dispatch_table, NULL, 0, &p); + if (r < 0) + 
return r; + + if (!streq_ptr(p.service, "io.systemd.DynamicUser")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (gid_is_valid(p.gid)) + r = dynamic_user_lookup_uid(m, (uid_t) p.gid, &found_name); + else if (p.group_name) + r = dynamic_user_lookup_name(m, p.group_name, (uid_t*) &found_gid); + else { + DynamicUser *d; + + HASHMAP_FOREACH(d, m->dynamic_users) { + uid_t uid; + + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) + continue; + if (r < 0) + return r; + + if (!group_match_lookup_parameters(&p, d->name, (gid_t) uid)) + continue; + + if (v) { + r = varlink_notify(link, v); + if (r < 0) + return r; + + v = json_variant_unref(v); + } + + r = build_group_json(d->name, (gid_t) uid, &v); + if (r < 0) + return r; + } + + if (!v) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + + return varlink_reply(link, v); + } + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + gid = gid_is_valid(found_gid) ? 
found_gid : p.gid; + gn = found_name ?: p.group_name; + + if (!group_match_lookup_parameters(&p, gn, gid)) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_group_json(gn, gid, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + LookupParameters p = {}; + int r; + + assert(parameters); + + r = json_dispatch(parameters, dispatch_table, NULL, 0, &p); + if (r < 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.DynamicUser")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + /* We don't support auxiliary groups with dynamic users. 
*/ + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) { + Manager *m = userdata; + + assert(m); + assert(s); + assert(link); + + if (link == m->managed_oom_varlink_request) + m->managed_oom_varlink_request = varlink_unref(link); +} + +int manager_varlink_init(Manager *m) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + + if (m->varlink_server) + return 0; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID); + if (r < 0) + return log_error_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_bind_method_many( + s, + "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, + "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships, + "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups); + if (r < 0) + return log_error_errno(r, "Failed to register varlink methods: %m"); + + r = varlink_server_bind_disconnect(s, vl_disconnect); + if (r < 0) + return log_error_errno(r, "Failed to register varlink disconnect handler: %m"); + + if (!MANAGER_IS_TEST_RUN(m)) { + (void) mkdir_p_label("/run/systemd/userdb", 0755); + + r = varlink_server_listen_address(s, "/run/systemd/userdb/io.systemd.DynamicUser", 0666); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM, 0666); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + } + + r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + m->varlink_server = TAKE_PTR(s); + return 0; +} + +void 
manager_varlink_done(Manager *m) { + assert(m); + + /* Send the final message if we still have a subscribe request open. */ + if (m->managed_oom_varlink_request) + m->managed_oom_varlink_request = varlink_close_unref(m->managed_oom_varlink_request); + + m->varlink_server = varlink_server_unref(m->varlink_server); +} diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h new file mode 100644 index 0000000..20507a4 --- /dev/null +++ b/src/core/core-varlink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "manager.h" + +int manager_varlink_init(Manager *m); +void manager_varlink_done(Manager *m); + +/* The manager is expected to send an update to systemd-oomd if one of the following occurs: + * - The value of ManagedOOM*= properties change + * - A unit with ManagedOOM*= properties changes unit active state */ +int manager_varlink_send_managed_oom_update(Unit *u); diff --git a/src/core/dbus-automount.c b/src/core/dbus-automount.c new file mode 100644 index 0000000..3f74488 --- /dev/null +++ b/src/core/dbus-automount.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "automount.h" +#include "bus-get-properties.h" +#include "dbus-automount.h" +#include "dbus-util.h" +#include "string-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, automount_result, AutomountResult); + +const sd_bus_vtable bus_automount_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Automount, where), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Automount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Automount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutIdleUSec", "t", bus_property_get_usec, offsetof(Automount, timeout_idle_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int 
bus_automount_set_transient_property( + Automount *a, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(a); + + assert(a); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Where")) + return bus_set_transient_path(u, name, &a->where, message, flags, error); + + if (streq(name, "TimeoutIdleUSec")) + return bus_set_transient_usec_fix_0(u, name, &a->timeout_idle_usec, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &a->directory_mode, message, flags, error); + + return 0; +} + +int bus_automount_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Automount *a = AUTOMOUNT(u); + + assert(a); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) /* This is a transient unit? let's load a little more */ + return bus_automount_set_transient_property(a, name, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-automount.h b/src/core/dbus-automount.h new file mode 100644 index 0000000..cfceaec --- /dev/null +++ b/src/core/dbus-automount.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_automount_vtable[]; + +int bus_automount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c new file mode 100644 index 0000000..37c581f --- /dev/null +++ b/src/core/dbus-cgroup.c @@ -0,0 +1,1718 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <arpa/inet.h> + +#include "af-list.h" +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-get-properties.h" +#include "cgroup-util.h" +#include "cgroup.h" +#include "core-varlink.h" +#include 
"dbus-cgroup.h" +#include "dbus-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "limits-util.h" +#include "path-util.h" + +BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve); + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode); + +static int property_get_cgroup_mask( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupMask *mask = userdata; + CGroupController ctrl; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) { + if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0) + continue; + + r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_delegate_controllers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + if (!c->delegate) + return sd_bus_message_append(reply, "as", 0); + + return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error); +} + +static int property_get_cpuset( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CPUSet *cpus = userdata; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated = 0; /* pre-initialized: the conversion below is best-effort, so this must never be read indeterminate */ + + assert(bus); + assert(reply); + assert(cpus); + + (void) cpu_set_to_dbus(cpus, &array, &allocated); + return 
sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_io_device_weight( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupIODeviceWeight *w; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_weights, w, c->io_device_weights) { + r = sd_bus_message_append(reply, "(st)", w->path, w->weight); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_io_device_limits( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupIODeviceLimit *l; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_limits, l, c->io_device_limits) { + CGroupIOLimitType type; + + type = cgroup_io_limit_type_from_string(property); + if (type < 0 || l->limits[type] == cgroup_io_limit_defaults[type]) + continue; + + r = sd_bus_message_append(reply, "(st)", l->path, l->limits[type]); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_io_device_latency( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupIODeviceLatency *l; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_latencies, l, c->io_device_latencies) { + r = sd_bus_message_append(reply, "(st)", l->path, l->target_usec); + if (r < 0) + return r; + } + + 
return sd_bus_message_close_container(reply); +} + +static int property_get_blockio_device_weight( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupBlockIODeviceWeight *w; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + r = sd_bus_message_append(reply, "(st)", w->path, w->weight); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_blockio_device_bandwidths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupBlockIODeviceBandwidth *b; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + uint64_t v; + + if (streq(property, "BlockIOReadBandwidth")) + v = b->rbps; + else + v = b->wbps; + + if (v == CGROUP_LIMIT_MAX) + continue; + + r = sd_bus_message_append(reply, "(st)", b->path, v); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_device_allow( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + CGroupDeviceAllow *a; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(device_allow, a, c->device_allow) { + unsigned k = 0; + char rwm[4]; + + if (a->r) + rwm[k++] = 'r'; + if (a->w) + rwm[k++] = 'w'; + if (a->m) + rwm[k++] = 
'm'; + + rwm[k] = 0; + + r = sd_bus_message_append(reply, "(ss)", a->path, rwm); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_ip_address_access( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + IPAddressAccessItem** items = userdata, *i; + int r; + + r = sd_bus_message_open_container(reply, 'a', "(iayu)"); + if (r < 0) + return r; + + LIST_FOREACH(items, i, *items) { + + r = sd_bus_message_open_container(reply, 'r', "iayu"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", i->family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family)); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), + SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0), + SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0), + SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0), + SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0), + SD_BUS_PROPERTY("CPUShares", "t", NULL, offsetof(CGroupContext, cpu_shares), 0), + SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0), + SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0), + SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, 
cpu_quota_period_usec), 0), + SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0), + SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0), + SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0), + SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0), + SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0), + SD_BUS_PROPERTY("IODeviceWeight", "a(st)", property_get_io_device_weight, 0, 0), + SD_BUS_PROPERTY("IOReadBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOWriteBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOReadIOPSMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IOWriteIOPSMax", "a(st)", property_get_io_device_limits, 0, 0), + SD_BUS_PROPERTY("IODeviceLatencyTargetUSec", "a(st)", property_get_io_device_latency, 0, 0), + SD_BUS_PROPERTY("BlockIOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, blockio_accounting), 0), + SD_BUS_PROPERTY("BlockIOWeight", "t", NULL, offsetof(CGroupContext, blockio_weight), 0), + SD_BUS_PROPERTY("StartupBlockIOWeight", "t", NULL, offsetof(CGroupContext, startup_blockio_weight), 0), + SD_BUS_PROPERTY("BlockIODeviceWeight", "a(st)", property_get_blockio_device_weight, 0, 0), + SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), + SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0), + SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0), + SD_BUS_PROPERTY("DefaultMemoryLow", "t", NULL, offsetof(CGroupContext, default_memory_low), 0), + SD_BUS_PROPERTY("DefaultMemoryMin", "t", NULL, offsetof(CGroupContext, default_memory_min), 0), + SD_BUS_PROPERTY("MemoryMin", "t", NULL, 
offsetof(CGroupContext, memory_min), 0), + SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0), + SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0), + SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0), + SD_BUS_PROPERTY("MemorySwapMax", "t", NULL, offsetof(CGroupContext, memory_swap_max), 0), + SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0), + SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0), + SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0), + SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0), + SD_BUS_PROPERTY("TasksMax", "t", bus_property_get_tasks_max, offsetof(CGroupContext, tasks_max), 0), + SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0), + SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0), + SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0), + SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0), + SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0), + SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPercent", "s", bus_property_get_percent, offsetof(CGroupContext, moom_mem_pressure_limit), 0), + SD_BUS_VTABLE_END +}; + +static int 
bus_cgroup_set_transient_property(
                Unit *u,
                CGroupContext *c,
                const char *name,
                sd_bus_message *message,
                UnitWriteFlags flags,
                sd_bus_error *error) {

        /* Handles the cgroup properties "Delegate", "DelegateControllers", "DisableControllers",
         * "IPIngressFilterPath" and "IPEgressFilterPath". Returns 1 if the property was recognized and
         * processed, 0 if the name is not one of the above, and a negative value on failure. When
         * UNIT_WRITE_FLAGS_NOOP(flags) is true the message is parsed and validated, but neither the
         * CGroupContext nor the unit's settings are modified. */

        int r;

        assert(u);
        assert(c);
        assert(name);
        assert(message);

        flags |= UNIT_PRIVATE;

        if (streq(name, "Delegate")) {
                int b;

                if (!UNIT_VTABLE(u)->can_delegate)
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");

                r = sd_bus_message_read(message, "b", &b);
                if (r < 0)
                        return r;

                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
                        c->delegate = b;
                        /* A plain boolean selects either every controller or none. */
                        c->delegate_controllers = b ? _CGROUP_MASK_ALL : 0;

                        unit_write_settingf(u, flags, name, "Delegate=%s", yes_no(b));
                }

                return 1;

        } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
                CGroupMask mask = 0;

                if (streq(name, "DelegateControllers") && !UNIT_VTABLE(u)->can_delegate)
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");

                r = sd_bus_message_enter_container(message, 'a', "s");
                if (r < 0)
                        return r;

                /* Collect all listed controller names into one bit mask; r == 0 marks the end of the array. */
                for (;;) {
                        CGroupController cc;
                        const char *t;

                        r = sd_bus_message_read(message, "s", &t);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                break;

                        cc = cgroup_controller_from_string(t);
                        if (cc < 0)
                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown cgroup controller '%s'", t);

                        mask |= CGROUP_CONTROLLER_TO_MASK(cc);
                }

                r = sd_bus_message_exit_container(message);
                if (r < 0)
                        return r;

                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
                        _cleanup_free_ char *t = NULL;

                        r = cg_mask_to_string(mask, &t);
                        if (r < 0)
                                return r;

                        if (streq(name, "DelegateControllers")) {

                                /* An empty list resets the stored mask, a non-empty list is OR-ed into it. */
                                c->delegate = true;
                                if (mask == 0)
                                        c->delegate_controllers = 0;
                                else
                                        c->delegate_controllers |= mask;

                                /* Note that the setting is written under the "Delegate=" key here. */
                                unit_write_settingf(u, flags, name, "Delegate=%s", strempty(t));

                        } else if (streq(name, "DisableControllers")) {

                                /* Same reset-or-accumulate semantics as for DelegateControllers above. */
                                if (mask == 0)
                                        c->disable_controllers = 0;
                                else
                                        c->disable_controllers |= mask;

                                unit_write_settingf(u, flags, name, "%s=%s", name, strempty(t));
                        }
                }

                return 1;
        } else if (STR_IN_SET(name, "IPIngressFilterPath", "IPEgressFilterPath")) {
                char ***filters;
                size_t n = 0;

                filters = streq(name, "IPIngressFilterPath") ? &c->ip_filters_ingress : &c->ip_filters_egress;
                r = sd_bus_message_enter_container(message, 'a', "s");
                if (r < 0)
                        return r;

                for (;;) {
                        const char *path;

                        r = sd_bus_message_read(message, "s", &path);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                break;

                        if (!path_is_normalized(path) || !path_is_absolute(path))
                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects a normalized absolute path.", name);

                        /* Paths already present in the list are not added a second time. */
                        if (!UNIT_WRITE_FLAGS_NOOP(flags) && !strv_contains(*filters, path)) {
                                r = strv_extend(filters, path);
                                if (r < 0)
                                        return log_oom();
                        }
                        n++;
                }
                r = sd_bus_message_exit_container(message);
                if (r < 0)
                        return r;

                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
                        _cleanup_free_ char *buf = NULL;
                        _cleanup_fclose_ FILE *f = NULL;
                        char **entry;
                        size_t size = 0;

                        /* An empty array clears the whole filter list. */
                        if (n == 0)
                                *filters = strv_free(*filters);

                        unit_invalidate_cgroup_bpf(u);
                        f = open_memstream_unlocked(&buf, &size);
                        if (!f)
                                return -ENOMEM;

                        /* Emit an empty assignment first, followed by one assignment per remaining entry. */
                        fputs(name, f);
                        fputs("=\n", f);

                        STRV_FOREACH(entry, *filters)
                                fprintf(f, "%s=%s\n", name, *entry);

                        r = fflush_and_check(f);
                        if (r < 0)
                                return r;

                        unit_write_setting(u, flags, name, buf);

                        if (*filters) {
                                r = bpf_firewall_supported();
                                if (r < 0)
                                        return r;
                                if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
                                        /* Function-local static: only the first such message is logged at
                                         * warning level, later ones at debug level. */
                                        static bool warned = false;

                                        log_full(warned ? LOG_DEBUG : LOG_WARNING,
                                                 "Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n"
                                                 "Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id);
                                        warned = true;
                                }
                        }
                }

                return 1;
        }

        return 0;
}

/* Reads a boolean ("b") from the message. Unless flags request a no-op, stores it in *p, invalidates
 * the cgroup realization for the controllers in 'mask' and writes "<name>=yes|no" as unit setting.
 * Returns 1 on success, a negative value if reading the message fails. */
static int bus_cgroup_set_boolean(
                Unit *u,
                const char *name,
                bool *p,
                CGroupMask mask,
                sd_bus_message *message,
                UnitWriteFlags flags,
                sd_bus_error *error) {

        int b, r;

        assert(p);

        r = sd_bus_message_read(message, "b", &b);
        if (r < 0)
                return r;

        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
                *p = b;
                unit_invalidate_cgroup(u, mask);
                unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(b));
        }

        return 1;
}

/* Defines bus_cgroup_set_<function>(), a setter for a uint64_t weight-like property read as "t".
 *   mask  - controller mask passed to unit_invalidate_cgroup()
 *   check - predicate applied to the value; a false result is reported as an invalid-arguments error
 *   val   - value that is serialized as an empty assignment ("<name>=") instead of a number
 * The generated function returns 1 on success and a negative value on failure. */
#define BUS_DEFINE_SET_CGROUP_WEIGHT(function, mask, check, val)        \
        static int bus_cgroup_set_##function(                           \
                        Unit *u,                                        \
                        const char *name,                               \
                        uint64_t *p,                                    \
                        sd_bus_message *message,                        \
                        UnitWriteFlags flags,                           \
                        sd_bus_error *error) {                          \
                                                                        \
                uint64_t v;                                             \
                int r;                                                  \
                                                                        \
                assert(p);                                              \
                                                                        \
                r = sd_bus_message_read(message, "t", &v);              \
                if (r < 0)                                              \
                        return r;                                       \
                                                                        \
                if (!check(v))                                          \
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
                                                 "Value specified in %s is out of range", name); \
                                                                        \
                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
                        *p = v;                                         \
                        unit_invalidate_cgroup(u, mask);                \
                                                                        \
                        if (v == (val))                                 \
                                unit_write_settingf(u, flags, name,     \
                                                    "%s=", name);       \
                        else                                            \
                                unit_write_settingf(u, flags, name,     \
                                                    "%s=%" PRIu64, name, v); \
                }                                                       \
                                                                        \
                return 1;                                               \
        }

/* Defines two setters for a uint64_t limit property:
 *   bus_cgroup_set_<function>()        reads an absolute value as "t", rejects values below 'minimum'
 *                                      and serializes CGROUP_LIMIT_MAX as "infinity".
 *   bus_cgroup_set_<function>_scale()  reads a uint32_t as "u", converts it with
 *                                      scale(raw, UINT32_MAX), rejects results below 'minimum' or
 *                                      >= UINT64_MAX, and serializes the raw value as a percentage with
 *                                      one decimal digit (raw * 1000 / UINT32_MAX, rounded up) under the
 *                                      property name with its "Scale" suffix removed.
 * Both return 1 on success and a negative value on failure. */
#define BUS_DEFINE_SET_CGROUP_LIMIT(function, mask, scale, minimum)     \
        static int bus_cgroup_set_##function(                           \
                        Unit *u,                                        \
                        const char *name,                               \
                        uint64_t *p,                                    \
                        sd_bus_message *message,                        \
                        UnitWriteFlags flags,                           \
                        sd_bus_error *error) {                          \
                                                                        \
                uint64_t v;                                             \
                int r;                                                  \
                                                                        \
                assert(p);                                              \
                                                                        \
                r = sd_bus_message_read(message, "t", &v);              \
                if (r < 0)                                              \
                        return r;                                       \
                                                                        \
                if (v < minimum)                                        \
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
                                                 "Value specified in %s is out of range", name); \
                                                                        \
                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
                        *p = v;                                         \
                        unit_invalidate_cgroup(u, mask);                \
                                                                        \
                        if (v == CGROUP_LIMIT_MAX)                      \
                                unit_write_settingf(u, flags, name,     \
                                                    "%s=infinity", name); \
                        else                                            \
                                unit_write_settingf(u, flags, name,     \
                                                    "%s=%" PRIu64, name, v); \
                }                                                       \
                                                                        \
                return 1;                                               \
        }                                                               \
        static int bus_cgroup_set_##function##_scale(                   \
                        Unit *u,                                        \
                        const char *name,                               \
                        uint64_t *p,                                    \
                        sd_bus_message *message,                        \
                        UnitWriteFlags flags,                           \
                        sd_bus_error *error) {                          \
                                                                        \
                uint64_t v;                                             \
                uint32_t raw;                                           \
                int r;                                                  \
                                                                        \
                assert(p);                                              \
                                                                        \
                r = sd_bus_message_read(message, "u", &raw);            \
                if (r < 0)                                              \
                        return r;                                       \
                                                                        \
                v = scale(raw, UINT32_MAX);                             \
                if (v < minimum || v >= UINT64_MAX)                     \
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
                                                 "Value specified in %s is out of range", name); \
                                                                        \
                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
                        *p = v;                                         \
                        unit_invalidate_cgroup(u, mask);                \
                                                                        \
                        /* Prepare to chop off suffix */                \
                        assert_se(endswith(name, "Scale"));             \
                                                                        \
                        uint32_t scaled = DIV_ROUND_UP((uint64_t) raw * 1000, (uint64_t) UINT32_MAX); \
                        unit_write_settingf(u, flags, name, "%.*s=%" PRIu32 ".%" PRIu32 "%%", \
                                            (int)(strlen(name) - strlen("Scale")), name, \
                                            scaled / 10, scaled % 10);  \
                }                                                       \
                                                                        \
                return 1;                                               \
        }

/* Instantiate the setters. Some instantiations use a minimum of 0, which makes the generated
 * "v < minimum" comparison on an unsigned value always false, hence the surrounding
 * DISABLE_WARNING_TYPE_LIMITS / REENABLE_WARNING pair. */
DISABLE_WARNING_TYPE_LIMITS;
BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_weight, CGROUP_MASK_CPU, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID);
BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_shares, CGROUP_MASK_CPU, CGROUP_CPU_SHARES_IS_OK, CGROUP_CPU_SHARES_INVALID);
BUS_DEFINE_SET_CGROUP_WEIGHT(io_weight, CGROUP_MASK_IO, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID);
BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEIGHT_IS_OK, CGROUP_BLKIO_WEIGHT_INVALID);
BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1);
BUS_DEFINE_SET_CGROUP_LIMIT(memory_protection, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
REENABLE_WARNING;

static int
bus_cgroup_set_tasks_max( + Unit *u, + const char *name, + TasksMax *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + uint64_t v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "t", &v); + if (r < 0) + return r; + + if (v < 1) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Value specified in %s is out of range", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = (TasksMax) { .value = v, .scale = 0 }; /* When .scale==0, .value is the absolute value */ + unit_invalidate_cgroup(u, CGROUP_MASK_PIDS); + + if (v == CGROUP_LIMIT_MAX) + unit_write_settingf(u, flags, name, + "%s=infinity", name); + else + unit_write_settingf(u, flags, name, + "%s=%" PRIu64, name, v); + } + + return 1; +} + +static int bus_cgroup_set_tasks_max_scale( + Unit *u, + const char *name, + TasksMax *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + uint32_t v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "u", &v); + if (r < 0) + return r; + + if (v < 1 || v >= UINT32_MAX) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Value specified in %s is out of range", name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = (TasksMax) { v, UINT32_MAX }; /* .scale is not 0, so this is interpreted as v/UINT32_MAX. 
*/ + unit_invalidate_cgroup(u, CGROUP_MASK_PIDS); + + uint32_t scaled = DIV_ROUND_UP((uint64_t) v * 100U, (uint64_t) UINT32_MAX); + unit_write_settingf(u, flags, name, "%s=%" PRIu32 ".%" PRIu32 "%%", "TasksMax", + scaled / 10, scaled % 10); + } + + return 1; +} + +int bus_cgroup_set_property( + Unit *u, + CGroupContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + CGroupIOLimitType iol_type; + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "CPUAccounting")) + return bus_cgroup_set_boolean(u, name, &c->cpu_accounting, get_cpu_accounting_mask(), message, flags, error); + + if (streq(name, "CPUWeight")) + return bus_cgroup_set_cpu_weight(u, name, &c->cpu_weight, message, flags, error); + + if (streq(name, "StartupCPUWeight")) + return bus_cgroup_set_cpu_weight(u, name, &c->startup_cpu_weight, message, flags, error); + + if (streq(name, "CPUShares")) + return bus_cgroup_set_cpu_shares(u, name, &c->cpu_shares, message, flags, error); + + if (streq(name, "StartupCPUShares")) + return bus_cgroup_set_cpu_shares(u, name, &c->startup_cpu_shares, message, flags, error); + + if (streq(name, "IOAccounting")) + return bus_cgroup_set_boolean(u, name, &c->io_accounting, CGROUP_MASK_IO, message, flags, error); + + if (streq(name, "IOWeight")) + return bus_cgroup_set_io_weight(u, name, &c->io_weight, message, flags, error); + + if (streq(name, "StartupIOWeight")) + return bus_cgroup_set_io_weight(u, name, &c->startup_io_weight, message, flags, error); + + if (streq(name, "BlockIOAccounting")) + return bus_cgroup_set_boolean(u, name, &c->blockio_accounting, CGROUP_MASK_BLKIO, message, flags, error); + + if (streq(name, "BlockIOWeight")) + return bus_cgroup_set_blockio_weight(u, name, &c->blockio_weight, message, flags, error); + + if (streq(name, "StartupBlockIOWeight")) + return bus_cgroup_set_blockio_weight(u, name, &c->startup_blockio_weight, message, 
flags, error); + + if (streq(name, "MemoryAccounting")) + return bus_cgroup_set_boolean(u, name, &c->memory_accounting, CGROUP_MASK_MEMORY, message, flags, error); + + if (streq(name, "MemoryMin")) { + r = bus_cgroup_set_memory_protection(u, name, &c->memory_min, message, flags, error); + if (r > 0) + c->memory_min_set = true; + return r; + } + + if (streq(name, "MemoryLow")) { + r = bus_cgroup_set_memory_protection(u, name, &c->memory_low, message, flags, error); + if (r > 0) + c->memory_low_set = true; + return r; + } + + if (streq(name, "DefaultMemoryMin")) { + r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_min, message, flags, error); + if (r > 0) + c->default_memory_min_set = true; + return r; + } + + if (streq(name, "DefaultMemoryLow")) { + r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_low, message, flags, error); + if (r > 0) + c->default_memory_low_set = true; + return r; + } + + if (streq(name, "MemoryHigh")) + return bus_cgroup_set_memory(u, name, &c->memory_high, message, flags, error); + + if (streq(name, "MemorySwapMax")) + return bus_cgroup_set_swap(u, name, &c->memory_swap_max, message, flags, error); + + if (streq(name, "MemoryMax")) + return bus_cgroup_set_memory(u, name, &c->memory_max, message, flags, error); + + if (streq(name, "MemoryLimit")) + return bus_cgroup_set_memory(u, name, &c->memory_limit, message, flags, error); + + if (streq(name, "MemoryMinScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_min, message, flags, error); + if (r > 0) + c->memory_min_set = true; + return r; + } + + if (streq(name, "MemoryLowScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_low, message, flags, error); + if (r > 0) + c->memory_low_set = true; + return r; + } + + if (streq(name, "DefaultMemoryMinScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_min, message, flags, error); + if (r > 0) + c->default_memory_min_set = true; + return r; + 
} + + if (streq(name, "DefaultMemoryLowScale")) { + r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_low, message, flags, error); + if (r > 0) + c->default_memory_low_set = true; + return r; + } + + if (streq(name, "MemoryHighScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_high, message, flags, error); + + if (streq(name, "MemorySwapMaxScale")) + return bus_cgroup_set_swap_scale(u, name, &c->memory_swap_max, message, flags, error); + + if (streq(name, "MemoryMaxScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_max, message, flags, error); + + if (streq(name, "MemoryLimitScale")) + return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error); + + if (streq(name, "TasksAccounting")) + return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error); + + if (streq(name, "TasksMax")) + return bus_cgroup_set_tasks_max(u, name, &c->tasks_max, message, flags, error); + + if (streq(name, "TasksMaxScale")) + return bus_cgroup_set_tasks_max_scale(u, name, &c->tasks_max, message, flags, error); + + if (streq(name, "CPUQuotaPerSecUSec")) { + uint64_t u64; + + r = sd_bus_message_read(message, "t", &u64); + if (r < 0) + return r; + + if (u64 <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "CPUQuotaPerSecUSec= value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_quota_per_sec_usec = u64; + u->warned_clamping_cpu_quota_period = false; + unit_invalidate_cgroup(u, CGROUP_MASK_CPU); + + if (c->cpu_quota_per_sec_usec == USEC_INFINITY) + unit_write_setting(u, flags, "CPUQuota", "CPUQuota="); + else + /* config_parse_cpu_quota() requires an integer, so truncating division is used on + * purpose here. 
*/ + unit_write_settingf(u, flags, "CPUQuota", + "CPUQuota=%0.f%%", + (double) (c->cpu_quota_per_sec_usec / 10000)); + } + + return 1; + + } else if (streq(name, "CPUQuotaPeriodUSec")) { + uint64_t u64; + + r = sd_bus_message_read(message, "t", &u64); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_quota_period_usec = u64; + u->warned_clamping_cpu_quota_period = false; + unit_invalidate_cgroup(u, CGROUP_MASK_CPU); + if (c->cpu_quota_period_usec == USEC_INFINITY) + unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec="); + else { + char v[FORMAT_TIMESPAN_MAX]; + unit_write_settingf(u, flags, "CPUQuotaPeriodSec", + "CPUQuotaPeriodSec=%s", + format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1)); + } + } + + return 1; + + } else if (STR_IN_SET(name, "AllowedCPUs", "AllowedMemoryNodes")) { + const void *a; + size_t n; + _cleanup_(cpu_set_reset) CPUSet new_set = {}; + + r = sd_bus_message_read_array(message, 'y', &a, &n); + if (r < 0) + return r; + + r = cpu_set_from_dbus(a, n, &new_set); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *setstr = NULL; + CPUSet *set; + + setstr = cpu_set_to_range_string(&new_set); + if (!setstr) + return -ENOMEM; + + if (streq(name, "AllowedCPUs")) + set = &c->cpuset_cpus; + else + set = &c->cpuset_mems; + + cpu_set_reset(set); + *set = new_set; + new_set = (CPUSet) {}; + + unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET); + unit_write_settingf(u, flags, name, "%s=%s", name, setstr); + } + + return 1; + + } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) { + const char *path; + unsigned n = 0; + uint64_t u64; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if 
(!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLimit *a = NULL, *b; + + LIST_FOREACH(device_limits, b, c->io_device_limits) { + if (path_equal(path, b->path)) { + a = b; + break; + } + } + + if (!a) { + CGroupIOLimitType type; + + a = new0(CGroupIODeviceLimit, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + a->limits[type] = cgroup_io_limit_defaults[type]; + + LIST_PREPEND(device_limits, c->io_device_limits, a); + } + + a->limits[iol_type] = u64; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLimit *a; + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size = 0; + + if (n == 0) { + LIST_FOREACH(device_limits, a, c->io_device_limits) + a->limits[iol_type] = cgroup_io_limit_defaults[iol_type]; + } + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fprintf(f, "%s=\n", name); + LIST_FOREACH(device_limits, a, c->io_device_limits) + if (a->limits[iol_type] != cgroup_io_limit_defaults[iol_type]) + fprintf(f, "%s=%s %" PRIu64 "\n", name, a->path, a->limits[iol_type]); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IODeviceWeight")) { + const char *path; + uint64_t weight; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_INVALID) + return sd_bus_error_setf(error, 
SD_BUS_ERROR_INVALID_ARGS, "IODeviceWeight= value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceWeight *a = NULL, *b; + + LIST_FOREACH(device_weights, b, c->io_device_weights) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupIODeviceWeight, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_weights, c->io_device_weights, a); + } + + a->weight = weight; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupIODeviceWeight *a; + size_t size = 0; + + if (n == 0) { + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fputs("IODeviceWeight=\n", f); + LIST_FOREACH(device_weights, a, c->io_device_weights) + fprintf(f, "IODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IODeviceLatencyTargetUSec")) { + const char *path; + uint64_t target; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &target)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupIODeviceLatency *a = NULL, *b; + + LIST_FOREACH(device_latencies, b, c->io_device_latencies) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupIODeviceLatency, 1); + if (!a) + return -ENOMEM; + + 
a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_latencies, c->io_device_latencies, a); + } + + a->target_usec = target; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + char ts[FORMAT_TIMESPAN_MAX]; + CGroupIODeviceLatency *a; + size_t size = 0; + + if (n == 0) { + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_IO); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fputs("IODeviceLatencyTargetSec=\n", f); + LIST_FOREACH(device_latencies, a, c->io_device_latencies) + fprintf(f, "IODeviceLatencyTargetSec=%s %s\n", + a->path, format_timespan(ts, sizeof(ts), a->target_usec, 1)); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) { + const char *path; + bool read = true; + unsigned n = 0; + uint64_t u64; + + if (streq(name, "BlockIOWriteBandwidth")) + read = false; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceBandwidth *a = NULL, *b; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { + if (path_equal(path, b->path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupBlockIODeviceBandwidth, 1); + if (!a) + return -ENOMEM; + + a->rbps = CGROUP_LIMIT_MAX; + a->wbps = CGROUP_LIMIT_MAX; + a->path = strdup(path); + if (!a->path) { + free(a); + return 
-ENOMEM; + } + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a); + } + + if (read) + a->rbps = u64; + else + a->wbps = u64; + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceBandwidth *a; + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size = 0; + + if (n == 0) { + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) { + if (read) + a->rbps = CGROUP_LIMIT_MAX; + else + a->wbps = CGROUP_LIMIT_MAX; + } + } + + unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + if (read) { + fputs("BlockIOReadBandwidth=\n", f); + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) + if (a->rbps != CGROUP_LIMIT_MAX) + fprintf(f, "BlockIOReadBandwidth=%s %" PRIu64 "\n", a->path, a->rbps); + } else { + fputs("BlockIOWriteBandwidth=\n", f); + LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) + if (a->wbps != CGROUP_LIMIT_MAX) + fprintf(f, "BlockIOWriteBandwidth=%s %" PRIu64 "\n", a->path, a->wbps); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "BlockIODeviceWeight")) { + const char *path; + uint64_t weight; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) { + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path); + + if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight) || weight == CGROUP_BLKIO_WEIGHT_INVALID) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "BlockIODeviceWeight= out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupBlockIODeviceWeight *a = NULL, *b; + 
+ LIST_FOREACH(device_weights, b, c->blockio_device_weights) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupBlockIODeviceWeight, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + LIST_PREPEND(device_weights, c->blockio_device_weights, a); + } + + a->weight = weight; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupBlockIODeviceWeight *a; + size_t size = 0; + + if (n == 0) { + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fputs("BlockIODeviceWeight=\n", f); + LIST_FOREACH(device_weights, a, c->blockio_device_weights) + fprintf(f, "BlockIODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight); + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "DevicePolicy")) { + const char *policy; + CGroupDevicePolicy p; + + r = sd_bus_message_read(message, "s", &policy); + if (r < 0) + return r; + + p = cgroup_device_policy_from_string(policy); + if (p < 0) + return -EINVAL; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->device_policy = p; + unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); + unit_write_settingf(u, flags, name, "DevicePolicy=%s", policy); + } + + return 1; + + } else if (streq(name, "DeviceAllow")) { + const char *path, *rwm; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &path, &rwm)) > 0) { + + if (!valid_device_allow_pattern(path) || strpbrk(path, WHITESPACE)) + return sd_bus_error_setf(error, 
SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires device node or pattern"); + + if (isempty(rwm)) + rwm = "rwm"; + else if (!in_charset(rwm, "rwm")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires combination of rwm flags"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + CGroupDeviceAllow *a = NULL, *b; + + LIST_FOREACH(device_allow, b, c->device_allow) { + if (path_equal(b->path, path)) { + a = b; + break; + } + } + + if (!a) { + a = new0(CGroupDeviceAllow, 1); + if (!a) + return -ENOMEM; + + a->path = strdup(path); + if (!a->path) { + free(a); + return -ENOMEM; + } + + LIST_PREPEND(device_allow, c->device_allow, a); + } + + a->r = strchr(rwm, 'r'); + a->w = strchr(rwm, 'w'); + a->m = strchr(rwm, 'm'); + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupDeviceAllow *a; + size_t size = 0; + + if (n == 0) { + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + } + + unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fputs("DeviceAllow=\n", f); + LIST_FOREACH(device_allow, a, c->device_allow) + fprintf(f, "DeviceAllow=%s %s%s%s\n", a->path, a->r ? "r" : "", a->w ? "w" : "", a->m ? 
"m" : ""); + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_setting(u, flags, name, buf); + } + + return 1; + + } else if (streq(name, "IPAccounting")) { + int b; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->ip_accounting = b; + + unit_invalidate_cgroup_bpf(u); + unit_write_settingf(u, flags, name, "IPAccounting=%s", yes_no(b)); + } + + return 1; + + } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) { + IPAddressAccessItem **list; + size_t n = 0; + + list = streq(name, "IPAddressAllow") ? &c->ip_address_allow : &c->ip_address_deny; + + r = sd_bus_message_enter_container(message, 'a', "(iayu)"); + if (r < 0) + return r; + + for (;;) { + const void *ap; + int32_t family; + uint32_t prefixlen; + size_t an; + + r = sd_bus_message_enter_container(message, 'r', "iayu"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(message, "i", &family); + if (r < 0) + return r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects IPv4 or IPv6 addresses only.", name); + + r = sd_bus_message_read_array(message, 'y', &ap, &an); + if (r < 0) + return r; + + if (an != FAMILY_ADDRESS_SIZE(family)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IP address has wrong size for family (%s, expected %zu, got %zu)", + af_to_name(family), FAMILY_ADDRESS_SIZE(family), an); + + r = sd_bus_message_read(message, "u", &prefixlen); + if (r < 0) + return r; + + if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Prefix length %" PRIu32 " too large for address family %s.", prefixlen, af_to_name(family)); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + IPAddressAccessItem *item; + + item = new0(IPAddressAccessItem, 1); + if (!item) + return -ENOMEM; + + item->family = family; + item->prefixlen = prefixlen; + memcpy(&item->address, ap, an); + + 
LIST_PREPEND(items, *list, item); + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + *list = ip_address_access_reduce(*list); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + IPAddressAccessItem *item; + size_t size = 0; + + if (n == 0) + *list = ip_address_access_free_all(*list); + + unit_invalidate_cgroup_bpf(u); + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fputs(name, f); + fputs("=\n", f); + + LIST_FOREACH(items, item, *list) { + char buffer[CONST_MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)]; + + errno = 0; + if (!inet_ntop(item->family, &item->address, buffer, sizeof(buffer))) + return errno_or_else(EINVAL); + + fprintf(f, "%s=%s/%u\n", name, buffer, item->prefixlen); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; + } + + if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) { + ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? 
&c->moom_swap : &c->moom_mem_pressure; + ManagedOOMMode m; + const char *mode; + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = sd_bus_message_read(message, "s", &mode); + if (r < 0) + return r; + + m = managed_oom_mode_from_string(mode); + if (m < 0) + return -EINVAL; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *cgroup_mode = m; + unit_write_settingf(u, flags, name, "%s=%s", name, mode); + } + + (void) manager_varlink_send_managed_oom_update(u); + return 1; + } + + if (streq(name, "ManagedOOMMemoryPressureLimitPercent")) { + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = bus_set_transient_percent(u, name, &c->moom_mem_pressure_limit, message, flags, error); + if (r < 0) + return r; + + if (c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + + return 1; + } + + if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB)) + return bus_cgroup_set_transient_property(u, c, name, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h new file mode 100644 index 0000000..5bf45eb --- /dev/null +++ b/src/core/dbus-cgroup.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" +#include "cgroup.h" + +extern const sd_bus_vtable bus_cgroup_vtable[]; + +int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); + +int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-device.c b/src/core/dbus-device.c new file mode 100644 index 
0000000..b5e18d8 --- /dev/null +++ b/src/core/dbus-device.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-device.h" +#include "device.h" +#include "unit.h" + +const sd_bus_vtable bus_device_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("SysFSPath", "s", NULL, offsetof(Device, sysfs), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; diff --git a/src/core/dbus-device.h b/src/core/dbus-device.h new file mode 100644 index 0000000..bfb5770 --- /dev/null +++ b/src/core/dbus-device.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus-vtable.h" + +extern const sd_bus_vtable bus_device_vtable[]; diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c new file mode 100644 index 0000000..0473535 --- /dev/null +++ b/src/core/dbus-execute.c @@ -0,0 +1,3459 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/mount.h> +#include <sys/prctl.h> + +#if HAVE_SECCOMP +#include <seccomp.h> +#endif + +#include "af-list.h" +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cpu-set-util.h" +#include "dbus-execute.h" +#include "dbus-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "ioprio.h" +#include "journal-file.h" +#include "mountpoint-util.h" +#include "namespace.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "securebits-util.h" +#include "specifier.h" +#include "strv.h" +#include "syslog-util.h" +#include "unit-printf.h" +#include "user-util.h" +#include "utf8.h" + +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutput); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, 
exec_input, ExecInput); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long); +static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio); +static BUS_DEFINE_PROPERTY_GET(property_get_mount_apivfs, "b", ExecContext, exec_context_get_effective_mount_apivfs); +static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_class, "i", ExecContext, exec_context_get_effective_ioprio, IOPRIO_PRIO_CLASS); +static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, exec_context_get_effective_ioprio, IOPRIO_PRIO_DATA); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); +static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa); + +/* Appends every entry of c->environment_files to the reply as an array of (sb) pairs. A leading dash on an entry is stripped from the string and reported through the boolean of the pair instead. Any negative sd-bus result is returned unchanged. */ +static int property_get_environment_files( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + char **j; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = 
sd_bus_message_open_container(reply, 'a', "(sb)"); + if (r < 0) + return r; + + STRV_FOREACH(j, c->environment_files) { + const char *fn = *j; + + r = sd_bus_message_append(reply, "(sb)", fn[0] == '-' ? fn + 1 : fn, fn[0] == '-'); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_oom_score_adjust( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + int r; + + assert(bus); + assert(reply); + assert(c); + + if (c->oom_score_adjust_set) + n = c->oom_score_adjust; + else { + _cleanup_free_ char *t = NULL; + + n = 0; + r = read_one_line_file("/proc/self/oom_score_adj", &t); + if (r < 0) + log_debug_errno(r, "Failed to read /proc/self/oom_score_adj, ignoring: %m"); + else { + r = safe_atoi32(t, &n); + if (r < 0) + log_debug_errno(r, "Failed to parse \"%s\" from /proc/self/oom_score_adj, ignoring: %m", t); + } + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_coredump_filter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + uint64_t n; + int r; + + assert(bus); + assert(reply); + assert(c); + + if (c->coredump_filter_set) + n = c->coredump_filter; + else { + _cleanup_free_ char *t = NULL; + + n = COREDUMP_FILTER_MASK_DEFAULT; + r = read_one_line_file("/proc/self/coredump_filter", &t); + if (r < 0) + log_debug_errno(r, "Failed to read /proc/self/coredump_filter, ignoring: %m"); + else { + r = safe_atoux64(t, &n); + if (r < 0) + log_debug_errno(r, "Failed to parse \"%s\" from /proc/self/coredump_filter, ignoring: %m", t); + } + } + + return sd_bus_message_append(reply, "t", n); +} + +static int property_get_nice( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + 
sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->nice_set) + n = c->nice; + else { + errno = 0; + n = getpriority(PRIO_PROCESS, 0); + if (errno > 0) + n = 0; + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_cpu_sched_policy( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->cpu_sched_set) + n = c->cpu_sched_policy; + else { + n = sched_getscheduler(0); + if (n < 0) + n = SCHED_OTHER; + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_cpu_sched_priority( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int32_t n; + + assert(bus); + assert(reply); + assert(c); + + if (c->cpu_sched_set) + n = c->cpu_sched_priority; + else { + struct sched_param p = {}; + + if (sched_getparam(0, &p) >= 0) + n = p.sched_priority; + else + n = 0; + } + + return sd_bus_message_append(reply, "i", n); +} + +static int property_get_cpu_affinity( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_(cpu_set_reset) CPUSet s = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(c); + + if (c->cpu_affinity_from_numa) { + int r; + + r = numa_to_cpu_set(&c->numa_policy, &s); + if (r < 0) + return r; + } + + (void) cpu_set_to_dbus(c->cpu_affinity_from_numa ? 
&s : &c->cpu_set, &array, &allocated); + + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_numa_mask( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(c); + + (void) cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated); + + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_numa_policy( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + ExecContext *c = userdata; + int32_t policy; + + assert(bus); + assert(reply); + assert(c); + + policy = numa_policy_get_type(&c->numa_policy); + + return sd_bus_message_append_basic(reply, 'i', &policy); +} + +static int property_get_timer_slack_nsec( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + uint64_t u; + + assert(bus); + assert(reply); + assert(c); + + if (c->timer_slack_nsec != NSEC_INFINITY) + u = (uint64_t) c->timer_slack_nsec; + else + u = (uint64_t) prctl(PR_GET_TIMERSLACK); + + return sd_bus_message_append(reply, "t", u); +} + +static int property_get_syscall_filter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_strv_free_ char **l = NULL; + int r; + +#if HAVE_SECCOMP + void *id, *val; +#endif + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", c->syscall_allow_list); + if (r < 0) + return 
r; + +#if HAVE_SECCOMP + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) { + _cleanup_free_ char *name = NULL; + const char *e = NULL; + char *s; + int num = PTR_TO_INT(val); + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + if (!name) + continue; + + if (num >= 0) { + e = seccomp_errno_or_action_to_string(num); + if (e) { + s = strjoin(name, ":", e); + if (!s) + return -ENOMEM; + } else { + r = asprintf(&s, "%s:%d", name, num); + if (r < 0) + return -ENOMEM; + } + } else + s = TAKE_PTR(name); + + r = strv_consume(&l, s); + if (r < 0) + return r; + } +#endif + + strv_sort(l); + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +}

/* Appends a (bas) struct describing the system call log list to the reply.
 *
 * The boolean is c->syscall_log_allow_list. With HAVE_SECCOMP, every key of c->syscall_log is
 * turned into a syscall name (key minus one, resolved for SCMP_ARCH_NATIVE); keys that do not
 * resolve are skipped. The collected names are sorted before being appended. */
static int property_get_syscall_log(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        ExecContext *ctx = userdata;
        _cleanup_strv_free_ char **names = NULL;
        int r;

#if HAVE_SECCOMP
        void *key, *value;
#endif

        assert(bus);
        assert(reply);
        assert(ctx);

        r = sd_bus_message_open_container(reply, 'r', "bas");
        if (r < 0)
                return r;

        r = sd_bus_message_append(reply, "b", ctx->syscall_log_allow_list);
        if (r < 0)
                return r;

#if HAVE_SECCOMP
        HASHMAP_FOREACH_KEY(value, key, ctx->syscall_log) {
                char *resolved = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(key) - 1);

                if (resolved) {
                        /* strv_consume() takes over the string. */
                        r = strv_consume(&names, resolved);
                        if (r < 0)
                                return r;
                }
        }
#endif

        strv_sort(names);

        r = sd_bus_message_append_strv(reply, names);
        return r < 0 ? r : sd_bus_message_close_container(reply);
}

/* Appends the sorted names of the configured system call architectures as a string array. Names
 * are only collected when built with HAVE_SECCOMP; architectures without a name are skipped. */
static int property_get_syscall_archs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_strv_free_ char **l = NULL; + int r; + +#if HAVE_SECCOMP + void *id;
+#endif + + assert(bus); + assert(reply); + assert(c); + +#if HAVE_SECCOMP + SET_FOREACH(id, c->syscall_archs) { + const char *name; + + name = seccomp_arch_to_string(PTR_TO_UINT32(id) - 1); + if (!name) + continue; + + r = strv_extend(&l, name); + if (r < 0) + return -ENOMEM; + } +#endif + + strv_sort(l); + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return 0; +} + +static int property_get_selinux_context( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append(reply, "(bs)", c->selinux_context_ignore, c->selinux_context); +} + +static int property_get_apparmor_profile( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append(reply, "(bs)", c->apparmor_profile_ignore, c->apparmor_profile); +} + +static int property_get_smack_process_label( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + return sd_bus_message_append(reply, "(bs)", c->smack_process_label_ignore, c->smack_process_label); +} + +static int property_get_address_families( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + _cleanup_strv_free_ char **l = NULL; + void *af; + int r; + + assert(bus); + assert(reply); + assert(c); + + r = sd_bus_message_open_container(reply, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "b", 
c->address_families_allow_list); + if (r < 0) + return r; + + SET_FOREACH(af, c->address_families) { + const char *name; + + name = af_to_name(PTR_TO_INT(af)); + if (!name) + continue; + + r = strv_extend(&l, name); + if (r < 0) + return -ENOMEM; + } + + strv_sort(l); + + r = sd_bus_message_append_strv(reply, l); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int property_get_working_directory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + const char *wd; + + assert(bus); + assert(reply); + assert(c); + + if (c->working_directory_home) + wd = "~"; + else + wd = c->working_directory; + + if (c->working_directory_missing_ok) + wd = strjoina("!", wd); + + return sd_bus_message_append(reply, "s", wd); +} + +static int property_get_stdio_fdname( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int fileno; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + if (streq(property, "StandardInputFileDescriptorName")) + fileno = STDIN_FILENO; + else if (streq(property, "StandardOutputFileDescriptorName")) + fileno = STDOUT_FILENO; + else { + assert(streq(property, "StandardErrorFileDescriptorName")); + fileno = STDERR_FILENO; + } + + return sd_bus_message_append(reply, "s", exec_context_fdname(c, fileno)); +} + +static int property_get_input_data( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->stdin_data, c->stdin_data_size); +} + +static int property_get_bind_paths( + sd_bus *bus, + const char 
*path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + unsigned i; + bool ro; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + ro = strstr(property, "ReadOnly"); + + r = sd_bus_message_open_container(reply, 'a', "(ssbt)"); + if (r < 0) + return r; + + for (i = 0; i < c->n_bind_mounts; i++) { + + if (ro != c->bind_mounts[i].read_only) + continue; + + r = sd_bus_message_append( + reply, "(ssbt)", + c->bind_mounts[i].source, + c->bind_mounts[i].destination, + c->bind_mounts[i].ignore_enoent, + c->bind_mounts[i].recursive ? (uint64_t) MS_REC : (uint64_t) 0); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_temporary_filesystems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + unsigned i; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + for (i = 0; i < c->n_temporary_filesystems; i++) { + TemporaryFileSystem *t = c->temporary_filesystems + i; + + r = sd_bus_message_append( + reply, "(ss)", + t->path, + t->options); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_log_extra_fields( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + size_t i; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "ay"); + if (r < 0) + return r; + + for (i = 0; i < c->n_log_extra_fields; i++) { + r = sd_bus_message_append_array(reply, 'y', c->log_extra_fields[i].iov_base, 
c->log_extra_fields[i].iov_len); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_set_credential( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + ExecSetCredential *sc; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(say)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(sc, c->set_credentials) { + + r = sd_bus_message_open_container(reply, 'r', "say"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", sc->id); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', sc->data, sc->size); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_load_credential( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + char **i, **j; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + STRV_FOREACH_PAIR(i, j, c->load_credentials) { + r = sd_bus_message_append(reply, "(ss)", *i, *j); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_root_hash( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->root_hash, c->root_hash_size); +} + +static int property_get_root_hash_sig( + sd_bus *bus, + const char *path, + 
const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + return sd_bus_message_append_array(reply, 'y', c->root_hash_sig, c->root_hash_sig_size); +} + +static int property_get_root_image_options( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + MountOptions *m; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(mount_options, m, c->root_image_options) { + r = sd_bus_message_append(reply, "(ss)", + partition_designator_to_string(m->partition_designator), + m->options); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_mount_images( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + int r; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(ssba(ss))"); + if (r < 0) + return r; + + for (size_t i = 0; i < c->n_mount_images; i++) { + MountOptions *m; + + r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "ssba(ss)"); + if (r < 0) + return r; + r = sd_bus_message_append( + reply, "ssb", + c->mount_images[i].source, + c->mount_images[i].destination, + c->mount_images[i].ignore_enoent); + if (r < 0) + return r; + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + LIST_FOREACH(mount_options, m, c->mount_images[i].mount_options) { + r = sd_bus_message_append(reply, "(ss)", + partition_designator_to_string(m->partition_designator), + m->options); + if (r < 0) + 
return r; + } + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_exec_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("EnvironmentFiles", "a(sb)", property_get_environment_files, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassEnvironment", "as", NULL, offsetof(ExecContext, pass_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UnsetEnvironment", "as", NULL, offsetof(ExecContext, unset_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UMask", "u", bus_property_get_mode, offsetof(ExecContext, umask), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCPU", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCPUSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitFSIZE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitDATA", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitDATASoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSTACK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + 
SD_BUS_PROPERTY("LimitCORE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitCORESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRSS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRSSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNOFILE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitAS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitASSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNPROC", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitLOCKS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSIGPENDING", "t", 
bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNICE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitNICESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTPRIO", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTTIME", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImageOptions", "a(ss)", property_get_root_image_options, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHash", "ay", property_get_root_hash, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHashPath", "s", NULL, 
offsetof(ExecContext, root_hash_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CoredumpFilter", "t", property_get_coredump_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IOSchedulingClass", "i", property_get_ioprio_class, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IOSchedulingPriority", "i", property_get_ioprio_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, 
offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputData", "ay", property_get_input_data, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTYVTDisallocate", "b", bus_property_get_bool, offsetof(ExecContext, tty_vt_disallocate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogPriority", "i", bus_property_get_int, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogIdentifier", "s", NULL, offsetof(ExecContext, syslog_identifier), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogLevelPrefix", "b", bus_property_get_bool, offsetof(ExecContext, syslog_level_prefix), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogLevel", "i", property_get_syslog_level, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SyslogFacility", 
"i", property_get_syslog_facility, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogLevelMax", "i", bus_property_get_int, offsetof(ExecContext, log_level_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogRateLimitIntervalUSec", "t", bus_property_get_usec, offsetof(ExecContext, log_ratelimit_interval_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogRateLimitBurst", "u", bus_property_get_unsigned, offsetof(ExecContext, log_ratelimit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogExtraFields", "aay", property_get_log_extra_fields, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogNamespace", "s", NULL, offsetof(ExecContext, log_namespace), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SecureBits", "i", bus_property_get_int, offsetof(ExecContext, secure_bits), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CapabilityBoundingSet", "t", NULL, offsetof(ExecContext, capability_bounding_set), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AmbientCapabilities", "t", NULL, offsetof(ExecContext, capability_ambient_set), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LoadCredential", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PAMName", "s", NULL, 
offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadWritePaths", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadOnlyPaths", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_bool, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, 
offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UtmpMode", "s", property_get_exec_utmp_mode, offsetof(ExecContext, utmp_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AppArmorProfile", "(bs)", property_get_apparmor_profile, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackProcessLabel", "(bs)", property_get_smack_process_label, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IgnoreSIGPIPE", "b", bus_property_get_bool, offsetof(ExecContext, ignore_sigpipe), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoNewPrivileges", "b", bus_property_get_bool, offsetof(ExecContext, no_new_privileges), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallFilter", "(bas)", property_get_syscall_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallErrorNumber", "i", bus_property_get_int, offsetof(ExecContext, syscall_errno), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SystemCallLog", "(bas)", property_get_syscall_log, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Personality", "s", property_get_personality, offsetof(ExecContext, personality), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 
0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StateDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CacheDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LogsDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfigurationDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConfigurationDirectory", "as", NULL, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutCleanUSec", "t", bus_property_get_usec, offsetof(ExecContext, timeout_clean_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, 
memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountAPIVFS", "b", property_get_mount_apivfs, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), + + /* Obsolete/redundant properties: */ + SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("ReadOnlyDirectories", "as", NULL, offsetof(ExecContext, read_only_paths), 
SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("InaccessibleDirectories", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + + SD_BUS_VTABLE_END +}; + +static int append_exec_command(sd_bus_message *reply, ExecCommand *c) { + int r; + + assert(reply); + assert(c); + + if (!c->path) + return 0; + + r = sd_bus_message_open_container(reply, 'r', "sasbttttuii"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", c->path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, c->argv); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "bttttuii", + !!(c->flags & EXEC_COMMAND_IGNORE_FAILURE), + c->exec_status.start_timestamp.realtime, + c->exec_status.start_timestamp.monotonic, + c->exec_status.exit_timestamp.realtime, + c->exec_status.exit_timestamp.monotonic, + (uint32_t) c->exec_status.pid, + (int32_t) c->exec_status.code, + (int32_t) c->exec_status.status); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static int append_exec_ex_command(sd_bus_message *reply, ExecCommand *c) { + _cleanup_strv_free_ char **ex_opts = NULL; + int r; + + assert(reply); + assert(c); + + if (!c->path) + return 0; + + r = sd_bus_message_open_container(reply, 'r', "sasasttttuii"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", c->path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, c->argv); + if (r < 0) + return r; + + r = exec_command_flags_to_strv(c->flags, &ex_opts); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, ex_opts); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "ttttuii", + c->exec_status.start_timestamp.realtime, + c->exec_status.start_timestamp.monotonic, + c->exec_status.exit_timestamp.realtime, + 
c->exec_status.exit_timestamp.monotonic, + (uint32_t) c->exec_status.pid, + (int32_t) c->exec_status.code, + (int32_t) c->exec_status.status); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_command( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *c = (ExecCommand*) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)"); + if (r < 0) + return r; + + r = append_exec_command(reply, c); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_command_list( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *c = *(ExecCommand**) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)"); + if (r < 0) + return r; + + LIST_FOREACH(command, c, c) { + r = append_exec_command(reply, c); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +int bus_property_get_exec_ex_command_list( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *ret_error) { + + ExecCommand *c, *exec_command = *(ExecCommand**) userdata; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sasasttttuii)"); + if (r < 0) + return r; + + LIST_FOREACH(command, c, exec_command) { + r = append_exec_ex_command(reply, c); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static char *exec_command_flags_to_exec_chars(ExecCommandFlags flags) { + return strjoin(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE) ? 
"-" : "", + FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND) ? ":" : "", + FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) ? "+" : "", + FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID) ? "!" : "", + FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC) ? "!!" : ""); +} + +int bus_set_transient_exec_command( + Unit *u, + const char *name, + ExecCommand **exec_command, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + bool is_ex_prop = endswith(name, "Ex"); + unsigned n = 0; + int r; + + r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) { + _cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL; + const char *path; + int b; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + if (!path_is_absolute(path) && !filename_is_valid(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "\"%s\" is neither a valid executable name nor an absolute path", + path); + + r = sd_bus_message_read_strv(message, &argv); + if (r < 0) + return r; + + r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + ExecCommand *c; + + c = new0(ExecCommand, 1); + if (!c) + return -ENOMEM; + + c->path = strdup(path); + if (!c->path) { + free(c); + return -ENOMEM; + } + + c->argv = TAKE_PTR(argv); + + if (is_ex_prop) { + r = exec_command_flags_from_strv(ex_opts, &c->flags); + if (r < 0) + return r; + } else + c->flags = b ? 
EXEC_COMMAND_IGNORE_FAILURE : 0; + + path_simplify(c->path, false); + exec_command_append_list(exec_command, c); + } + + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + ExecCommand *c; + size_t size = 0; + + if (n == 0) + *exec_command = exec_command_free_list(*exec_command); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fprintf(f, "%s=\n", name); + + LIST_FOREACH(command, c, *exec_command) { + _cleanup_free_ char *a = NULL, *exec_chars = NULL; + + exec_chars = exec_command_flags_to_exec_chars(c->flags); + if (!exec_chars) + return -ENOMEM; + + a = unit_concat_strv(c->argv, UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS); + if (!a) + return -ENOMEM; + + if (streq_ptr(c->path, c->argv ? c->argv[0] : NULL)) + fprintf(f, "%s=%s%s\n", name, exec_chars, a); + else { + _cleanup_free_ char *t = NULL; + const char *p; + + p = unit_escape_setting(c->path, UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS, &t); + if (!p) + return -ENOMEM; + + fprintf(f, "%s=%s@%s %s\n", name, exec_chars, p, a); + } + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + } + + return 1; +} + +static int parse_personality(const char *s, unsigned long *p) { + unsigned long v; + + assert(p); + + v = personality_from_string(s); + if (v == PERSONALITY_INVALID) + return -EINVAL; + + *p = v; + return 0; +} + +static const char* mount_propagation_flags_to_string_with_check(unsigned long n) { + if (!IN_SET(n, 0, MS_SHARED, MS_PRIVATE, MS_SLAVE)) + return NULL; + + return mount_propagation_flags_to_string(n); +} + +static BUS_DEFINE_SET_TRANSIENT(nsec, "t", uint64_t, nsec_t, NSEC_FMT); +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(log_level, "i", int32_t, int, "%" PRIi32, log_level_is_valid); +#if HAVE_SECCOMP +static BUS_DEFINE_SET_TRANSIENT_IS_VALID(errno, "i", int32_t, int, "%" 
PRIi32, seccomp_errno_or_action_is_valid); +#endif +static BUS_DEFINE_SET_TRANSIENT_PARSE(std_input, ExecInput, exec_input_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(std_output, ExecOutput, exec_output_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(capability, "t", uint64_t, uint64_t, "%" PRIu64, capability_set_to_string_alloc); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(namespace_flag, "t", uint64_t, unsigned long, "%" PRIu64, namespace_flags_to_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(mount_flags, "t", uint64_t, unsigned long, "%" PRIu64, mount_propagation_flags_to_string_with_check); + +/* ret_format_str is an accumulator, so if it has any pre-existing content, new options will be appended to it */ +static int read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *format_str = NULL; + const char *mount_options, *partition; + int r; + + assert(message); + 
assert(ret_options); + assert(ret_format_str); + assert(separator); + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &partition, &mount_options)) > 0) { + _cleanup_free_ char *previous = NULL, *escaped = NULL; + _cleanup_free_ MountOptions *o = NULL; + PartitionDesignator partition_designator; + + if (chars_intersect(mount_options, WHITESPACE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid mount options string, contains whitespace character(s): %s", mount_options); + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid partition name %s", partition); + + /* Need to store them in the unit with the escapes, so that they can be parsed again */ + escaped = shell_escape(mount_options, ":"); + if (!escaped) + return -ENOMEM; + + previous = TAKE_PTR(format_str); + format_str = strjoin(previous, previous ? separator : "", partition, ":", escaped); + if (!format_str) + return -ENOMEM; + + o = new(MountOptions, 1); + if (!o) + return -ENOMEM; + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = strdup(mount_options), + }; + if (!o->options) + return -ENOMEM; + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!LIST_IS_EMPTY(options)) { + char *final = strjoin(*ret_format_str, !isempty(*ret_format_str) ? 
separator : "", format_str); + if (!final) + return -ENOMEM; + free_and_replace(*ret_format_str, final); + LIST_JOIN(mount_options, *ret_options, options); + } + + return 0; +} + +int bus_exec_context_set_transient_property( + Unit *u, + ExecContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *suffix; + int r; + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "User")) + return bus_set_transient_user_relaxed(u, name, &c->user, message, flags, error); + + if (streq(name, "Group")) + return bus_set_transient_user_relaxed(u, name, &c->group, message, flags, error); + + if (streq(name, "TTYPath")) + return bus_set_transient_path(u, name, &c->tty_path, message, flags, error); + + if (streq(name, "RootImage")) + return bus_set_transient_path(u, name, &c->root_image, message, flags, error); + + if (streq(name, "RootImageOptions")) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *format_str = NULL; + + r = read_mount_options(message, error, &options, &format_str, " "); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (LIST_IS_EMPTY(options)) { + c->root_image_options = mount_options_free_all(c->root_image_options); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + LIST_JOIN(mount_options, c->root_image_options, options); + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", + name, + format_str); + } + } + + return 1; + } + + if (streq(name, "RootHash")) { + const void *roothash_decoded; + size_t roothash_decoded_size; + + r = sd_bus_message_read_array(message, 'y', &roothash_decoded, &roothash_decoded_size); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (roothash_decoded_size == 0) { + c->root_hash_path = mfree(c->root_hash_path); + c->root_hash = mfree(c->root_hash); + 
c->root_hash_size = 0; + + unit_write_settingf(u, flags, name, "RootHash="); + } else { + _cleanup_free_ void *p; + + encoded = hexmem(roothash_decoded, roothash_decoded_size); + if (!encoded) + return -ENOMEM; + + p = memdup(roothash_decoded, roothash_decoded_size); + if (!p) + return -ENOMEM; + + free_and_replace(c->root_hash, p); + c->root_hash_size = roothash_decoded_size; + c->root_hash_path = mfree(c->root_hash_path); + + unit_write_settingf(u, flags, name, "RootHash=%s", encoded); + } + } + + return 1; + } + + if (streq(name, "RootHashPath")) { + c->root_hash_size = 0; + c->root_hash = mfree(c->root_hash); + + return bus_set_transient_path(u, "RootHash", &c->root_hash_path, message, flags, error); + } + + if (streq(name, "RootHashSignature")) { + const void *roothash_sig_decoded; + size_t roothash_sig_decoded_size; + + r = sd_bus_message_read_array(message, 'y', &roothash_sig_decoded, &roothash_sig_decoded_size); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (roothash_sig_decoded_size == 0) { + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + + unit_write_settingf(u, flags, name, "RootHashSignature="); + } else { + _cleanup_free_ void *p; + ssize_t len; + + len = base64mem(roothash_sig_decoded, roothash_sig_decoded_size, &encoded); + if (len < 0) + return -ENOMEM; + + p = memdup(roothash_sig_decoded, roothash_sig_decoded_size); + if (!p) + return -ENOMEM; + + free_and_replace(c->root_hash_sig, p); + c->root_hash_sig_size = roothash_sig_decoded_size; + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + + unit_write_settingf(u, flags, name, "RootHashSignature=base64:%s", encoded); + } + } + + return 1; + } + + if (streq(name, "RootHashSignaturePath")) { + c->root_hash_sig_size = 0; + c->root_hash_sig = mfree(c->root_hash_sig); + + return bus_set_transient_path(u, "RootHashSignature", &c->root_hash_sig_path, 
message, flags, error); + } + + if (streq(name, "RootVerity")) + return bus_set_transient_path(u, name, &c->root_verity, message, flags, error); + + if (streq(name, "RootDirectory")) + return bus_set_transient_path(u, name, &c->root_directory, message, flags, error); + + if (streq(name, "SyslogIdentifier")) + return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error); + + if (streq(name, "LogLevelMax")) + return bus_set_transient_log_level(u, name, &c->log_level_max, message, flags, error); + + if (streq(name, "LogRateLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &c->log_ratelimit_interval_usec, message, flags, error); + + if (streq(name, "LogRateLimitBurst")) + return bus_set_transient_unsigned(u, name, &c->log_ratelimit_burst, message, flags, error); + + if (streq(name, "Personality")) + return bus_set_transient_personality(u, name, &c->personality, message, flags, error); + + if (streq(name, "StandardInput")) + return bus_set_transient_std_input(u, name, &c->std_input, message, flags, error); + + if (streq(name, "StandardOutput")) + return bus_set_transient_std_output(u, name, &c->std_output, message, flags, error); + + if (streq(name, "StandardError")) + return bus_set_transient_std_output(u, name, &c->std_error, message, flags, error); + + if (streq(name, "IgnoreSIGPIPE")) + return bus_set_transient_bool(u, name, &c->ignore_sigpipe, message, flags, error); + + if (streq(name, "TTYVHangup")) + return bus_set_transient_bool(u, name, &c->tty_vhangup, message, flags, error); + + if (streq(name, "TTYReset")) + return bus_set_transient_bool(u, name, &c->tty_reset, message, flags, error); + + if (streq(name, "TTYVTDisallocate")) + return bus_set_transient_bool(u, name, &c->tty_vt_disallocate, message, flags, error); + + if (streq(name, "PrivateTmp")) + return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error); + + if (streq(name, "PrivateDevices")) + return bus_set_transient_bool(u, name, 
&c->private_devices, message, flags, error); + + if (streq(name, "PrivateMounts")) + return bus_set_transient_bool(u, name, &c->private_mounts, message, flags, error); + + if (streq(name, "PrivateNetwork")) + return bus_set_transient_bool(u, name, &c->private_network, message, flags, error); + + if (streq(name, "PrivateUsers")) + return bus_set_transient_bool(u, name, &c->private_users, message, flags, error); + + if (streq(name, "NoNewPrivileges")) + return bus_set_transient_bool(u, name, &c->no_new_privileges, message, flags, error); + + if (streq(name, "SyslogLevelPrefix")) + return bus_set_transient_bool(u, name, &c->syslog_level_prefix, message, flags, error); + + if (streq(name, "MemoryDenyWriteExecute")) + return bus_set_transient_bool(u, name, &c->memory_deny_write_execute, message, flags, error); + + if (streq(name, "RestrictRealtime")) + return bus_set_transient_bool(u, name, &c->restrict_realtime, message, flags, error); + + if (streq(name, "RestrictSUIDSGID")) + return bus_set_transient_bool(u, name, &c->restrict_suid_sgid, message, flags, error); + + if (streq(name, "DynamicUser")) + return bus_set_transient_bool(u, name, &c->dynamic_user, message, flags, error); + + if (streq(name, "RemoveIPC")) + return bus_set_transient_bool(u, name, &c->remove_ipc, message, flags, error); + + if (streq(name, "ProtectKernelTunables")) + return bus_set_transient_bool(u, name, &c->protect_kernel_tunables, message, flags, error); + + if (streq(name, "ProtectKernelModules")) + return bus_set_transient_bool(u, name, &c->protect_kernel_modules, message, flags, error); + + if (streq(name, "ProtectKernelLogs")) + return bus_set_transient_bool(u, name, &c->protect_kernel_logs, message, flags, error); + + if (streq(name, "ProtectClock")) + return bus_set_transient_bool(u, name, &c->protect_clock, message, flags, error); + + if (streq(name, "ProtectControlGroups")) + return bus_set_transient_bool(u, name, &c->protect_control_groups, message, flags, error); + + if (streq(name, 
"CPUSchedulingResetOnFork")) + return bus_set_transient_bool(u, name, &c->cpu_sched_reset_on_fork, message, flags, error); + + if (streq(name, "NonBlocking")) + return bus_set_transient_bool(u, name, &c->non_blocking, message, flags, error); + + if (streq(name, "LockPersonality")) + return bus_set_transient_bool(u, name, &c->lock_personality, message, flags, error); + + if (streq(name, "ProtectHostname")) + return bus_set_transient_bool(u, name, &c->protect_hostname, message, flags, error); + + if (streq(name, "UtmpIdentifier")) + return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error); + + if (streq(name, "UtmpMode")) + return bus_set_transient_utmp_mode(u, name, &c->utmp_mode, message, flags, error); + + if (streq(name, "PAMName")) + return bus_set_transient_string(u, name, &c->pam_name, message, flags, error); + + if (streq(name, "TimerSlackNSec")) + return bus_set_transient_nsec(u, name, &c->timer_slack_nsec, message, flags, error); + + if (streq(name, "ProtectSystem")) + return bus_set_transient_protect_system(u, name, &c->protect_system, message, flags, error); + + if (streq(name, "ProtectHome")) + return bus_set_transient_protect_home(u, name, &c->protect_home, message, flags, error); + + if (streq(name, "KeyringMode")) + return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error); + + if (streq(name, "ProtectProc")) + return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error); + + if (streq(name, "ProcSubset")) + return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error); + + if (streq(name, "RuntimeDirectoryPreserve")) + return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error); + + if (streq(name, "UMask")) + return bus_set_transient_mode_t(u, name, &c->umask, message, flags, error); + + if (streq(name, "RuntimeDirectoryMode")) + return bus_set_transient_mode_t(u, name, 
&c->directories[EXEC_DIRECTORY_RUNTIME].mode, message, flags, error); + + if (streq(name, "StateDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_STATE].mode, message, flags, error); + + if (streq(name, "CacheDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CACHE].mode, message, flags, error); + + if (streq(name, "LogsDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_LOGS].mode, message, flags, error); + + if (streq(name, "ConfigurationDirectoryMode")) + return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CONFIGURATION].mode, message, flags, error); + + if (streq(name, "SELinuxContext")) + return bus_set_transient_string(u, name, &c->selinux_context, message, flags, error); + + if (streq(name, "SecureBits")) + return bus_set_transient_secure_bits(u, name, &c->secure_bits, message, flags, error); + + if (streq(name, "CapabilityBoundingSet")) + return bus_set_transient_capability(u, name, &c->capability_bounding_set, message, flags, error); + + if (streq(name, "AmbientCapabilities")) + return bus_set_transient_capability(u, name, &c->capability_ambient_set, message, flags, error); + + if (streq(name, "RestrictNamespaces")) + return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error); + + if (streq(name, "MountFlags")) + return bus_set_transient_mount_flags(u, name, &c->mount_flags, message, flags, error); + + if (streq(name, "NetworkNamespacePath")) + return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error); + + if (streq(name, "SupplementaryGroups")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) + if (!isempty(*p) && !valid_user_group_name(*p, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) + return sd_bus_error_setf(error, 
SD_BUS_ERROR_INVALID_ARGS, + "Invalid supplementary group names"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->supplementary_groups = strv_free(c->supplementary_groups); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&c->supplementary_groups, l, true); + if (r < 0) + return -ENOMEM; + + joined = strv_join(c->supplementary_groups, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (streq(name, "SetCredential")) { + bool isempty = true; + + r = sd_bus_message_enter_container(message, 'a', "(say)"); + if (r < 0) + return r; + + for (;;) { + const char *id; + const void *p; + size_t sz; + + r = sd_bus_message_enter_container(message, 'r', "say"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(message, "s", &id); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!credential_name_valid(id)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id); + + isempty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *a = NULL, *b = NULL; + _cleanup_free_ void *copy = NULL; + ExecSetCredential *old; + + copy = memdup(p, sz); + if (!copy) + return -ENOMEM; + + old = hashmap_get(c->set_credentials, id); + if (old) { + free_and_replace(old->data, copy); + old->size = sz; + } else { + _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL; + + sc = new0(ExecSetCredential, 1); + if (!sc) + return -ENOMEM; + + sc->id = strdup(id); + if (!sc->id) + return -ENOMEM; + + sc->data = TAKE_PTR(copy); + sc->size = sz; + + r = hashmap_ensure_allocated(&c->set_credentials, &exec_set_credential_hash_ops); + if (r < 0) + return r; + + r = 
hashmap_put(c->set_credentials, sc->id, sc); + if (r < 0) + return r; + + TAKE_PTR(sc); + } + + a = specifier_escape(id); + if (!a) + return -ENOMEM; + + b = cescape_length(p, sz); + if (!b) + return -ENOMEM; + + (void) unit_write_settingf(u, flags, name, "%s=%s:%s", name, a, b); + } + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) { + c->set_credentials = hashmap_free(c->set_credentials); + (void) unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "LoadCredential")) { + bool isempty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + for (;;) { + const char *id, *source; + + r = sd_bus_message_read(message, "(ss)", &id, &source); + if (r < 0) + return r; + if (r == 0) + break; + + if (!credential_name_valid(id)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id); + + if (!(path_is_absolute(source) ? 
path_is_normalized(source) : credential_name_valid(source))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential source is invalid: %s", source); + + isempty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = strv_extend_strv(&c->load_credentials, STRV_MAKE(id, source), /* filter_duplicates = */ false); + if (r < 0) + return r; + + (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s:%s", name, id, source); + } + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) { + c->load_credentials = strv_free(c->load_credentials); + (void) unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "SyslogLevel")) { + int32_t level; + + r = sd_bus_message_read(message, "i", &level); + if (r < 0) + return r; + + if (!log_level_is_valid(level)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Log level value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->syslog_priority = (c->syslog_priority & LOG_FACMASK) | level; + unit_write_settingf(u, flags, name, "SyslogLevel=%i", level); + } + + return 1; + + } else if (streq(name, "SyslogFacility")) { + int32_t facility; + + r = sd_bus_message_read(message, "i", &facility); + if (r < 0) + return r; + + if (!log_facility_unshifted_is_valid(facility)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Log facility value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->syslog_priority = (facility << 3) | LOG_PRI(c->syslog_priority); + unit_write_settingf(u, flags, name, "SyslogFacility=%i", facility); + } + + return 1; + + } else if (streq(name, "LogNamespace")) { + const char *n; + + r = sd_bus_message_read(message, "s", &n); + if (r < 0) + return r; + + if (!isempty(n) && !log_namespace_name_valid(n)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Log namespace name not valid"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { 
+ + if (isempty(n)) { + c->log_namespace = mfree(c->log_namespace); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + r = free_and_strdup(&c->log_namespace, n); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, n); + } + } + + return 1; + + } else if (streq(name, "LogExtraFields")) { + size_t n = 0; + + r = sd_bus_message_enter_container(message, 'a', "ay"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ void *copy = NULL; + struct iovec *t; + const char *eq; + const void *p; + size_t sz; + + /* Note that we expect a byte array for each field, instead of a string. That's because on the + * lower-level journal fields can actually contain binary data and are not restricted to text, + * and we should not "lose precision" in our types on the way. That said, I am pretty sure + * actually encoding binary data as unit metadata is not a good idea. Hence we actually refuse + * any actual binary data, and only accept UTF-8. This allows us to eventually lift this + * limitation, should a good, valid usecase arise. 
*/ + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + if (r == 0) + break; + + if (memchr(p, 0, sz)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains zero byte"); + + eq = memchr(p, '=', sz); + if (!eq) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains no '=' character"); + if (!journal_field_valid(p, eq - (const char*) p, false)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field invalid"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec)); + if (!t) + return -ENOMEM; + c->log_extra_fields = t; + } + + copy = malloc(sz + 1); + if (!copy) + return -ENOMEM; + + memcpy(copy, p, sz); + ((uint8_t*) copy)[sz] = 0; + + if (!utf8_is_valid(copy)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field is not valid UTF-8"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE(copy, sz); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C, name, "LogExtraFields=%s", (char*) copy); + + copy = NULL; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && n == 0) { + exec_context_free_log_extra_fields(c); + unit_write_setting(u, flags, name, "LogExtraFields="); + } + + return 1; + } + +#if HAVE_SECCOMP + + if (streq(name, "SystemCallErrorNumber")) + return bus_set_transient_errno(u, name, &c->syscall_errno, message, flags, error); + + if (streq(name, "SystemCallFilter")) { + int allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r 
< 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT; + char **s; + + if (strv_isempty(l)) { + c->syscall_allow_list = false; + c->syscall_filter = hashmap_free(c->syscall_filter); + + unit_write_settingf(u, flags, name, "SystemCallFilter="); + return 1; + } + + if (!c->syscall_filter) { + c->syscall_filter = hashmap_new(NULL); + if (!c->syscall_filter) + return log_oom(); + + c->syscall_allow_list = allow_list; + + if (c->syscall_allow_list) { + r = seccomp_parse_syscall_filter("@default", + -1, + c->syscall_filter, + SECCOMP_PARSE_PERMISSIVE | + SECCOMP_PARSE_ALLOW_LIST | invert_flag, + u->id, + NULL, 0); + if (r < 0) + return r; + } + } + + STRV_FOREACH(s, l) { + _cleanup_free_ char *n = NULL; + int e; + + r = parse_syscall_and_errno(*s, &n, &e); + if (r < 0) + return r; + + r = seccomp_parse_syscall_filter(n, + e, + c->syscall_filter, + SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE | + invert_flag | + (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + u->id, + NULL, 0); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "SystemCallFilter=%s%s", allow_list ? "" : "~", joined); + } + + return 1; + + } else if (streq(name, "SystemCallLog")) { + int allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + SeccompParseFlags invert_flag = allow_list ? 
0 : SECCOMP_PARSE_INVERT; + char **s; + + if (strv_isempty(l)) { + c->syscall_log_allow_list = false; + c->syscall_log = hashmap_free(c->syscall_log); + + unit_write_settingf(u, flags, name, "SystemCallLog="); + return 1; + } + + if (!c->syscall_log) { + c->syscall_log = hashmap_new(NULL); + if (!c->syscall_log) + return log_oom(); + + c->syscall_log_allow_list = allow_list; + } + + STRV_FOREACH(s, l) { + _cleanup_free_ char *n = NULL; + int e; + + r = parse_syscall_and_errno(*s, &n, &e); + if (r < 0) + return r; + + r = seccomp_parse_syscall_filter(n, + 0, /* errno not used */ + c->syscall_log, + SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE | + invert_flag | + (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + u->id, + NULL, 0); + if (r < 0) + return r; + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "SystemCallLog=%s%s", allow_list ? "" : "~", joined); + } + + return 1; + + } else if (streq(name, "SystemCallArchitectures")) { + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + + if (strv_isempty(l)) + c->syscall_archs = set_free(c->syscall_archs); + else { + char **s; + + STRV_FOREACH(s, l) { + uint32_t a; + + r = seccomp_arch_from_string(*s, &a); + if (r < 0) + return r; + + r = set_ensure_put(&c->syscall_archs, NULL, UINT32_TO_PTR(a + 1)); + if (r < 0) + return r; + } + + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + + return 1; + + } else if (streq(name, "RestrictAddressFamilies")) { + int allow_list; + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_enter_container(message, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &allow_list); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) 
+ return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = NULL; + char **s; + + if (strv_isempty(l)) { + c->address_families_allow_list = false; + c->address_families = set_free(c->address_families); + + unit_write_settingf(u, flags, name, "RestrictAddressFamilies="); + return 1; + } + + if (!c->address_families) { + c->address_families = set_new(NULL); + if (!c->address_families) + return log_oom(); + + c->address_families_allow_list = allow_list; + } + + STRV_FOREACH(s, l) { + int af; + + af = af_from_name(*s); + if (af < 0) + return af; + + if (allow_list == c->address_families_allow_list) { + r = set_put(c->address_families, INT_TO_PTR(af)); + if (r < 0) + return r; + } else + (void) set_remove(c->address_families, INT_TO_PTR(af)); + } + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s%s", allow_list ? "" : "~", joined); + } + + return 1; + } +#endif + if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) { + const void *a; + size_t n; + bool affinity = streq(name, "CPUAffinity"); + _cleanup_(cpu_set_reset) CPUSet set = {}; + + r = sd_bus_message_read_array(message, 'y', &a, &n); + if (r < 0) + return r; + + r = cpu_set_from_dbus(a, n, &set); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (n == 0) { + cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *str = NULL; + + str = cpu_set_to_string(&set); + if (!str) + return -ENOMEM; + + /* We forego any optimizations here, and always create the structure using + * cpu_set_add_all(), because we don't want to care if the existing size we + * got over dbus is appropriate. */ + r = cpu_set_add_all(affinity ? 
&c->cpu_set : &c->numa_policy.nodes, &set); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, str); + } + } + + return 1; + + } else if (streq(name, "CPUAffinityFromNUMA")) { + int q; + + r = sd_bus_message_read_basic(message, 'b', &q); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_affinity_from_numa = q; + unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa"); + } + + return 1; + + } else if (streq(name, "NUMAPolicy")) { + int32_t type; + + r = sd_bus_message_read(message, "i", &type); + if (r < 0) + return r; + + if (!mpol_is_valid(type)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + c->numa_policy.type = type; + + return 1; + + } else if (streq(name, "Nice")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!nice_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid Nice value: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->nice = q; + c->nice_set = true; + + unit_write_settingf(u, flags, name, "Nice=%i", q); + } + + return 1; + + } else if (streq(name, "CPUSchedulingPolicy")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!sched_policy_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling policy: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *s = NULL; + + r = sched_policy_to_string_alloc(q, &s); + if (r < 0) + return r; + + c->cpu_sched_policy = q; + c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(q), sched_get_priority_max(q)); + c->cpu_sched_set = true; + + unit_write_settingf(u, flags, name, "CPUSchedulingPolicy=%s", s); + } + + return 1; + + } else if (streq(name, "CPUSchedulingPriority")) { + int32_t p, min, max; + + r = sd_bus_message_read(message, "i", &p); + if 
(r < 0) + return r; + + min = sched_get_priority_min(c->cpu_sched_policy); + max = sched_get_priority_max(c->cpu_sched_policy); + if (p < min || p > max) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling priority: %i", p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_sched_priority = p; + c->cpu_sched_set = true; + + unit_write_settingf(u, flags, name, "CPUSchedulingPriority=%i", p); + } + + return 1; + + } else if (streq(name, "IOSchedulingClass")) { + int32_t q; + + r = sd_bus_message_read(message, "i", &q); + if (r < 0) + return r; + + if (!ioprio_class_is_valid(q)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling class: %i", q); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *s = NULL; + + r = ioprio_class_to_string_alloc(q, &s); + if (r < 0) + return r; + + c->ioprio = IOPRIO_PRIO_VALUE(q, IOPRIO_PRIO_DATA(c->ioprio)); + c->ioprio_set = true; + + unit_write_settingf(u, flags, name, "IOSchedulingClass=%s", s); + } + + return 1; + + } else if (streq(name, "IOSchedulingPriority")) { + int32_t p; + + r = sd_bus_message_read(message, "i", &p); + if (r < 0) + return r; + + if (!ioprio_priority_is_valid(p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling priority: %i", p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_PRIO_CLASS(c->ioprio), p); + c->ioprio_set = true; + + unit_write_settingf(u, flags, name, "IOSchedulingPriority=%i", p); + } + + return 1; + + } else if (streq(name, "MountAPIVFS")) { + bool b; + + r = bus_set_transient_bool(u, name, &b, message, flags, error); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->mount_apivfs = b; + c->mount_apivfs_set = true; + } + + return 1; + + } else if (streq(name, "WorkingDirectory")) { + const char *s; + bool missing_ok; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (s[0] == '-') { + missing_ok = true; 
+ s++; + } else + missing_ok = false; + + if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (streq(s, "~")) { + c->working_directory = mfree(c->working_directory); + c->working_directory_home = true; + } else { + r = free_and_strdup(&c->working_directory, empty_to_null(s)); + if (r < 0) + return r; + + c->working_directory_home = false; + } + + c->working_directory_missing_ok = missing_ok; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s); + } + + return 1; + + } else if (STR_IN_SET(name, + "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s) && !fdname_is_valid(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (streq(name, "StandardInputFileDescriptorName")) { + r = free_and_strdup(c->stdio_fdname + STDIN_FILENO, empty_to_null(s)); + if (r < 0) + return r; + + c->std_input = EXEC_INPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=fd:%s", exec_context_fdname(c, STDIN_FILENO)); + + } else if (streq(name, "StandardOutputFileDescriptorName")) { + r = free_and_strdup(c->stdio_fdname + STDOUT_FILENO, empty_to_null(s)); + if (r < 0) + return r; + + c->std_output = EXEC_OUTPUT_NAMED_FD; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=fd:%s", exec_context_fdname(c, STDOUT_FILENO)); + + } else { + assert(streq(name, "StandardErrorFileDescriptorName")); + + r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + c->std_error = EXEC_OUTPUT_NAMED_FD; + unit_write_settingf(u, 
flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=fd:%s", exec_context_fdname(c, STDERR_FILENO)); + } + } + + return 1; + + } else if (STR_IN_SET(name, + "StandardInputFile", + "StandardOutputFile", "StandardOutputFileToAppend", + "StandardErrorFile", "StandardErrorFileToAppend")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s)) { + if (!path_is_absolute(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute", s); + if (!path_is_normalized(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", s); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + + if (streq(name, "StandardInputFile")) { + r = free_and_strdup(&c->stdio_file[STDIN_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + c->std_input = EXEC_INPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=file:%s", s); + + } else if (STR_IN_SET(name, "StandardOutputFile", "StandardOutputFileToAppend")) { + r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + if (streq(name, "StandardOutputFile")) { + c->std_output = EXEC_OUTPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=file:%s", s); + } else { + assert(streq(name, "StandardOutputFileToAppend")); + c->std_output = EXEC_OUTPUT_FILE_APPEND; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=append:%s", s); + } + } else { + assert(STR_IN_SET(name, "StandardErrorFile", "StandardErrorFileToAppend")); + + r = free_and_strdup(&c->stdio_file[STDERR_FILENO], empty_to_null(s)); + if (r < 0) + return r; + + if (streq(name, "StandardErrorFile")) { + c->std_error = EXEC_OUTPUT_FILE; + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=file:%s", s); + } else { + assert(streq(name, "StandardErrorFileToAppend")); + c->std_error = EXEC_OUTPUT_FILE_APPEND; + unit_write_settingf(u, 
flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=append:%s", s); + } + } + } + + return 1; + + } else if (streq(name, "StandardInputData")) { + const void *p; + size_t sz; + + r = sd_bus_message_read_array(message, 'y', &p, &sz); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *encoded = NULL; + + if (sz == 0) { + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + + unit_write_settingf(u, flags, name, "StandardInputData="); + } else { + void *q; + ssize_t n; + + if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) + return -E2BIG; + + n = base64mem(p, sz, &encoded); + if (n < 0) + return (int) n; + + q = realloc(c->stdin_data, c->stdin_data_size + sz); + if (!q) + return -ENOMEM; + + memcpy((uint8_t*) q + c->stdin_data_size, p, sz); + + c->stdin_data = q; + c->stdin_data_size += sz; + + unit_write_settingf(u, flags, name, "StandardInputData=%s", encoded); + } + } + + return 1; + + } else if (streq(name, "Environment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_is_valid(l)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment block."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->environment = strv_free(c->environment); + unit_write_setting(u, flags, name, "Environment="); + } else { + _cleanup_free_ char *joined = NULL; + char **e; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C); + if (!joined) + return -ENOMEM; + + e = strv_env_merge(2, c->environment, l); + if (!e) + return -ENOMEM; + + strv_free_and_replace(c->environment, e); + unit_write_settingf(u, flags, name, "Environment=%s", joined); + } + } + + return 1; + + } else if (streq(name, "UnsetEnvironment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return 
r; + + if (!strv_env_name_or_assignment_is_valid(l)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UnsetEnvironment= list."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->unset_environment = strv_free(c->unset_environment); + unit_write_setting(u, flags, name, "UnsetEnvironment="); + } else { + _cleanup_free_ char *joined = NULL; + char **e; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C); + if (!joined) + return -ENOMEM; + + e = strv_env_merge(2, c->unset_environment, l); + if (!e) + return -ENOMEM; + + strv_free_and_replace(c->unset_environment, e); + unit_write_settingf(u, flags, name, "UnsetEnvironment=%s", joined); + } + } + + return 1; + + } else if (streq(name, "OOMScoreAdjust")) { + int oa; + + r = sd_bus_message_read(message, "i", &oa); + if (r < 0) + return r; + + if (!oom_score_adjust_is_valid(oa)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "OOM score adjust value out of range"); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->oom_score_adjust = oa; + c->oom_score_adjust_set = true; + unit_write_settingf(u, flags, name, "OOMScoreAdjust=%i", oa); + } + + return 1; + + } else if (streq(name, "CoredumpFilter")) { + uint64_t f; + + r = sd_bus_message_read(message, "t", &f); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->coredump_filter = f; + c->coredump_filter_set = true; + unit_write_settingf(u, flags, name, "CoredumpFilter=0x%"PRIx64, f); + } + + return 1; + + } else if (streq(name, "EnvironmentFiles")) { + + _cleanup_free_ char *joined = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **l = NULL; + size_t size = 0; + char **i; + + r = sd_bus_message_enter_container(message, 'a', "(sb)"); + if (r < 0) + return r; + + f = open_memstream_unlocked(&joined, &size); + if (!f) + return -ENOMEM; + + fputs("EnvironmentFile=\n", f); + + STRV_FOREACH(i, c->environment_files) { + _cleanup_free_ char *q = NULL; + + q = 
specifier_escape(*i); + if (!q) + return -ENOMEM; + + fprintf(f, "EnvironmentFile=%s\n", q); + } + + while ((r = sd_bus_message_enter_container(message, 'r', "sb")) > 0) { + const char *path; + int b; + + r = sd_bus_message_read(message, "sb", &path, &b); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *q = NULL, *buf = NULL; + + buf = strjoin(b ? "-" : "", path); + if (!buf) + return -ENOMEM; + + q = specifier_escape(buf); + if (!q) + return -ENOMEM; + + fprintf(f, "EnvironmentFile=%s\n", q); + + r = strv_consume(&l, TAKE_PTR(buf)); + if (r < 0) + return r; + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->environment_files = strv_free(c->environment_files); + unit_write_setting(u, flags, name, "EnvironmentFile="); + } else { + r = strv_extend_strv(&c->environment_files, l, true); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, joined); + } + } + + return 1; + + } else if (streq(name, "PassEnvironment")) { + + _cleanup_strv_free_ char **l = NULL; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (!strv_env_name_is_valid(l)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PassEnvironment= block."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + c->pass_environment = strv_free(c->pass_environment); + unit_write_setting(u, flags, name, "PassEnvironment="); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&c->pass_environment, l, true); + if (r < 0) + return r; + + /* We write just the new settings out to file, with unresolved specifiers. 
*/ + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "PassEnvironment=%s", joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "ReadWriteDirectories", "ReadOnlyDirectories", "InaccessibleDirectories", + "ReadWritePaths", "ReadOnlyPaths", "InaccessiblePaths")) { + _cleanup_strv_free_ char **l = NULL; + char ***dirs; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + char *i = *p; + size_t offset; + + offset = i[0] == '-'; + offset += i[offset] == '+'; + if (!path_is_absolute(i + offset)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); + + path_simplify(i + offset, false); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (STR_IN_SET(name, "ReadWriteDirectories", "ReadWritePaths")) + dirs = &c->read_write_paths; + else if (STR_IN_SET(name, "ReadOnlyDirectories", "ReadOnlyPaths")) + dirs = &c->read_only_paths; + else /* "InaccessiblePaths" */ + dirs = &c->inaccessible_paths; + + if (strv_isempty(l)) { + *dirs = strv_free(*dirs); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + r = strv_extend_strv(dirs, l, true); + if (r < 0) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "RuntimeDirectory", "StateDirectory", "CacheDirectory", "LogsDirectory", "ConfigurationDirectory")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!path_is_normalized(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is not normalized: %s", name, *p); + + if (path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is 
absolute: %s", name, *p); + + if (path_startswith(*p, "private")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path can't be 'private': %s", name, *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + ExecDirectoryType i; + ExecDirectory *d; + + assert_se((i = exec_directory_type_from_string(name)) >= 0); + d = c->directories + i; + + if (strv_isempty(l)) { + d->paths = strv_free(d->paths); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&d->paths, l, true); + if (r < 0) + return r; + + joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (STR_IN_SET(name, "AppArmorProfile", "SmackProcessLabel")) { + int ignore; + const char *s; + + r = sd_bus_message_read(message, "(bs)", &ignore, &s); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char **p; + bool *b; + + if (streq(name, "AppArmorProfile")) { + p = &c->apparmor_profile; + b = &c->apparmor_profile_ignore; + } else { /* "SmackProcessLabel" */ + p = &c->smack_process_label; + b = &c->smack_process_label_ignore; + } + + if (isempty(s)) { + *p = mfree(*p); + *b = false; + } else { + if (free_and_strdup(p, s) < 0) + return -ENOMEM; + *b = ignore; + } + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s%s", name, ignore ? 
"-" : "", strempty(s)); + } + + return 1; + + } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) { + char *source, *destination; + int ignore_enoent; + uint64_t mount_flags; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ssbt)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ssbt)", &source, &destination, &ignore_enoent, &mount_flags)) > 0) { + + if (!path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source); + if (!path_is_absolute(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination); + if (!IN_SET(mount_flags, 0, MS_REC)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mount flags."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = source, + .destination = destination, + .read_only = !!strstr(name, "ReadOnly"), + .recursive = !!(mount_flags & MS_REC), + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s%s:%s:%s", + name, + ignore_enoent ? "-" : "", + source, + destination, + (mount_flags & MS_REC) ? 
"rbind" : "norbind"); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty) { + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if (streq(name, "TemporaryFileSystem")) { + const char *path, *options; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &path, &options)) > 0) { + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Mount point %s is not absolute.", path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options); + if (r < 0) + return r; + + unit_write_settingf( + u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s:%s", + name, + path, + options); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (empty) { + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + + unit_write_settingf(u, flags, name, "%s=", name); + } + + return 1; + + } else if ((suffix = startswith(name, "Limit"))) { + const char *soft = NULL; + int ri; + + ri = rlimit_from_string(suffix); + if (ri < 0) { + soft = endswith(suffix, "Soft"); + if (soft) { + const char *n; + + n = strndupa(suffix, soft - suffix); + ri = rlimit_from_string(n); + if (ri >= 0) + name = strjoina("Limit", n); + } + } + + if (ri >= 0) { + uint64_t rl; + rlim_t x; + + r = sd_bus_message_read(message, "t", &rl); + if (r < 0) + return r; + + if (rl == (uint64_t) -1) + x = RLIM_INFINITY; + else { + x = (rlim_t) rl; + + if ((uint64_t) x != rl) + return -ERANGE; + } + + if 
(!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *f = NULL; + struct rlimit nl; + + if (c->rlimit[ri]) { + nl = *c->rlimit[ri]; + + if (soft) + nl.rlim_cur = x; + else + nl.rlim_max = x; + } else + /* When the resource limit is not initialized yet, then assign the value to both fields */ + nl = (struct rlimit) { + .rlim_cur = x, + .rlim_max = x, + }; + + r = rlimit_format(&nl, &f); + if (r < 0) + return r; + + if (c->rlimit[ri]) + *c->rlimit[ri] = nl; + else { + c->rlimit[ri] = newdup(struct rlimit, &nl, 1); + if (!c->rlimit[ri]) + return -ENOMEM; + } + + unit_write_settingf(u, flags, name, "%s=%s", name, f); + } + + return 1; + } + + } else if (streq(name, "MountImages")) { + _cleanup_free_ char *format_str = NULL; + MountImage *mount_images = NULL; + size_t n_mount_images = 0; + char *source, *destination; + int permissive; + + r = sd_bus_message_enter_container(message, 'a', "(ssba(ss))"); + if (r < 0) + return r; + + for (;;) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *source_escaped = NULL, *destination_escaped = NULL; + char *tuple; + + r = sd_bus_message_enter_container(message, 'r', "ssba(ss)"); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssb", &source, &destination, &permissive); + if (r <= 0) + break; + + if (!path_is_absolute(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source); + if (!path_is_normalized(source)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source); + if (!path_is_absolute(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination); + if (!path_is_normalized(destination)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination); + + /* Need to store them in the unit with the escapes, so that they can be parsed again */ + 
source_escaped = shell_escape(source, ":"); + if (!source_escaped) + return -ENOMEM; + destination_escaped = shell_escape(destination, ":"); + if (!destination_escaped) + return -ENOMEM; + + tuple = strjoin(format_str, + format_str ? " " : "", + permissive ? "-" : "", + source_escaped, + ":", + destination_escaped); + if (!tuple) + return -ENOMEM; + free_and_replace(format_str, tuple); + + r = read_mount_options(message, error, &options, &format_str, ":"); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = mount_image_add(&mount_images, &n_mount_images, + &(MountImage) { + .source = source, + .destination = destination, + .mount_options = options, + .ignore_enoent = permissive, + }); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (n_mount_images == 0) { + c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images); + + unit_write_settingf(u, flags, name, "%s=", name); + } else { + for (size_t i = 0; i < n_mount_images; ++i) { + r = mount_image_add(&c->mount_images, &c->n_mount_images, &mount_images[i]); + if (r < 0) + return r; + } + + unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS, + name, + "%s=%s", + name, + format_str); + } + } + + mount_images = mount_image_free_many(mount_images, &n_mount_images); + + return 1; + } + + return 0; +} diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h new file mode 100644 index 0000000..c538341 --- /dev/null +++ b/src/core/dbus-execute.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "execute.h" + +#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix 
"ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \ + SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \ + SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \ + SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags) + +#define BUS_EXEC_COMMAND_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command, offset, flags) + +#define BUS_EXEC_COMMAND_LIST_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command_list, offset, flags) + +#define BUS_EXEC_EX_COMMAND_LIST_VTABLE(name, offset, flags) \ + SD_BUS_PROPERTY(name, "a(sasasttttuii)", bus_property_get_exec_ex_command_list, offset, flags) + +extern const sd_bus_vtable bus_exec_vtable[]; + +int bus_property_get_exec_output(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_command(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_exec_ex_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); + +int bus_exec_context_set_transient_property(Unit *u, ExecContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_exec_command(Unit *u, const char *name, ExecCommand **exec_command, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-job.c 
b/src/core/dbus-job.c new file mode 100644 index 0000000..1526b31 --- /dev/null +++ b/src/core/dbus-job.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-util.h" +#include "dbus-job.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "job.h" +#include "log.h" +#include "selinux-access.h" +#include "string-util.h" +#include "strv.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, job_type, JobType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_state, job_state, JobState); + +static int property_get_unit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Job *j = userdata; + + assert(bus); + assert(reply); + assert(j); + + p = unit_dbus_path(j->unit); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(so)", j->unit->id, p); +} + +int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Job *j = userdata; + int r; + + assert(message); + assert(j); + + r = mac_selinux_unit_access_check(j->unit, message, "stop", error); + if (r < 0) + return r; + + /* Access is granted to the job owner */ + if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) { + + /* And for everybody else consult polkit */ + r = bus_verify_manage_units_async(j->unit->manager, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + } + + job_finish_and_invalidate(j, JOB_CANCELED, true, false); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ Job **list = NULL; 
+ Job *j = userdata; + int r, i, n; + + if (strstr(sd_bus_message_get_member(message), "After")) + n = job_get_after(j, &list); + else + n = job_get_before(j, &list); + if (n < 0) + return n; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(usssoo)"); + if (r < 0) + return r; + + for (i = 0; i < n; i ++) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + + job_path = job_dbus_path(list[i]); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(list[i]->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(usssoo)", + list[i]->id, + list[i]->unit->id, + job_type_to_string(list[i]->type), + job_state_to_string(list[i]->state), + job_path, + unit_path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +const sd_bus_vtable bus_job_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_METHOD("Cancel", NULL, NULL, bus_job_method_cancel, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetAfter", + NULL,, + "a(usssoo)", + SD_BUS_PARAM(jobs), + bus_job_method_get_waiting_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetBefore", + NULL,, + "a(usssoo)", + SD_BUS_PARAM(jobs), + bus_job_method_get_waiting_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Job, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Unit", "(so)", property_get_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobType", "s", property_get_type, offsetof(Job, type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("State", "s", property_get_state, offsetof(Job, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; + +static int bus_job_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + Job *j; + int 
r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = manager_get_job_from_dbus_path(m, path, &j); + if (r < 0) + return 0; + + *found = j; + return 1; +} + +static int bus_job_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + unsigned k = 0; + Job *j; + + l = new0(char*, hashmap_size(m->jobs)+1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(j, m->jobs) { + l[k] = job_dbus_path(j); + if (!l[k]) + return -ENOMEM; + + k++; + } + + assert(hashmap_size(m->jobs) == k); + + *nodes = TAKE_PTR(l); + + return k; +} + +const BusObjectImplementation job_object = { + "/org/freedesktop/systemd1/job", + "org.freedesktop.systemd1.Job", + .fallback_vtables = BUS_FALLBACK_VTABLES({bus_job_vtable, bus_job_find}), + .node_enumerator = bus_job_enumerate, +}; + +static int send_new_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Job *j = userdata; + int r; + + assert(bus); + assert(j); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobNew"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "uos", j->id, p, j->unit->id); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + _cleanup_free_ char *p = NULL; + Job *j = userdata; + + assert(bus); + assert(j); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + return sd_bus_emit_properties_changed(bus, p, "org.freedesktop.systemd1.Job", "State", NULL); +} + +void bus_job_send_change_signal(Job *j) { + int r; + + assert(j); + + /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */ + bus_unit_send_pending_change_signal(j->unit, 
true); + + if (j->in_dbus_queue) { + LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + } + + r = bus_foreach_bus(j->manager, j->bus_track, j->sent_dbus_new_signal ? send_changed_signal : send_new_signal, j); + if (r < 0) + log_debug_errno(r, "Failed to send job change signal for %u: %m", j->id); + + j->sent_dbus_new_signal = true; +} + +void bus_job_send_pending_change_signal(Job *j, bool including_new) { + assert(j); + + if (!j->in_dbus_queue) + return; + + if (!j->sent_dbus_new_signal && !including_new) + return; + + if (MANAGER_IS_RELOADING(j->unit->manager)) + return; + + bus_job_send_change_signal(j); +} + +static int send_removed_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Job *j = userdata; + int r; + + assert(bus); + assert(j); + + p = job_dbus_path(j); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "uoss", j->id, p, j->unit->id, job_result_to_string(j->result)); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +void bus_job_send_removed_signal(Job *j) { + int r; + + assert(j); + + if (!j->sent_dbus_new_signal) + bus_job_send_change_signal(j); + + /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */ + bus_unit_send_pending_change_signal(j->unit, true); + + r = bus_foreach_bus(j->manager, j->bus_track, send_removed_signal, j); + if (r < 0) + log_debug_errno(r, "Failed to send job remove signal for %u: %m", j->id); +} + +static int bus_job_track_handler(sd_bus_track *t, void *userdata) { + Job *j = userdata; + + assert(t); + assert(j); + + j->bus_track = sd_bus_track_unref(j->bus_track); /* make sure we aren't called again */ + + /* Last client dropped off the bus, maybe we should GC this 
now? */ + job_add_to_gc_queue(j); + return 0; +} + +static int bus_job_allocate_bus_track(Job *j) { + + assert(j); + + if (j->bus_track) + return 0; + + return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j); +} + +int bus_job_coldplug_bus_track(Job *j) { + int r; + _cleanup_strv_free_ char **deserialized_clients = NULL; + + assert(j); + + deserialized_clients = TAKE_PTR(j->deserialized_clients); + + if (strv_isempty(deserialized_clients)) + return 0; + + if (!j->manager->api_bus) + return 0; + + r = bus_job_allocate_bus_track(j); + if (r < 0) + return r; + + return bus_track_add_name_many(j->bus_track, deserialized_clients); +} + +int bus_job_track_sender(Job *j, sd_bus_message *m) { + int r; + + assert(j); + assert(m); + + if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) { + j->ref_by_private_bus = true; + return 0; + } + + r = bus_job_allocate_bus_track(j); + if (r < 0) + return r; + + return sd_bus_track_add_sender(j->bus_track, m); +} diff --git a/src/core/dbus-job.h b/src/core/dbus-job.h new file mode 100644 index 0000000..6f00581 --- /dev/null +++ b/src/core/dbus-job.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "unit.h" +#include "bus-object.h" + +extern const sd_bus_vtable bus_job_vtable[]; +extern const BusObjectImplementation job_object; + +int bus_job_method_cancel(sd_bus_message *message, void *job, sd_bus_error *error); +int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error); + +void bus_job_send_change_signal(Job *j); +void bus_job_send_pending_change_signal(Job *j, bool including_new); +void bus_job_send_removed_signal(Job *j); + +int bus_job_coldplug_bus_track(Job *j); +int bus_job_track_sender(Job *j, sd_bus_message *m); diff --git a/src/core/dbus-kill.c b/src/core/dbus-kill.c new file mode 100644 index 0000000..6333f3b --- /dev/null +++ b/src/core/dbus-kill.c @@ -0,0 +1,83 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-get-properties.h" +#include "dbus-kill.h" +#include "dbus-util.h" +#include "kill.h" +#include "signal-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_kill_mode, kill_mode, KillMode); + +static int property_get_restart_kill_signal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + KillContext *c = userdata; + int s; + + assert(c); + + s = restart_kill_signal(c); + return sd_bus_message_append_basic(reply, 'i', &s); +} + +const sd_bus_vtable bus_kill_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("KillMode", "s", property_get_kill_mode, offsetof(KillContext, kill_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KillSignal", "i", bus_property_get_int, offsetof(KillContext, kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartKillSignal", "i", property_get_restart_kill_signal, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FinalKillSignal", "i", bus_property_get_int, offsetof(KillContext, final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendSIGKILL", "b", bus_property_get_bool, offsetof(KillContext, send_sigkill), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendSIGHUP", "b", bus_property_get_bool, offsetof(KillContext, send_sighup), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogSignal", "i", bus_property_get_int, offsetof(KillContext, watchdog_signal), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static BUS_DEFINE_SET_TRANSIENT_PARSE(kill_mode, KillMode, kill_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(restart_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(final_kill_signal, "i", int32_t, int, "%" 
PRIi32, signal_to_string_with_check); +static BUS_DEFINE_SET_TRANSIENT_TO_STRING(watchdog_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check); + +int bus_kill_context_set_transient_property( + Unit *u, + KillContext *c, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + assert(u); + assert(c); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "KillMode")) + return bus_set_transient_kill_mode(u, name, &c->kill_mode, message, flags, error); + + if (streq(name, "SendSIGHUP")) + return bus_set_transient_bool(u, name, &c->send_sighup, message, flags, error); + + if (streq(name, "SendSIGKILL")) + return bus_set_transient_bool(u, name, &c->send_sigkill, message, flags, error); + + if (streq(name, "KillSignal")) + return bus_set_transient_kill_signal(u, name, &c->kill_signal, message, flags, error); + + if (streq(name, "RestartKillSignal")) + return bus_set_transient_restart_kill_signal(u, name, &c->restart_kill_signal, message, flags, error); + + if (streq(name, "FinalKillSignal")) + return bus_set_transient_final_kill_signal(u, name, &c->final_kill_signal, message, flags, error); + + if (streq(name, "WatchdogSignal")) + return bus_set_transient_watchdog_signal(u, name, &c->watchdog_signal, message, flags, error); + + return 0; +} diff --git a/src/core/dbus-kill.h b/src/core/dbus-kill.h new file mode 100644 index 0000000..5a90287 --- /dev/null +++ b/src/core/dbus-kill.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "kill.h" +#include "unit.h" + +extern const sd_bus_vtable bus_kill_vtable[]; + +int bus_kill_context_set_transient_property(Unit *u, KillContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c new file mode 100644 index 0000000..b37ed7c --- /dev/null +++ 
b/src/core/dbus-manager.c @@ -0,0 +1,3317 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <sys/prctl.h> +#include <sys/statvfs.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "architecture.h" +#include "build.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-log-control-api.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "install.h" +#include "log.h" +#include "os-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "selinux-access.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "user-util.h" +#include "virt.h" +#include "watchdog.h" + +/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state there, and if + * we can't we'll fail badly. */ +#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024)) + +static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) { + return (runtime ? UNIT_FILE_RUNTIME : 0) | + (force ? 
UNIT_FILE_FORCE : 0); +} + +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy); + +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", SYSTEMD_FEATURES); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture())); +static BUS_DEFINE_PROPERTY_GET2(property_get_system_state, "s", Manager, manager_state, manager_state_to_string); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_timer_slack_nsec, "t", (uint64_t) prctl(PR_GET_TIMERSLACK)); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "u", Hashmap *, hashmap_size); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_set_size, "u", Set *, set_size); +static BUS_DEFINE_PROPERTY_GET(property_get_default_timeout_abort_usec, "t", Manager, manager_default_timeout_abort_usec); + +static int property_get_virtualization( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + int v; + + assert(bus); + assert(reply); + + v = detect_virtualization(); + + /* Make sure to return the empty string when we detect no virtualization, as that is the API. + * + * https://github.com/systemd/systemd/issues/1423 + */ + + return sd_bus_message_append( + reply, "s", + v == VIRTUALIZATION_NONE ? 
NULL : virtualization_to_string(v)); +} + +static int property_get_tainted( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *s = NULL; + Manager *m = userdata; + + assert(bus); + assert(reply); + assert(m); + + s = manager_taint_string(m); + if (!s) + return log_oom(); + + return sd_bus_message_append(reply, "s", s); +} + +static int property_set_log_target( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + manager_restore_original_log_target(m); + else { + LogTarget target; + + target = log_target_from_string(t); + if (target < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t); + + manager_override_log_target(m, target); + } + + return 0; +} + +static int property_set_log_level( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + manager_restore_original_log_level(m); + else { + int level; + + level = log_level_from_string(t); + if (level < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t); + + manager_override_log_level(m, level); + } + + return 0; +} + +static int property_get_progress( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + double d; + + assert(bus); + 
assert(reply); + assert(m); + + if (MANAGER_IS_FINISHED(m)) + d = 1.0; + else + d = 1.0 - ((double) hashmap_size(m->jobs) / (double) m->n_installed_jobs); + + return sd_bus_message_append(reply, "d", d); +} + +static int property_get_environment( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + int r; + + assert(bus); + assert(reply); + assert(m); + + r = manager_get_effective_environment(m, &l); + if (r < 0) + return r; + + return sd_bus_message_append_strv(reply, l); +} + +static int property_get_show_status( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + + assert(m); + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", manager_get_show_status_on(m)); +} + +static int property_get_runtime_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + + assert(m); + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_RUNTIME)); +} + +static int property_get_reboot_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + + assert(m); + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_REBOOT)); +} + +static int property_get_kexec_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = userdata; + + assert(m); + assert(bus); + assert(reply); + + return 
sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_KEXEC)); +} + +static int property_set_watchdog(Manager *m, WatchdogType type, sd_bus_message *value) { + usec_t timeout; + int r; + + assert(m); + assert(value); + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + r = sd_bus_message_read(value, "t", &timeout); + if (r < 0) + return r; + + return manager_override_watchdog(m, type, timeout); +} + +static int property_set_runtime_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + return property_set_watchdog(userdata, WATCHDOG_RUNTIME, value); +} + +static int property_set_reboot_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + return property_set_watchdog(userdata, WATCHDOG_REBOOT, value); +} + +static int property_set_kexec_watchdog( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + _unused_ Manager *m = userdata; + + assert(m); + assert(bus); + assert(value); + + return property_set_watchdog(userdata, WATCHDOG_KEXEC, value); +} + +static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) { + Unit *u; + int r; + + assert(m); + assert(message); + assert(ret_unit); + + /* More or less a wrapper around manager_get_unit() that generates nice errors and has one trick up its sleeve: + * if the name is specified empty we use the client's unit. 
*/ + + if (isempty(name)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit."); + } else { + u = manager_get_unit(m, name); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", name); + } + + *ret_unit = u; + return 0; +} + +static int bus_load_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) { + assert(m); + assert(message); + assert(ret_unit); + + /* Pretty much the same as bus_get_unit_by_name(), but we also load the unit if necessary. */ + + if (isempty(name)) + return bus_get_unit_by_name(m, message, name, ret_unit, error); + + return manager_load_unit(m, name, NULL, error, ret_unit); +} + +static int reply_unit_path(Unit *u, sd_bus_message *message, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + int r; + + assert(u); + assert(message); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + path = unit_dbus_path(u); + if (!path) + return log_oom(); + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return reply_unit_path(u, message, error); +} + +static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + pid_t pid; + Unit *u; + 
int r; + + assert(message); + assert(m); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "u", &pid); + if (r < 0) + return r; + if (pid < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid); + + if (pid == 0) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid); + + return reply_unit_path(u, message, error); +} + +static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + sd_id128_t id; + const void *a; + Unit *u; + size_t sz; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read_array(message, 'y', &a, &sz); + if (r < 0) + return r; + if (sz == 0) + id = SD_ID128_NULL; + else if (sz == 16) + memcpy(&id, a, sz); + else + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID"); + + if (sd_id128_is_null(id)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Client " PID_FMT " not member of any unit.", pid); + } else { + u = hashmap_get(m->units_by_invocation_id, &id); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", 
SD_ID128_FORMAT_VAL(id)); + } + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + /* So here's a special trick: the bus path we return actually references the unit by its invocation ID instead + * of the unit name. This means it stays valid only as long as the invocation ID stays the same. */ + path = unit_dbus_path_invocation_id(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_get_unit_by_control_group(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *cgroup; + Unit *u; + int r; + + r = sd_bus_message_read(message, "s", &cgroup); + if (r < 0) + return r; + + u = manager_get_unit_by_cgroup(m, cgroup); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Control group '%s' is not valid or not managed by this instance", cgroup); + + return reply_unit_path(u, message, error); +} + +static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_load_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return reply_unit_path(u, message, error); +} + +static int method_start_unit_generic(sd_bus_message *message, Manager *m, JobType job_type, bool reload_if_possible, sd_bus_error *error) { + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_load_unit(m, name, NULL, error, &u); + if (r < 0) + return r; + + return bus_unit_method_start_generic(message, u, job_type, reload_if_possible, error); +} + +static int method_start_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return 
method_start_unit_generic(message, userdata, JOB_START, false, error); +} + +static int method_stop_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_STOP, false, error); +} + +static int method_reload_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RELOAD, false, error); +} + +static int method_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RESTART, false, error); +} + +static int method_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, false, error); +} + +static int method_reload_or_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_RESTART, true, error); +} + +static int method_reload_or_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, true, error); +} + +typedef enum GenericUnitOperationFlags { + GENERIC_UNIT_LOAD = 1 << 0, /* Load if the unit is not loaded yet */ + GENERIC_UNIT_VALIDATE_LOADED = 1 << 1, /* Verify unit is properly loaded before forwarding call */ +} GenericUnitOperationFlags; + +static int method_generic_unit_operation( + sd_bus_message *message, + Manager *m, + sd_bus_error *error, + sd_bus_message_handler_t handler, + GenericUnitOperationFlags flags) { + + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Read the first argument from the command and pass the operation to the specified per-unit + * method. 
*/ + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + if (!isempty(name) && FLAGS_SET(flags, GENERIC_UNIT_LOAD)) + r = manager_load_unit(m, name, NULL, error, &u); + else + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + if (FLAGS_SET(flags, GENERIC_UNIT_VALIDATE_LOADED)) { + r = bus_unit_validate_load_state(u, error); + if (r < 0) + return r; + } + + return handler(message, u, error); +} + +static int method_enqueue_unit_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* We don't bother with GENERIC_UNIT_VALIDATE_LOADED here, as the job logic validates that anyway */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_enqueue_job, GENERIC_UNIT_LOAD); +} + +static int method_start_unit_replace(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *old_name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &old_name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, old_name, &u, error); + if (r < 0) + return r; + if (!u->job || u->job->type != JOB_START) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "No job queued for unit %s", old_name); + + return method_start_unit_generic(message, m, JOB_START, false, error); +} + +static int method_kill_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* We don't bother with GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED here, as it shouldn't + * matter whether a unit is loaded for killing any processes possibly in the unit's cgroup. 
*/ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_kill, 0); +} + +static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Load the unit if it is not loaded yet, and insist on it being properly loaded before it is + * cleaned */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_clean, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* No flags are passed: neither GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED is requested */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, 0); +} + +static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Same flags as for freezing: neither GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, 0); +} + +static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't load the unit (because unloaded units can't be in failed state), and don't insist on the + * unit to be loaded properly (since a failed unit might have its unit file disappeared) */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_reset_failed, 0); +} + +static int method_set_unit_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only change properties on fully loaded units, and load them in order to set properties */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_set_properties, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Only allow reffing of fully loaded units, and make sure reffing a unit loads it. 
*/ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_ref, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + +static int method_unref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Dropping a ref OTOH should not require the unit to still be loaded. And since a reffed unit is a + * loaded unit there's no need to load the unit for unreffing it. */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_unref, 0); +} + +static int reply_unit_info(sd_bus_message *reply, Unit *u) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + Unit *following; + + following = unit_following(u); + + unit_path = unit_dbus_path(u); + if (!unit_path) + return -ENOMEM; + + if (u->job) { + job_path = job_dbus_path(u->job); + if (!job_path) + return -ENOMEM; + } + + return sd_bus_message_append( + reply, "(ssssssouso)", + u->id, + unit_description(u), + unit_load_state_to_string(u->load_state), + unit_active_state_to_string(unit_active_state(u)), + unit_sub_state_to_string(u), + following ? following->id : "", + unit_path, + u->job ? u->job->id : 0, + u->job ? 
job_type_to_string(u->job->type) : "", + empty_to_root(job_path)); +} + +/* Bus method handler: reads an array of unit names from the message, looks each valid one up via + * bus_load_unit_by_name() and replies with an array of unit info structs appended by + * reply_unit_info(). Any failure while processing a name aborts the whole call. */ +static int method_list_units_by_names(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + int r; + char **unit; + _cleanup_strv_free_ char **units = NULL; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &units); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)"); + if (r < 0) + return r; + + STRV_FOREACH(unit, units) { + Unit *u; + + /* Entries that are not valid unit names are skipped without raising an error */ + if (!unit_name_is_valid(*unit, UNIT_NAME_ANY)) + continue; + + r = bus_load_unit_by_name(m, message, *unit, &u, error); + if (r < 0) + return r; + + r = reply_unit_info(reply, u); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't load a unit (since it won't have any processes if it's not loaded), but don't insist on the + * unit being loaded (because even improperly loaded units might still have processes around) */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, 0); +} + +static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't allow attaching new processes to units that aren't loaded. Don't bother with loading a unit + * for this purpose though, as an unloaded unit is a stopped unit, and we don't allow attaching + * processes to stopped units anyway. 
*/ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED); +} + +static int transient_unit_from_message( + Manager *m, + sd_bus_message *message, + const char *name, + Unit **unit, + sd_bus_error *error) { + + UnitType t; + Unit *u; + int r; + + assert(m); + assert(message); + assert(name); + + t = unit_name_to_type(name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name or type."); + + if (!unit_vtable[t]->can_transient) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit type %s does not support transient units.", unit_type_to_string(t)); + + r = manager_load_unit(m, name, NULL, error, &u); + if (r < 0) + return r; + + if (!unit_is_pristine(u)) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, "Unit %s already exists.", name); + + /* OK, the unit failed to load and is unreferenced, now let's + * fill in the transient data instead */ + r = unit_make_transient(u); + if (r < 0) + return r; + + /* Set our properties */ + r = bus_unit_set_properties(u, message, UNIT_RUNTIME, false, error); + if (r < 0) + return r; + + /* If the client asked for it, automatically add a reference to this unit. 
*/ + if (u->bus_track_add) { + r = bus_unit_track_add_sender(u, message); + if (r < 0) + return log_error_errno(r, "Failed to watch sender: %m"); + } + + /* Now load the missing bits of the unit we just created */ + unit_add_to_load_queue(u); + manager_dispatch_load_queue(m); + + *unit = u; + + return 0; +} + +static int transient_aux_units_from_message( + Manager *m, + sd_bus_message *message, + sd_bus_error *error) { + + int r; + + assert(m); + assert(message); + + r = sd_bus_message_enter_container(message, 'a', "(sa(sv))"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, 'r', "sa(sv)")) > 0) { + const char *name = NULL; + Unit *u; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = transient_unit_from_message(m, message, name, &u, error); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 0; +} + +static int method_start_transient_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *name, *smode; + Manager *m = userdata; + JobMode mode; + Unit *u; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ss", &name, &smode); + if (r < 0) + return r; + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s is invalid.", smode); + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = transient_unit_from_message(m, message, name, &u, error); + if (r < 0) + return r; + + r = transient_aux_units_from_message(m, message, error); + if (r < 0) + return r; + + /* Finally, start it */ + return 
bus_unit_queue_job(message, u, JOB_START, mode, 0, error); +} + +static int method_get_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + uint32_t id; + Job *j; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + r = mac_selinux_unit_access_check(j->unit, message, "status", error); + if (r < 0) + return r; + + path = job_dbus_path(j); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + +static int method_cancel_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint32_t id; + Job *j; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + return bus_job_method_cancel(message, j, error); +} + +static int method_clear_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + manager_clear_jobs(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return 
r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + manager_reset_failed(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + const char *k; + Unit *u; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(u, k, m->units) { + if (k != u->id) + continue; + + if (!strv_isempty(states) && + !strv_contains(states, unit_load_state_to_string(u->load_state)) && + !strv_contains(states, unit_active_state_to_string(unit_active_state(u))) && + !strv_contains(states, unit_sub_state_to_string(u))) + continue; + + if (!strv_isempty(patterns) && + !strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE)) + continue; + + r = reply_unit_info(reply, u); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_units(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return list_units_filtered(message, userdata, error, NULL, NULL); +} + +static int method_list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + return list_units_filtered(message, userdata, error, states, NULL); +} + +static int 
method_list_units_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + return list_units_filtered(message, userdata, error, states, patterns); +} + +/* Bus method handler: replies with an array of (usssoo) entries, one per job in m->jobs, carrying the + * job id, the id of the job's unit, the job type, the job state, the job object path and the unit + * object path. */ +static int method_list_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + Job *j; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(usssoo)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(j, m->jobs) { + _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + + job_path = job_dbus_path(j); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(j->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append( + reply, "(usssoo)", + j->id, + j->unit->id, + job_type_to_string(j->type), + job_state_to_string(j->state), + job_path, + unit_path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_subscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + if (sd_bus_message_get_bus(message) == m->api_bus) { + + /* Note that direct bus connections are subscribed by + * default; we only track peers on the API bus here */ + + if (!m->subscribed) { + r = 
sd_bus_track_new(sd_bus_message_get_bus(message), &m->subscribed, NULL, NULL); + if (r < 0) + return r; + } + + r = sd_bus_track_add_sender(m->subscribed, message); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, BUS_ERROR_ALREADY_SUBSCRIBED, "Client is already subscribed."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unsubscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + if (sd_bus_message_get_bus(message) == m->api_bus) { + r = sd_bus_track_remove_sender(m->subscribed, message); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, BUS_ERROR_NOT_SUBSCRIBED, "Client is not subscribed."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int dump_impl(sd_bus_message *message, void *userdata, sd_bus_error *error, int (*reply)(sd_bus_message *, char *)) { + _cleanup_free_ char *dump = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = manager_get_dump_string(m, &dump); + if (r < 0) + return r; + + return reply(message, dump); +} + +static int reply_dump(sd_bus_message *message, char *dump) { + return sd_bus_reply_method_return(message, "s", dump); +} + +static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return dump_impl(message, userdata, error, reply_dump); +} + +static int reply_dump_by_fd(sd_bus_message *message, char *dump) { + _cleanup_close_ int fd = -1; + + fd = acquire_data_fd(dump, strlen(dump), 0); + if (fd < 0) + return fd; + + return sd_bus_reply_method_return(message, "h", fd); +} + +static int method_dump_by_fd(sd_bus_message *message, void *userdata, 
sd_bus_error *error) { + return dump_impl(message, userdata, error, reply_dump_by_fd); +} + +static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for snapshots has been removed."); +} + +static int verify_run_space(const char *message, sd_bus_error *error) { + struct statvfs svfs; + uint64_t available; + + if (statvfs("/run/systemd", &svfs) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m"); + + available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize; + + if (available < RELOAD_DISK_SPACE_MIN) { + char fb_available[FORMAT_BYTES_MAX], fb_need[FORMAT_BYTES_MAX]; + return sd_bus_error_setf(error, + BUS_ERROR_DISK_FULL, + "%s, not enough space available on /run/systemd. " + "Currently, %s are free, but a safety buffer of %s is enforced.", + message, + format_bytes(fb_available, sizeof(fb_available), available), + format_bytes(fb_need, sizeof(fb_need), RELOAD_DISK_SPACE_MIN)); + } + + return 0; +} + +int verify_run_space_and_log(const char *message) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + r = verify_run_space(message, &error); + if (r < 0) + return log_error_errno(r, "%s", bus_error_message(&error, r)); + + return 0; +} + +static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = verify_run_space("Refusing to reload", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_reload_daemon_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* Instead of sending the reply back right away, we just + * remember that we need to and then send it after the reload + * is 
finished. That way the caller knows when the reload + * finished. */ + + assert(!m->pending_reload_message); + r = sd_bus_message_new_method_return(message, &m->pending_reload_message); + if (r < 0) + return r; + + m->objective = MANAGER_RELOAD; + + return 1; +} + +static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = verify_run_space("Refusing to reexecute", error); + if (r < 0) + return r; + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_reload_daemon_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* We don't send a reply back here, the client should + * just wait for us disconnecting. */ + + m->objective = MANAGER_REEXECUTE; + return 1; +} + +static int method_exit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + /* Exit() (in contrast to SetExitCode()) is actually allowed even if + * we are running on the host. It will fall back on reboot() in + * systemd-shutdown if it cannot do the exit() because it isn't a + * container. 
*/ + + m->objective = MANAGER_EXIT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Reboot is only supported for system managers."); + + m->objective = MANAGER_REBOOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Powering off is only supported for system managers."); + + m->objective = MANAGER_POWEROFF; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "halt", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Halt is only supported for system managers."); + + m->objective = MANAGER_HALT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_kexec(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "KExec is only supported for system managers."); + + m->objective = MANAGER_KEXEC; + + return sd_bus_reply_method_return(message, 
NULL); +} + +static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *ri = NULL, *rt = NULL; + const char *root, *init; + Manager *m = userdata; + struct statvfs svfs; + uint64_t available; + int r; + + assert(message); + assert(m); + + if (statvfs("/run/systemd", &svfs) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m"); + + available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize; + + if (available < RELOAD_DISK_SPACE_MIN) { + char fb_available[FORMAT_BYTES_MAX], fb_need[FORMAT_BYTES_MAX]; + log_warning("Dangerously low amount of free space on /run/systemd, root switching operation might not complete successfully. " + "Currently, %s are free, but %s are suggested. Proceeding anyway.", + format_bytes(fb_available, sizeof(fb_available), available), + format_bytes(fb_need, sizeof(fb_need), RELOAD_DISK_SPACE_MIN)); + } + + r = mac_selinux_access_check(message, "reboot", error); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Root switching is only supported by system manager."); + + r = sd_bus_message_read(message, "ss", &root, &init); + if (r < 0) + return r; + + if (isempty(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "New root directory may not be the empty string."); + if (!path_is_absolute(root)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "New root path '%s' is not absolute.", root); + if (path_equal(root, "/")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "New root directory cannot be the old root directory."); + + /* Safety check */ + if (isempty(init)) { + r = path_is_os_tree(root); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether root path '%s' contains an OS tree: %m", root); + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Specified switch root path '%s' does not 
seem to be an OS tree. os-release file is missing.", root); + } else { + _cleanup_free_ char *chased = NULL; + + if (!path_is_absolute(init)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path to init binary '%s' not absolute.", init); + + r = chase_symlinks(init, root, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &chased, NULL); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Could not resolve init executable %s: %m", init); + + if (laccess(chased, X_OK) < 0) { + if (errno == EACCES) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Init binary %s is not executable.", init); + + return sd_bus_error_set_errnof(error, r, "Could not check whether init binary %s is executable: %m", init); + } + } + + rt = strdup(root); + if (!rt) + return -ENOMEM; + + if (!isempty(init)) { + ri = strdup(init); + if (!ri) + return -ENOMEM; + } + + free_and_replace(m->switch_root, rt); + free_and_replace(m->switch_root_init, ri); + + m->objective = MANAGER_SWITCH_ROOT; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **plus = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &plus); + if (r < 0) + return r; + if (!strv_env_is_valid(plus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, NULL, plus); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unset_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + 
_cleanup_strv_free_ char **minus = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &minus); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(minus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment variable names or assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, minus, NULL); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_unset_and_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **minus = NULL, **plus = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "reload", error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &minus); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &plus); + if (r < 0) + return r; + + if (!strv_env_name_or_assignment_is_valid(minus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment variable names or assignments"); + if (!strv_env_is_valid(plus)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments"); + + r = bus_verify_set_environment_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = manager_client_environment_modify(m, minus, plus); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_exit_code(sd_bus_message *message, void *userdata, sd_bus_error *error) 
{ + Manager *m = userdata; + uint8_t code; + int r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "exit", error); + if (r < 0) + return r; + + r = sd_bus_message_read_basic(message, 'y', &code); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "ExitCode can only be set for user service managers or in containers."); + + m->return_value = code; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_lookup_dynamic_user_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + uid_t uid; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_basic(message, 's', &name); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Dynamic users are only supported in the system instance."); + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User name invalid: %s", name); + + r = dynamic_user_lookup_name(m, name, &uid); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER, "Dynamic user %s does not exist.", name); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "u", (uint32_t) uid); +} + +static int method_lookup_dynamic_user_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *name = NULL; + Manager *m = userdata; + uid_t uid; + int r; + + assert(message); + assert(m); + + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + r = sd_bus_message_read_basic(message, 'u', &uid); + if (r < 0) + return r; + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Dynamic users are only supported in the system instance."); + if (!uid_is_valid(uid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User 
ID invalid: " UID_FMT, uid); + + r = dynamic_user_lookup_uid(m, uid, &name); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER, "Dynamic user ID " UID_FMT " does not exist.", uid); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", name); +} + +static int method_get_dynamic_users(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + DynamicUser *d; + int r; + + assert(message); + assert(m); + + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + + if (!MANAGER_IS_SYSTEM(m)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Dynamic users are only supported in the system instance."); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(us)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(d, m->dynamic_users) { + uid_t uid; + + r = dynamic_user_current(d, &uid); + if (r == -EAGAIN) /* not realized yet? 
*/ + continue; + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Failed to look up a dynamic user."); + + r = sd_bus_message_append(reply, "(us)", uid, d->name); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + UnitFileList *item; + Hashmap *h; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + h = hashmap_new(&string_hash_ops); + if (!h) + return -ENOMEM; + + r = unit_file_get_list(m->unit_file_scope, NULL, h, states, patterns); + if (r < 0) + goto fail; + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + goto fail; + + HASHMAP_FOREACH(item, h) { + + r = sd_bus_message_append(reply, "(ss)", item->path, unit_file_state_to_string(item->state)); + if (r < 0) + goto fail; + } + + unit_file_list_free(h); + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); + +fail: + unit_file_list_free(h); + return r; +} + +static int method_list_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return list_unit_files_by_patterns(message, userdata, error, NULL, NULL); +} + +static int method_list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **states = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = sd_bus_message_read_strv(message, &states); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &patterns); + if (r < 0) + return r; + + 
return list_unit_files_by_patterns(message, userdata, error, states, patterns); +} + +static int method_get_unit_file_state(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + UnitFileState state; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = unit_file_get_state(m->unit_file_scope, NULL, name, &state); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", unit_file_state_to_string(state)); +} + +static int method_get_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *default_target = NULL; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = mac_selinux_access_check(message, "status", error); + if (r < 0) + return r; + + r = unit_file_get_default(m->unit_file_scope, NULL, &default_target); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", default_target); +} + +static int send_unit_files_changed(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "UnitFilesChanged"); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +/* Create an error reply, using the error information from changes[] + * if possible, and fall back to generating an error from error code c. + * The error message only describes the first error. + * + * Coordinate with unit_file_dump_changes() in install.c. 
+ */ +static int install_error( + sd_bus_error *error, + int c, + UnitFileChange *changes, + size_t n_changes) { + + size_t i; + int r; + + for (i = 0; i < n_changes; i++) + + switch(changes[i].type) { + + case 0 ... INT_MAX: + continue; + + case -EEXIST: + if (changes[i].source) + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "File %s already exists and is a symlink to %s.", + changes[i].path, changes[i].source); + else + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "File %s already exists.", + changes[i].path); + goto found; + + case -ERFKILL: + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, + "Unit file %s is masked.", changes[i].path); + goto found; + + case -EADDRNOTAVAIL: + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED, + "Unit %s is transient or generated.", changes[i].path); + goto found; + + case -EUCLEAN: + r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, + "\"%s\" is not a valid unit name.", + changes[i].path); + goto found; + + case -ELOOP: + r = sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED, + "Refusing to operate on alias name or linked unit file: %s", + changes[i].path); + goto found; + + case -ENOENT: + r = sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, + "Unit file %s does not exist.", changes[i].path); + goto found; + + default: + r = sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path); + goto found; + } + + r = c < 0 ? 
c : -EINVAL; + + found: + unit_file_changes_free(changes, n_changes); + return r; +} + +static int reply_unit_file_changes_and_free( + Manager *m, + sd_bus_message *message, + int carries_install_info, + UnitFileChange *changes, + size_t n_changes, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + bool bad = false, good = false; + size_t i; + int r; + + if (unit_file_changes_have_modification(changes, n_changes)) { + r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m"); + } + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + goto fail; + + if (carries_install_info >= 0) { + r = sd_bus_message_append(reply, "b", carries_install_info); + if (r < 0) + goto fail; + } + + r = sd_bus_message_open_container(reply, 'a', "(sss)"); + if (r < 0) + goto fail; + + for (i = 0; i < n_changes; i++) { + + if (changes[i].type < 0) { + bad = true; + continue; + } + + r = sd_bus_message_append( + reply, "(sss)", + unit_file_change_type_to_string(changes[i].type), + changes[i].path, + changes[i].source); + if (r < 0) + goto fail; + + good = true; + } + + /* If there was a failed change, and no successful change, then return the first failure as proper method call + * error. 
*/ + if (bad && !good) + return install_error(error, 0, changes, n_changes); + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto fail; + + unit_file_changes_free(changes, n_changes); + return sd_bus_send(NULL, reply, NULL); + +fail: + unit_file_changes_free(changes, n_changes); + return r; +} + +static int method_enable_unit_files_generic( + sd_bus_message *message, + Manager *m, + int (*call)(UnitFileScope scope, UnitFileFlags flags, const char *root_dir, char *files[], UnitFileChange **changes, size_t *n_changes), + bool carries_install_info, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + UnitFileFlags flags; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "EnableUnitFilesWithFlags")) { + uint64_t raw_flags; + + r = sd_bus_message_read(message, "t", &raw_flags); + if (r < 0) + return r; + if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0) + return -EINVAL; + flags = raw_flags; + } else { + int runtime, force; + + r = sd_bus_message_read(message, "bb", &runtime, &force); + if (r < 0) + return r; + flags = unit_file_bools_to_flags(runtime, force); + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = call(m->unit_file_scope, flags, NULL, l, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, carries_install_info ? 
r : -1, changes, n_changes, error); +} + +static int method_enable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_enable, true, error); +} + +static int method_enable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_enable, true, error); +} + +static int method_reenable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_reenable, true, error); +} + +static int method_link_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_link, false, error); +} + +static int unit_file_preset_without_mode(UnitFileScope scope, UnitFileFlags flags, const char *root_dir, char **files, UnitFileChange **changes, size_t *n_changes) { + return unit_file_preset(scope, flags, root_dir, files, UNIT_FILE_PRESET_FULL, changes, n_changes); +} + +static int method_preset_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_preset_without_mode, true, error); +} + +static int method_mask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_enable_unit_files_generic(message, userdata, unit_file_mask, false, error); +} + +static int method_preset_unit_files_with_mode(sd_bus_message *message, void *userdata, sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + UnitFilePresetMode mm; + int runtime, force, r; + UnitFileFlags flags; + const char *mode; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = 
sd_bus_message_read(message, "sbb", &mode, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + if (isempty(mode)) + mm = UNIT_FILE_PRESET_FULL; + else { + mm = unit_file_preset_mode_from_string(mode); + if (mm < 0) + return -EINVAL; + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_preset(m->unit_file_scope, flags, NULL, l, mm, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, r, changes, n_changes, error); +} + +static int method_disable_unit_files_generic( + sd_bus_message *message, + Manager *m, + int (*call)(UnitFileScope scope, UnitFileFlags flags, const char *root_dir, char *files[], UnitFileChange **changes, size_t *n_changes), + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + UnitFileFlags flags; + size_t n_changes = 0; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlags")) { + uint64_t raw_flags; + + r = sd_bus_message_read(message, "t", &raw_flags); + if (r < 0) + return r; + if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0 || + FLAGS_SET(raw_flags, UNIT_FILE_FORCE)) + return -EINVAL; + flags = raw_flags; + } else { + int runtime; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + flags = unit_file_bools_to_flags(runtime, false); + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = call(m->unit_file_scope, flags, NULL, l, &changes, &n_changes); + if 
(r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_disable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_disable, error); +} + +static int method_disable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_disable, error); +} + +static int method_unmask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_disable_unit_files_generic(message, userdata, unit_file_unmask, error); +} + +static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_revert(m->unit_file_scope, NULL, l, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_set_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) { + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + const char *name; + int force, r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "enable", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sb", &name, &force); + if (r < 0) + return r; + + r = 
bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_set_default(m->unit_file_scope, force ? UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + UnitFileChange *changes = NULL; + size_t n_changes = 0; + Manager *m = userdata; + UnitFilePresetMode mm; + const char *mode; + UnitFileFlags flags; + int force, runtime, r; + + assert(message); + assert(m); + + r = mac_selinux_access_check(message, "enable", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + if (isempty(mode)) + mm = UNIT_FILE_PRESET_FULL; + else { + mm = unit_file_preset_mode_from_string(mode); + if (mm < 0) + return -EINVAL; + } + + r = bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_file_preset_all(m->unit_file_scope, flags, NULL, mm, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_add_dependency_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + UnitFileChange *changes = NULL; + size_t n_changes = 0; + int runtime, force, r; + char *target, *type; + UnitDependency dep; + UnitFileFlags flags; + + assert(message); + assert(m); + + r = 
bus_verify_manage_unit_files_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "ssbb", &target, &type, &runtime, &force); + if (r < 0) + return r; + + flags = unit_file_bools_to_flags(runtime, force); + + dep = unit_dependency_from_string(type); + if (dep < 0) + return -EINVAL; + + r = unit_file_add_dependency(m->unit_file_scope, flags, NULL, l, target, dep, &changes, &n_changes); + if (r < 0) + return install_error(error, r, changes, n_changes); + + return reply_unit_file_changes_and_free(m, message, -1, changes, n_changes, error); +} + +static int method_get_unit_file_links(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + UnitFileChange *changes = NULL; + size_t n_changes = 0, i; + UnitFileFlags flags; + const char *name; + char **p; + int runtime, r; + + r = sd_bus_message_read(message, "sb", &name, &runtime); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, SD_BUS_TYPE_ARRAY, "s"); + if (r < 0) + return r; + + p = STRV_MAKE(name); + flags = UNIT_FILE_DRY_RUN | + (runtime ? 
UNIT_FILE_RUNTIME : 0); + + r = unit_file_disable(UNIT_FILE_SYSTEM, flags, NULL, p, &changes, &n_changes); + if (r < 0) + return log_error_errno(r, "Failed to get file links for %s: %m", name); + + for (i = 0; i < n_changes; i++) + if (changes[i].type == UNIT_FILE_UNLINK) { + r = sd_bus_message_append(reply, "s", changes[i].path); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_job_waiting(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint32_t id; + Job *j; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "u", &id); + if (r < 0) + return r; + + j = manager_get_job(m, id); + if (!j) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id); + + return bus_job_method_get_waiting_jobs(message, j, error); +} + +static int method_abandon_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + if (u->type != UNIT_SCOPE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit '%s' is not a scope unit, refusing.", name); + + return bus_scope_method_abandon(message, u, error); +} + +static int method_set_show_status(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + ShowStatus mode = _SHOW_STATUS_INVALID; + const char *t; + int r; + + assert(m); + assert(message); + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + + if (!isempty(t)) { + mode = show_status_from_string(t); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid show status '%s'", t); + } + + 
manager_override_show_status(m, mode, "bus"); + + return sd_bus_reply_method_return(message, NULL); +} + +const sd_bus_vtable bus_manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Version", "s", property_get_version, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Features", "s", property_get_features, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Virtualization", "s", property_get_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Architecture", "s", property_get_architecture, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Tainted", "s", property_get_tainted, 0, SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("FirmwareTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FIRMWARE]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("LoaderTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_LOADER]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("KernelTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_KERNEL]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), + 
BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0), + SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0), + SD_BUS_PROPERTY("NNames", "u", property_get_hashmap_size, offsetof(Manager, units), 0), + SD_BUS_PROPERTY("NFailedUnits", "u", property_get_set_size, offsetof(Manager, failed_units), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0), + 
SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0), + SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0), + SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0), + SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0), + SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ShowStatus", "b", property_get_show_status, 0, 0), + SD_BUS_PROPERTY("UnitPath", "as", NULL, offsetof(Manager, lookup_paths.search_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStandardOutput", "s", bus_property_get_exec_output, offsetof(Manager, default_std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStandardError", "s", bus_property_get_exec_output, offsetof(Manager, default_std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogUSec", "t", property_get_runtime_watchdog, property_set_runtime_watchdog, 0, 0), + SD_BUS_WRITABLE_PROPERTY("RebootWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, 0), + /* The following item is an obsolete alias */ + SD_BUS_WRITABLE_PROPERTY("ShutdownWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_WRITABLE_PROPERTY("KExecWatchdogUSec", "t", property_get_kexec_watchdog, property_set_kexec_watchdog, 0, 0), + SD_BUS_WRITABLE_PROPERTY("ServiceWatchdogs", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, service_watchdogs), 0), + SD_BUS_PROPERTY("ControlGroup", "s", NULL, offsetof(Manager, cgroup_root), 0), + SD_BUS_PROPERTY("SystemState", "s", property_get_system_state, 0, 0), + SD_BUS_PROPERTY("ExitCode", "y", bus_property_get_unsigned, offsetof(Manager, return_value), 0), + SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, 
default_timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTimeoutAbortUSec", "t", property_get_default_timeout_abort_usec, 0, 0), + SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, default_restart_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST), + /* The following two items are obsolete alias */ + SD_BUS_PROPERTY("DefaultStartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("DefaultStartLimitInterval", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("DefaultStartLimitBurst", "u", bus_property_get_unsigned, offsetof(Manager, default_start_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultCPUAccounting", "b", bus_property_get_bool, offsetof(Manager, default_cpu_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultBlockIOAccounting", "b", bus_property_get_bool, offsetof(Manager, default_blockio_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultMemoryAccounting", "b", bus_property_get_bool, offsetof(Manager, default_memory_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTasksAccounting", "b", bus_property_get_bool, offsetof(Manager, default_tasks_accounting), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCPU", "t", bus_property_get_rlimit, offsetof(Manager, 
rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCPUSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitFSIZE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitDATA", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitDATASoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSTACK", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCORE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitCORESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRSS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRSSSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNOFILE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitAS", "t", bus_property_get_rlimit, offsetof(Manager, 
rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitASSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNPROC", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitLOCKS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNICE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitNICESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTPRIO", "t", 
bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, default_tasks_max), 0), + SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), + + SD_BUS_METHOD_WITH_NAMES("GetUnit", + "s", + SD_BUS_PARAM(name), + "o", + SD_BUS_PARAM(unit), + method_get_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetUnitByPID", + "u", + SD_BUS_PARAM(pid), + "o", + SD_BUS_PARAM(unit), + method_get_unit_by_pid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetUnitByInvocationID", + "ay", + SD_BUS_PARAM(invocation_id), + "o", + SD_BUS_PARAM(unit), + method_get_unit_by_invocation_id, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetUnitByControlGroup", + "s", + SD_BUS_PARAM(cgroup), + "o", + SD_BUS_PARAM(unit), + method_get_unit_by_control_group, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("LoadUnit", + "s", + SD_BUS_PARAM(name), + "o", + SD_BUS_PARAM(unit), + method_load_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("StartUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_start_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("StartUnitReplace", + "sss", + SD_BUS_PARAM(old_unit) + SD_BUS_PARAM(new_unit) + 
SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_start_unit_replace, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("StopUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_stop_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ReloadUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_reload_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("RestartUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("TryRestartUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_try_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ReloadOrRestartUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_reload_or_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ReloadOrTryRestartUnit", + "ss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_reload_or_try_restart_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("EnqueueUnitJob", + "sss", + SD_BUS_PARAM(name) + SD_BUS_PARAM(job_type) + SD_BUS_PARAM(job_mode), + "uososa(uosos)", + SD_BUS_PARAM(job_id) + SD_BUS_PARAM(job_path) + SD_BUS_PARAM(unit_id) + SD_BUS_PARAM(unit_path) + SD_BUS_PARAM(job_type) + SD_BUS_PARAM(affected_jobs), + method_enqueue_unit_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("KillUnit", + "ssi", + SD_BUS_PARAM(name) + SD_BUS_PARAM(whom) + SD_BUS_PARAM(signal), + NULL,, + method_kill_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("CleanUnit", + "sas", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mask), + NULL,, + method_clean_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("FreezeUnit", + "s", + SD_BUS_PARAM(name), + NULL,, + method_freeze_unit, + 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ThawUnit", + "s", + SD_BUS_PARAM(name), + NULL,, + method_thaw_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ResetFailedUnit", + "s", + SD_BUS_PARAM(name), + NULL,, + method_reset_failed_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("SetUnitProperties", + "sba(sv)", + SD_BUS_PARAM(name) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(properties), + NULL,, + method_set_unit_properties, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("RefUnit", + "s", + SD_BUS_PARAM(name), + NULL,, + method_ref_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("UnrefUnit", + "s", + SD_BUS_PARAM(name), + NULL,, + method_unref_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("StartTransientUnit", + "ssa(sv)a(sa(sv))", + SD_BUS_PARAM(name) + SD_BUS_PARAM(mode) + SD_BUS_PARAM(properties) + SD_BUS_PARAM(aux), + "o", + SD_BUS_PARAM(job), + method_start_transient_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetUnitProcesses", + "s", + SD_BUS_PARAM(name), + "a(sus)", + SD_BUS_PARAM(processes), + method_get_unit_processes, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("AttachProcessesToUnit", + "ssau", + SD_BUS_PARAM(unit_name) + SD_BUS_PARAM(subcgroup) + SD_BUS_PARAM(pids), + NULL,, + method_attach_processes_to_unit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("AbandonScope", + "s", + SD_BUS_PARAM(name), + NULL,, + method_abandon_scope, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetJob", + "u", + SD_BUS_PARAM(id), + "o", + SD_BUS_PARAM(job), + method_get_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetJobAfter", + "u", + SD_BUS_PARAM(id), + "a(usssoo)", + SD_BUS_PARAM(jobs), + method_get_job_waiting, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetJobBefore", + "u", + SD_BUS_PARAM(id), + "a(usssoo)", + SD_BUS_PARAM(jobs), + method_get_job_waiting, + 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("CancelJob", + "u", + SD_BUS_PARAM(id), + NULL,, + method_cancel_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ClearJobs", + NULL, + NULL, + method_clear_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailed", + NULL, + NULL, + method_reset_failed, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("SetShowStatus", + "s", + SD_BUS_PARAM(mode), + NULL,, + method_set_show_status, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_ADMIN)), + SD_BUS_METHOD_WITH_NAMES("ListUnits", + NULL,, + "a(ssssssouso)", + SD_BUS_PARAM(units), + method_list_units, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ListUnitsFiltered", + "as", + SD_BUS_PARAM(states), + "a(ssssssouso)", + SD_BUS_PARAM(units), + method_list_units_filtered, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ListUnitsByPatterns", + "asas", + SD_BUS_PARAM(states) + SD_BUS_PARAM(patterns), + "a(ssssssouso)", + SD_BUS_PARAM(units), + method_list_units_by_patterns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ListUnitsByNames", + "as", + SD_BUS_PARAM(names), + "a(ssssssouso)", + SD_BUS_PARAM(units), + method_list_units_by_names, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ListJobs", + NULL,, + "a(usssoo)", + SD_BUS_PARAM(jobs), + method_list_jobs, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Subscribe", + NULL, + NULL, + method_subscribe, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unsubscribe", + NULL, + NULL, + method_unsubscribe, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("Dump", + NULL,, + "s", + SD_BUS_PARAM(output), + method_dump, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("DumpByFileDescriptor", + NULL,, + "h", + SD_BUS_PARAM(fd), + method_dump_by_fd, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("CreateSnapshot", + "sb", + SD_BUS_PARAM(name) + SD_BUS_PARAM(cleanup), + "o", + SD_BUS_PARAM(unit), + method_refuse_snapshot, + 
SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_METHOD_WITH_NAMES("RemoveSnapshot", + "s", + SD_BUS_PARAM(name), + NULL,, + method_refuse_snapshot, + SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_METHOD("Reload", + NULL, + NULL, + method_reload, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Reexecute", + NULL, + NULL, + method_reexecute, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Exit", + NULL, + NULL, + method_exit, + 0), + SD_BUS_METHOD("Reboot", + NULL, + NULL, + method_reboot, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("PowerOff", + NULL, + NULL, + method_poweroff, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("Halt", + NULL, + NULL, + method_halt, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD("KExec", + NULL, + NULL, + method_kexec, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD_WITH_NAMES("SwitchRoot", + "ss", + SD_BUS_PARAM(new_root) + SD_BUS_PARAM(init), + NULL,, + method_switch_root, + SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)), + SD_BUS_METHOD_WITH_NAMES("SetEnvironment", + "as", + SD_BUS_PARAM(assignments), + NULL,, + method_set_environment, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("UnsetEnvironment", + "as", + SD_BUS_PARAM(names), + NULL,, + method_unset_environment, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("UnsetAndSetEnvironment", + "asas", + SD_BUS_PARAM(names) + SD_BUS_PARAM(assignments), + NULL,, + method_unset_and_set_environment, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ListUnitFiles", + NULL,, + "a(ss)", + SD_BUS_PARAM(unit_files), + method_list_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ListUnitFilesByPatterns", + "asas", + SD_BUS_PARAM(states) + SD_BUS_PARAM(patterns), + "a(ss)", + SD_BUS_PARAM(unit_files), + method_list_unit_files_by_patterns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetUnitFileState", + "s", + SD_BUS_PARAM(file), + "s", + SD_BUS_PARAM(state), + 
method_get_unit_file_state, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("EnableUnitFiles", + "asbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "ba(sss)", + SD_BUS_PARAM(carries_install_info) + SD_BUS_PARAM(changes), + method_enable_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("DisableUnitFiles", + "asb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime), + "a(sss)", + SD_BUS_PARAM(changes), + method_disable_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("EnableUnitFilesWithFlags", + "ast", + SD_BUS_PARAM(files) + SD_BUS_PARAM(flags), + "ba(sss)", + SD_BUS_PARAM(carries_install_info) + SD_BUS_PARAM(changes), + method_enable_unit_files_with_flags, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("DisableUnitFilesWithFlags", + "ast", + SD_BUS_PARAM(files) + SD_BUS_PARAM(flags), + "a(sss)", + SD_BUS_PARAM(changes), + method_disable_unit_files_with_flags, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ReenableUnitFiles", + "asbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "ba(sss)", + SD_BUS_PARAM(carries_install_info) + SD_BUS_PARAM(changes), + method_reenable_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("LinkUnitFiles", + "asbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "a(sss)", + SD_BUS_PARAM(changes), + method_link_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("PresetUnitFiles", + "asbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "ba(sss)", + SD_BUS_PARAM(carries_install_info) + SD_BUS_PARAM(changes), + method_preset_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("PresetUnitFilesWithMode", + "assbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(mode) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "ba(sss)", + SD_BUS_PARAM(carries_install_info) + SD_BUS_PARAM(changes), + method_preset_unit_files_with_mode, + 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("MaskUnitFiles", + "asbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "a(sss)", + SD_BUS_PARAM(changes), + method_mask_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("UnmaskUnitFiles", + "asb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(runtime), + "a(sss)", + SD_BUS_PARAM(changes), + method_unmask_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("RevertUnitFiles", + "as", + SD_BUS_PARAM(files), + "a(sss)", + SD_BUS_PARAM(changes), + method_revert_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("SetDefaultTarget", + "sb", + SD_BUS_PARAM(name) + SD_BUS_PARAM(force), + "a(sss)", + SD_BUS_PARAM(changes), + method_set_default_target, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetDefaultTarget", + NULL,, + "s", + SD_BUS_PARAM(name), + method_get_default_target, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("PresetAllUnitFiles", + "sbb", + SD_BUS_PARAM(mode) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "a(sss)", + SD_BUS_PARAM(changes), + method_preset_all_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("AddDependencyUnitFiles", + "asssbb", + SD_BUS_PARAM(files) + SD_BUS_PARAM(target) + SD_BUS_PARAM(type) + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(force), + "a(sss)", + SD_BUS_PARAM(changes), + method_add_dependency_unit_files, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetUnitFileLinks", + "sb", + SD_BUS_PARAM(name) + SD_BUS_PARAM(runtime), + "as", + SD_BUS_PARAM(links), + method_get_unit_file_links, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("SetExitCode", + "y", + SD_BUS_PARAM(number), + NULL,, + method_set_exit_code, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("LookupDynamicUserByName", + "s", + SD_BUS_PARAM(name), + "u", + SD_BUS_PARAM(uid), + method_lookup_dynamic_user_by_name, + SD_BUS_VTABLE_UNPRIVILEGED), + 
SD_BUS_METHOD_WITH_NAMES("LookupDynamicUserByUID", + "u", + SD_BUS_PARAM(uid), + "s", + SD_BUS_PARAM(name), + method_lookup_dynamic_user_by_uid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("GetDynamicUsers", + NULL,, + "a(us)", + SD_BUS_PARAM(users), + method_get_dynamic_users, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_SIGNAL_WITH_NAMES("UnitNew", + "so", + SD_BUS_PARAM(id) + SD_BUS_PARAM(unit), + 0), + SD_BUS_SIGNAL_WITH_NAMES("UnitRemoved", + "so", + SD_BUS_PARAM(id) + SD_BUS_PARAM(unit), + 0), + SD_BUS_SIGNAL_WITH_NAMES("JobNew", + "uos", + SD_BUS_PARAM(id) + SD_BUS_PARAM(job) + SD_BUS_PARAM(unit), + 0), + SD_BUS_SIGNAL_WITH_NAMES("JobRemoved", + "uoss", + SD_BUS_PARAM(id) + SD_BUS_PARAM(job) + SD_BUS_PARAM(unit) + SD_BUS_PARAM(result), + 0), + SD_BUS_SIGNAL_WITH_NAMES("StartupFinished", + "tttttt", + SD_BUS_PARAM(firmware) + SD_BUS_PARAM(loader) + SD_BUS_PARAM(kernel) + SD_BUS_PARAM(initrd) + SD_BUS_PARAM(userspace) + SD_BUS_PARAM(total), + 0), + SD_BUS_SIGNAL("UnitFilesChanged", NULL, 0), + SD_BUS_SIGNAL_WITH_NAMES("Reloading", + "b", + SD_BUS_PARAM(active), + 0), + + SD_BUS_VTABLE_END +}; + +const sd_bus_vtable bus_manager_log_control_vtable[] = { + SD_BUS_VTABLE_START(0), + + /* We define a private version of this interface here, since we want slightly different + * implementations for the setters. We'll still use the generic getters however, and we share the + * setters with the implementations for the Manager interface above (which pre-dates the generic + * service API interface). 
*/ + + SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0), + SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0), + SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0), + + SD_BUS_VTABLE_END, +}; + +static int send_finished(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + usec_t *times = userdata; + int r; + + assert(bus); + assert(times); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "StartupFinished"); + if (r < 0) + return r; + + r = sd_bus_message_append(message, "tttttt", times[0], times[1], times[2], times[3], times[4], times[5]); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +void bus_manager_send_finished( + Manager *m, + usec_t firmware_usec, + usec_t loader_usec, + usec_t kernel_usec, + usec_t initrd_usec, + usec_t userspace_usec, + usec_t total_usec) { + + int r; + + assert(m); + + r = bus_foreach_bus( + m, + NULL, + send_finished, + (usec_t[6]) { + firmware_usec, + loader_usec, + kernel_usec, + initrd_usec, + userspace_usec, + total_usec + }); + if (r < 0) + log_debug_errno(r, "Failed to send finished signal: %m"); +} + +static int send_reloading(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL; + int r; + + assert(bus); + + r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "Reloading"); + if (r < 0) + return r; + + r = sd_bus_message_append(message, "b", PTR_TO_INT(userdata)); + if (r < 0) + return r; + + return sd_bus_send(bus, message, NULL); +} + +void bus_manager_send_reloading(Manager *m, bool active) { + int r; + + assert(m); + + r = bus_foreach_bus(m, NULL, send_reloading, INT_TO_PTR(active)); + if (r < 0) + log_debug_errno(r, "Failed to send reloading signal: %m"); 
+} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + assert(bus); + + return sd_bus_emit_properties_changed_strv(bus, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + NULL); +} + +void bus_manager_send_change_signal(Manager *m) { + int r; + + assert(m); + + r = bus_foreach_bus(m, NULL, send_changed_signal, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send manager change signal: %m"); +} diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h new file mode 100644 index 0000000..f3862fc --- /dev/null +++ b/src/core/dbus-manager.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus-vtable.h" + +#include "manager.h" + +extern const sd_bus_vtable bus_manager_vtable[]; +extern const sd_bus_vtable bus_manager_log_control_vtable[]; + +void bus_manager_send_finished(Manager *m, usec_t firmware_usec, usec_t loader_usec, usec_t kernel_usec, usec_t initrd_usec, usec_t userspace_usec, usec_t total_usec); +void bus_manager_send_reloading(Manager *m, bool active); +void bus_manager_send_change_signal(Manager *m); + +int verify_run_space_and_log(const char *message); + +int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c new file mode 100644 index 0000000..73702b1 --- /dev/null +++ b/src/core/dbus-mount.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-mount.h" +#include "dbus-util.h" +#include "mount.h" +#include "string-util.h" +#include "unit.h" + +static const char *mount_get_what(const Mount *m) { + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.what) + return m->parameters_proc_self_mountinfo.what; + if (m->from_fragment && 
m->parameters_fragment.what) + return m->parameters_fragment.what; + return NULL; +} + +static const char *mount_get_options(const Mount *m) { + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.options) + return m->parameters_proc_self_mountinfo.options; + if (m->from_fragment && m->parameters_fragment.options) + return m->parameters_fragment.options; + return NULL; +} + +static const char *mount_get_fstype(const Mount *m) { + if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.fstype) + return m->parameters_proc_self_mountinfo.fstype; + else if (m->from_fragment && m->parameters_fragment.fstype) + return m->parameters_fragment.fstype; + return NULL; +} + +static BUS_DEFINE_PROPERTY_GET(property_get_what, "s", Mount, mount_get_what); +static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Mount, mount_get_options); +static BUS_DEFINE_PROPERTY_GET(property_get_type, "s", Mount, mount_get_fstype); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, mount_result, MountResult); + +const sd_bus_vtable bus_mount_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST), + 
SD_BUS_PROPERTY("LazyUnmount", "b", bus_property_get_bool, offsetof(Mount, lazy_unmount), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ForceUnmount", "b", bus_property_get_bool, offsetof(Mount, force_unmount), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReadWriteOnly", "b", bus_property_get_bool, offsetof(Mount, read_write_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +static int bus_mount_set_transient_property( + Mount *m, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(m); + + assert(m); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Where")) + return bus_set_transient_path(u, name, &m->where, message, flags, error); + + if (streq(name, "What")) + return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error); + + if (streq(name, "Options")) + return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error); + + if (streq(name, "Type")) + return bus_set_transient_string(u, name, &m->parameters_fragment.fstype, message, flags, error); + + if (streq(name, "TimeoutUSec")) + return bus_set_transient_usec_fix_0(u, name, &m->timeout_usec, 
message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &m->directory_mode, message, flags, error); + + if (streq(name, "SloppyOptions")) + return bus_set_transient_bool(u, name, &m->sloppy_options, message, flags, error); + + if (streq(name, "LazyUnmount")) + return bus_set_transient_bool(u, name, &m->lazy_unmount, message, flags, error); + + if (streq(name, "ForceUnmount")) + return bus_set_transient_bool(u, name, &m->force_unmount, message, flags, error); + + if (streq(name, "ReadWriteOnly")) + return bus_set_transient_bool(u, name, &m->read_write_only, message, flags, error); + + return 0; +} + +int bus_mount_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &m->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_mount_set_transient_property(m, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &m->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &m->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_mount_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-mount.h b/src/core/dbus-mount.h new file mode 100644 index 0000000..5a848d3 --- /dev/null +++ b/src/core/dbus-mount.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_mount_vtable[]; + +int bus_mount_set_property(Unit *u, const char *name, sd_bus_message 
*message, UnitWriteFlags flags, sd_bus_error *error); +int bus_mount_commit_properties(Unit *u); diff --git a/src/core/dbus-path.c b/src/core/dbus-path.c new file mode 100644 index 0000000..14e77d7 --- /dev/null +++ b/src/core/dbus-path.c @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "dbus-path.h" +#include "dbus-util.h" +#include "list.h" +#include "path.h" +#include "path-util.h" +#include "string-util.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, path_result, PathResult); + +static int property_get_paths( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Path *p = userdata; + PathSpec *k; + int r; + + assert(bus); + assert(reply); + assert(p); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(spec, k, p->specs) { + r = sd_bus_message_append(reply, "(ss)", path_type_to_string(k->type), k->path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_path_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Paths", "a(ss)", property_get_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MakeDirectory", "b", bus_property_get_bool, offsetof(Path, make_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Path, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Path, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_VTABLE_END +}; + +static int bus_path_set_transient_property( + Path *p, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = 
UNIT(p); + int r; + + assert(p); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "MakeDirectory")) + return bus_set_transient_bool(u, name, &p->make_directory, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &p->directory_mode, message, flags, error); + + if (streq(name, "Paths")) { + const char *type_name, *path; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &type_name, &path)) > 0) { + PathType t; + + t = path_type_from_string(type_name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown path type: %s", type_name); + + if (isempty(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is empty", type_name); + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is not absolute: %s", type_name, path); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *k; + PathSpec *s; + + k = strdup(path); + if (!k) + return -ENOMEM; + + path_simplify(k, false); + + s = new0(PathSpec, 1); + if (!s) + return -ENOMEM; + + s->unit = u; + s->path = TAKE_PTR(k); + s->type = t; + s->inotify_fd = -1; + + LIST_PREPEND(spec, p->specs, s); + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", type_name, path); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + path_free_specs(p); + unit_write_settingf(u, flags, name, "PathExists="); + } + + return 1; + } + + return 0; +} + +int bus_path_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags mode, + sd_bus_error *error) { + + Path *p = PATH(u); + + assert(p); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) 
+ return bus_path_set_transient_property(p, name, message, mode, error); + + return 0; +} diff --git a/src/core/dbus-path.h b/src/core/dbus-path.h new file mode 100644 index 0000000..b5018b0 --- /dev/null +++ b/src/core/dbus-path.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_path_vtable[]; + +int bus_path_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c new file mode 100644 index 0000000..1bcb483 --- /dev/null +++ b/src/core/dbus-scope.c @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-kill.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "scope.h" +#include "selinux-access.h" +#include "unit.h" + +int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Scope *s = userdata; + int r; + + assert(message); + assert(s); + + r = mac_selinux_unit_access_check(UNIT(s), message, "stop", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(UNIT(s)->manager, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = scope_abandon(s); + if (r == -ESTALE) + return sd_bus_error_setf(error, BUS_ERROR_SCOPE_NOT_RUNNING, "Scope %s is not running, cannot abandon.", UNIT(s)->id); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult); + +const sd_bus_vtable bus_scope_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Controller", "s", NULL, 
offsetof(Scope, controller), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_SIGNAL("RequestStop", NULL, 0), + SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_method_abandon, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; + +static int bus_scope_set_transient_property( + Scope *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(s); + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "TimeoutStopUSec")) + return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error); + + if (streq(name, "RuntimeMaxUSec")) + return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error); + + if (streq(name, "PIDs")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + unsigned n = 0; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + + for (;;) { + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + if (!creds) { + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + } + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + r = unit_pid_attachable(u, pid, error); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_watch_pid(u, pid, false); + if (r < 0 && r != -EEXIST) + return r; + } + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (n <= 0) + return 
-EINVAL; + + return 1; + + } else if (streq(name, "Controller")) { + const char *controller; + + /* We can't support direct connections with this, as direct connections know no service or unique name + * concept, but the Controller field stores exactly that. */ + if (sd_bus_message_get_bus(message) != u->manager->api_bus) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Sorry, Controller= logic only supported via the bus."); + + r = sd_bus_message_read(message, "s", &controller); + if (r < 0) + return r; + + if (!isempty(controller) && !sd_bus_service_name_is_valid(controller)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Controller '%s' is not a valid bus name.", controller); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = free_and_strdup(&s->controller, empty_to_null(controller)); + if (r < 0) + return r; + } + + return 1; + } + + return 0; +} + +int bus_scope_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->load_state == UNIT_STUB) { + /* While we are created we still accept PIDs */ + + r = bus_scope_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_scope_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} + +int bus_scope_send_request_stop(Scope *s) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(s); + + if (!s->controller) + return 0; + + p = unit_dbus_path(UNIT(s)); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + UNIT(s)->manager->api_bus, + &m, + p, + 
"org.freedesktop.systemd1.Scope", + "RequestStop"); + if (r < 0) + return r; + + return sd_bus_send_to(UNIT(s)->manager->api_bus, m, s->controller, NULL); +} + +static int on_controller_gone(sd_bus_track *track, void *userdata) { + Scope *s = userdata; + + assert(track); + + if (s->controller) { + log_unit_debug(UNIT(s), "Controller %s disappeared from bus.", s->controller); + unit_add_to_dbus_queue(UNIT(s)); + s->controller = mfree(s->controller); + } + + s->controller_track = sd_bus_track_unref(s->controller_track); + + return 0; +} + +int bus_scope_track_controller(Scope *s) { + int r; + + assert(s); + + if (!s->controller || s->controller_track) + return 0; + + r = sd_bus_track_new(UNIT(s)->manager->api_bus, &s->controller_track, on_controller_gone, s); + if (r < 0) + return r; + + r = sd_bus_track_add_name(s->controller_track, s->controller); + if (r < 0) { + s->controller_track = sd_bus_track_unref(s->controller_track); + return r; + } + + return 0; +} diff --git a/src/core/dbus-scope.h b/src/core/dbus-scope.h new file mode 100644 index 0000000..8f1bc02 --- /dev/null +++ b/src/core/dbus-scope.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "scope.h" +#include "unit.h" + +extern const sd_bus_vtable bus_scope_vtable[]; + +int bus_scope_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_scope_commit_properties(Unit *u); + +int bus_scope_send_request_stop(Scope *s); + +int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_scope_track_controller(Scope *s); diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c new file mode 100644 index 0000000..64f9d4a --- /dev/null +++ b/src/core/dbus-service.c @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> + +#include "alloc-util.h" +#include "async.h" +#include 
"bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-kill.h" +#include "dbus-manager.h" +#include "dbus-service.h" +#include "dbus-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "parse-util.h" +#include "path-util.h" +#include "service.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, service_type, ServiceType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, service_result, ServiceResult); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, ServiceRestart); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_notify_access, notify_access, NotifyAccess); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction); +static BUS_DEFINE_PROPERTY_GET(property_get_timeout_abort_usec, "t", Service, service_timeout_abort_usec); +static BUS_DEFINE_PROPERTY_GET(property_get_watchdog_usec, "t", Service, service_get_watchdog_usec); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode); + +static int property_get_exit_status_set( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const ExitStatusSet *status_set = userdata; + unsigned n; + int r; + + assert(bus); + assert(reply); + assert(status_set); + + r = sd_bus_message_open_container(reply, 'r', "aiai"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "i"); + if (r < 0) + return r; + + BITMAP_FOREACH(n, &status_set->status) { + assert(n < 256); + + r = sd_bus_message_append_basic(reply, 'i', &n); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "i"); + if (r < 0) + return 
r; + + BITMAP_FOREACH(n, &status_set->signal) { + const char *str; + + str = signal_to_string(n); + if (!str) + continue; + + r = sd_bus_message_append_basic(reply, 'i', &n); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_service_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Restart", "s", property_get_restart, offsetof(Service, restart), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PIDFile", "s", NULL, offsetof(Service, pid_file), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NotifyAccess", "s", property_get_notify_access, offsetof(Service, notify_access), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutAbortUSec", "t", property_get_timeout_abort_usec, 0, 0), + SD_BUS_PROPERTY("TimeoutStartFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_start_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimeoutStopFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_stop_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WatchdogUSec", "t", property_get_watchdog_usec, 0, 0), + BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0), + SD_BUS_PROPERTY("PermissionsStartOnly", 
"b", bus_property_get_bool, offsetof(Service, permissions_start_only), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* 😷 deprecated */ + SD_BUS_PROPERTY("RootDirectoryStartOnly", "b", bus_property_get_bool, offsetof(Service, root_directory_start_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemainAfterExit", "b", bus_property_get_bool, offsetof(Service, remain_after_exit), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("GuessMainPID", "b", bus_property_get_bool, offsetof(Service, guess_main_pid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartPreventExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_prevent_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestartForceExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_force_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, success_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NFileDescriptorStore", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store), 0), + SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("StatusErrno", "i", bus_property_get_int, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + 
SD_BUS_PROPERTY("ReloadResult", "s", property_get_result, offsetof(Service, reload_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CleanResult", "s", property_get_result, offsetof(Service, clean_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), + + BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecCondition", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecConditionEx", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPreEx", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartEx", offsetof(Service, exec_command[SERVICE_EXEC_START]), 
SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPostEx", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecReload", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecReloadEx", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStop", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopPostEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + + /* The following four are obsolete, and thus marked hidden here. 
They moved into the Unit interface */ + SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_VTABLE_END +}; + +static int bus_set_transient_exit_status( + Unit *u, + const char *name, + ExitStatusSet *status_set, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const int32_t *status, *signal; + size_t n_status, n_signal, i; + int r; + + r = sd_bus_message_enter_container(message, 'r', "aiai"); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'i', (const void **) &status, &n_status); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'i', (const void **) &signal, &n_signal); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n_status /= sizeof(int32_t); + n_signal /= sizeof(int32_t); + + if (n_status == 0 && n_signal == 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) { + exit_status_set_free(status_set); + unit_write_settingf(u, flags, name, "%s=", name); + return 1; + } + + for (i = 0; i < n_status; i++) { + if (status[i] < 0 || status[i] > 255) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid status code in %s: %"PRIi32, name, status[i]); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bitmap_set(&status_set->status, status[i]); + if 
(r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%"PRIi32, name, status[i]); + } + } + + for (i = 0; i < n_signal; i++) { + const char *str; + + str = signal_to_string((int) signal[i]); + if (!str) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal in %s: %"PRIi32, name, signal[i]); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = bitmap_set(&status_set->signal, signal[i]); + if (r < 0) + return r; + + unit_write_settingf(u, flags, name, "%s=%s", name, str); + } + } + + return 1; +} + +static int bus_set_transient_std_fd( + Unit *u, + const char *name, + int *p, + bool *b, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int fd, r; + + assert(p); + assert(b); + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + asynchronous_close(*p); + *p = copy; + *b = true; + } + + return 1; +} +static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string); +static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, sd_bus_service_name_is_valid); +static BUS_DEFINE_SET_TRANSIENT_PARSE(timeout_failure_mode, ServiceTimeoutFailureMode, service_timeout_failure_mode_from_string); + +static int bus_service_set_transient_property( + Service *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(s); + ServiceExecCommand ci; + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "PermissionsStartOnly")) + return bus_set_transient_bool(u, name, 
&s->permissions_start_only, message, flags, error); + + if (streq(name, "RootDirectoryStartOnly")) + return bus_set_transient_bool(u, name, &s->root_directory_start_only, message, flags, error); + + if (streq(name, "RemainAfterExit")) + return bus_set_transient_bool(u, name, &s->remain_after_exit, message, flags, error); + + if (streq(name, "GuessMainPID")) + return bus_set_transient_bool(u, name, &s->guess_main_pid, message, flags, error); + + if (streq(name, "Type")) + return bus_set_transient_service_type(u, name, &s->type, message, flags, error); + + if (streq(name, "OOMPolicy")) + return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error); + + if (streq(name, "RestartUSec")) + return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error); + + if (streq(name, "TimeoutStartUSec")) { + r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + s->start_timeout_defined = true; + + return r; + } + + if (streq(name, "TimeoutStopUSec")) + return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error); + + if (streq(name, "TimeoutAbortUSec")) { + r = bus_set_transient_usec(u, name, &s->timeout_abort_usec, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + s->timeout_abort_set = true; + return r; + } + + if (streq(name, "TimeoutStartFailureMode")) + return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_start_failure_mode, message, flags, error); + + if (streq(name, "TimeoutStopFailureMode")) + return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_stop_failure_mode, message, flags, error); + + if (streq(name, "RuntimeMaxUSec")) + return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error); + + if (streq(name, "WatchdogUSec")) + return bus_set_transient_usec(u, name, &s->watchdog_usec, message, flags, error); + + if (streq(name, "FileDescriptorStoreMax")) + return 
bus_set_transient_unsigned(u, name, &s->n_fd_store_max, message, flags, error); + + if (streq(name, "NotifyAccess")) + return bus_set_transient_notify_access(u, name, &s->notify_access, message, flags, error); + + if (streq(name, "PIDFile")) { + _cleanup_free_ char *n = NULL; + const char *v, *e; + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!isempty(v)) { + n = path_make_absolute(v, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!n) + return -ENOMEM; + + path_simplify(n, true); + + if (!path_is_normalized(n)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PIDFile= path '%s' is not valid", n); + + e = path_startswith(n, "/var/run/"); + if (e) { + char *z; + + z = path_join("/run", e); + if (!z) + return log_oom(); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) + log_unit_notice(u, "Transient unit's PIDFile= property references path below legacy directory /var/run, updating %s → %s; please update client accordingly.", n, z); + + free_and_replace(n, z); + } + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + free_and_replace(s->pid_file, n); + unit_write_settingf(u, flags, name, "%s=%s", name, strempty(s->pid_file)); + } + + return 1; + } + + if (streq(name, "USBFunctionDescriptors")) + return bus_set_transient_path(u, name, &s->usb_function_descriptors, message, flags, error); + + if (streq(name, "USBFunctionStrings")) + return bus_set_transient_path(u, name, &s->usb_function_strings, message, flags, error); + + if (streq(name, "BusName")) + return bus_set_transient_bus_name(u, name, &s->bus_name, message, flags, error); + + if (streq(name, "Restart")) + return bus_set_transient_service_restart(u, name, &s->restart, message, flags, error); + + if (streq(name, "RestartPreventExitStatus")) + return bus_set_transient_exit_status(u, name, &s->restart_prevent_status, message, flags, error); + + if (streq(name, "RestartForceExitStatus")) + return bus_set_transient_exit_status(u, name, &s->restart_force_status, message, flags, 
error); + + if (streq(name, "SuccessExitStatus")) + return bus_set_transient_exit_status(u, name, &s->success_status, message, flags, error); + + ci = service_exec_command_from_string(name); + ci = (ci >= 0) ? ci : service_exec_ex_command_from_string(name); + if (ci >= 0) + return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error); + + if (streq(name, "StandardInputFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stdin_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "StandardOutputFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stdout_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + if (streq(name, "StandardErrorFileDescriptor")) + return bus_set_transient_std_fd(u, name, &s->stderr_fd, &s->exec_context.stdio_as_fds, message, flags, error); + + return 0; +} + +int bus_service_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Service *s = SERVICE(u); + int r; + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_service_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_service_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h new file mode 100644 index 0000000..6931167 --- /dev/null +++ b/src/core/dbus-service.h @@ -0,0 +1,12 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_service_vtable[]; + +int bus_service_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); +int bus_service_commit_properties(Unit *u); diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c new file mode 100644 index 0000000..de41d65 --- /dev/null +++ b/src/core/dbus-slice.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-cgroup.h" +#include "dbus-slice.h" +#include "slice.h" +#include "unit.h" + +const sd_bus_vtable bus_slice_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_VTABLE_END +}; + +int bus_slice_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Slice *s = SLICE(u); + + assert(name); + assert(u); + + return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); +} + +int bus_slice_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-slice.h b/src/core/dbus-slice.h new file mode 100644 index 0000000..eb71916 --- /dev/null +++ b/src/core/dbus-slice.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_slice_vtable[]; + +int bus_slice_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_slice_commit_properties(Unit *u); diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c new file mode 100644 index 0000000..2c9da74 --- /dev/null +++ b/src/core/dbus-socket.c @@ -0,0 +1,485 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include 
"dbus-kill.h" +#include "dbus-socket.h" +#include "dbus-util.h" +#include "fd-util.h" +#include "ip-protocol-list.h" +#include "parse-util.h" +#include "path-util.h" +#include "socket.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "string-util.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, socket_result, SocketResult); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_bind_ipv6_only, socket_address_bind_ipv6_only, SocketAddressBindIPv6Only); +static BUS_DEFINE_PROPERTY_GET(property_get_fdname, "s", Socket, socket_fdname); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timestamping, socket_timestamping, SocketTimestamping); + +static int property_get_listen( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Socket *s = SOCKET(userdata); + SocketPort *p; + int r; + + assert(bus); + assert(reply); + assert(s); + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(port, p, s->ports) { + _cleanup_free_ char *address = NULL; + const char *a; + + switch (p->type) { + case SOCKET_SOCKET: { + r = socket_address_print(&p->address, &address); + if (r) + return r; + + a = address; + break; + } + + case SOCKET_SPECIAL: + case SOCKET_MQUEUE: + case SOCKET_FIFO: + case SOCKET_USB_FUNCTION: + a = p->path; + break; + + default: + assert_not_reached("Unknown socket type"); + } + + r = sd_bus_message_append(reply, "(ss)", socket_port_type_to_string(p), a); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_socket_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("BindIPv6Only", "s", property_get_bind_ipv6_only, offsetof(Socket, bind_ipv6_only), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Backlog", "u", bus_property_get_unsigned, offsetof(Socket, backlog), SD_BUS_VTABLE_PROPERTY_CONST), + 
SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Socket, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindToDevice", "s", NULL, offsetof(Socket, bind_to_device), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketUser", "s", NULL, offsetof(Socket, user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketGroup", "s", NULL, offsetof(Socket, group), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SocketMode", "u", bus_property_get_mode, offsetof(Socket, socket_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Socket, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Accept", "b", bus_property_get_bool, offsetof(Socket, accept), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FlushPending", "b", bus_property_get_bool, offsetof(Socket, flush_pending), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Writable", "b", bus_property_get_bool, offsetof(Socket, writable), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAlive", "b", bus_property_get_bool, offsetof(Socket, keep_alive), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveTimeUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_time), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KeepAliveProbes", "u", bus_property_get_unsigned, offsetof(Socket, keep_alive_cnt), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DeferAcceptUSec" , "t", bus_property_get_usec, offsetof(Socket, defer_accept), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NoDelay", "b", bus_property_get_bool, offsetof(Socket, no_delay), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Priority", "i", bus_property_get_int, offsetof(Socket, priority), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReceiveBuffer", "t", bus_property_get_size, 
offsetof(Socket, receive_buffer), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SendBuffer", "t", bus_property_get_size, offsetof(Socket, send_buffer), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPTOS", "i", bus_property_get_int, offsetof(Socket, ip_tos), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IPTTL", "i", bus_property_get_int, offsetof(Socket, ip_ttl), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PipeSize", "t", bus_property_get_size, offsetof(Socket, pipe_size), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FreeBind", "b", bus_property_get_bool, offsetof(Socket, free_bind), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveOnStop", "b", bus_property_get_bool, offsetof(Socket, remove_on_stop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Listen", "a(ss)", property_get_listen, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Symlinks", "as", NULL, offsetof(Socket, symlinks), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Mark", "i", bus_property_get_int, offsetof(Socket, mark), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MaxConnections", "u", bus_property_get_unsigned, offsetof(Socket, max_connections), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MaxConnectionsPerSource", 
"u", bus_property_get_unsigned, offsetof(Socket, max_connections_per_source), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MessageQueueMaxMessages", "x", bus_property_get_long, offsetof(Socket, mq_maxmsg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MessageQueueMessageSize", "x", bus_property_get_long, offsetof(Socket, mq_msgsize), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TCPCongestion", "s", NULL, offsetof(Socket, tcp_congestion), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReusePort", "b", bus_property_get_bool, offsetof(Socket, reuse_port), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabel", "s", NULL, offsetof(Socket, smack), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabelIPIn", "s", NULL, offsetof(Socket, smack_ip_in), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SmackLabelIPOut", "s", NULL, offsetof(Socket, smack_ip_out), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Socket, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Socket, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NConnections", "u", bus_property_get_unsigned, offsetof(Socket, n_connections), 0), + SD_BUS_PROPERTY("NAccepted", "u", bus_property_get_unsigned, offsetof(Socket, n_accepted), 0), + SD_BUS_PROPERTY("NRefused", "u", bus_property_get_unsigned, offsetof(Socket, n_refused), 0), + SD_BUS_PROPERTY("FileDescriptorName", "s", property_get_fdname, 0, 0), + SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", 
/* Returns true if the 64-bit value 't' can be stored in a size_t and read back unchanged, i.e. if
 * converting it to size_t does not drop any bits. */
static bool check_size_t_truncation(uint64_t t) {
        const size_t narrowed = (size_t) t;

        return (uint64_t) narrowed == t;
}
socket_timestamping_from_string_harder); + +static int bus_socket_set_transient_property( + Socket *s, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + SocketExecCommand ci; + Unit *u = UNIT(s); + int r; + + assert(s); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "Accept")) + return bus_set_transient_bool(u, name, &s->accept, message, flags, error); + + if (streq(name, "FlushPending")) + return bus_set_transient_bool(u, name, &s->flush_pending, message, flags, error); + + if (streq(name, "Writable")) + return bus_set_transient_bool(u, name, &s->writable, message, flags, error); + + if (streq(name, "KeepAlive")) + return bus_set_transient_bool(u, name, &s->keep_alive, message, flags, error); + + if (streq(name, "NoDelay")) + return bus_set_transient_bool(u, name, &s->no_delay, message, flags, error); + + if (streq(name, "FreeBind")) + return bus_set_transient_bool(u, name, &s->free_bind, message, flags, error); + + if (streq(name, "Transparent")) + return bus_set_transient_bool(u, name, &s->transparent, message, flags, error); + + if (streq(name, "Broadcast")) + return bus_set_transient_bool(u, name, &s->broadcast, message, flags, error); + + if (streq(name, "PassCredentials")) + return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error); + + if (streq(name, "PassSecurity")) + return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error); + + if (streq(name, "PassPacketInfo")) + return bus_set_transient_bool(u, name, &s->pass_pktinfo, message, flags, error); + + if (streq(name, "Timestamping")) + return bus_set_transient_socket_timestamping(u, name, &s->timestamping, message, flags, error); + + if (streq(name, "ReusePort")) + return bus_set_transient_bool(u, name, &s->reuse_port, message, flags, error); + + if (streq(name, "RemoveOnStop")) + return bus_set_transient_bool(u, name, &s->remove_on_stop, message, flags, error); + + if (streq(name, 
"SELinuxContextFromNet")) + return bus_set_transient_bool(u, name, &s->selinux_context_from_net, message, flags, error); + + if (streq(name, "Priority")) + return bus_set_transient_int(u, name, &s->priority, message, flags, error); + + if (streq(name, "IPTTL")) + return bus_set_transient_int(u, name, &s->ip_ttl, message, flags, error); + + if (streq(name, "Mark")) + return bus_set_transient_int(u, name, &s->mark, message, flags, error); + + if (streq(name, "Backlog")) + return bus_set_transient_unsigned(u, name, &s->backlog, message, flags, error); + + if (streq(name, "MaxConnections")) + return bus_set_transient_unsigned(u, name, &s->max_connections, message, flags, error); + + if (streq(name, "MaxConnectionsPerSource")) + return bus_set_transient_unsigned(u, name, &s->max_connections_per_source, message, flags, error); + + if (streq(name, "KeepAliveProbes")) + return bus_set_transient_unsigned(u, name, &s->keep_alive_cnt, message, flags, error); + + if (streq(name, "TriggerLimitBurst")) + return bus_set_transient_unsigned(u, name, &s->trigger_limit.burst, message, flags, error); + + if (streq(name, "SocketMode")) + return bus_set_transient_mode_t(u, name, &s->socket_mode, message, flags, error); + + if (streq(name, "DirectoryMode")) + return bus_set_transient_mode_t(u, name, &s->directory_mode, message, flags, error); + + if (streq(name, "MessageQueueMaxMessages")) + return bus_set_transient_message_queue(u, name, &s->mq_maxmsg, message, flags, error); + + if (streq(name, "MessageQueueMessageSize")) + return bus_set_transient_message_queue(u, name, &s->mq_msgsize, message, flags, error); + + if (streq(name, "TimeoutUSec")) + return bus_set_transient_usec_fix_0(u, name, &s->timeout_usec, message, flags, error); + + if (streq(name, "KeepAliveTimeUSec")) + return bus_set_transient_usec(u, name, &s->keep_alive_time, message, flags, error); + + if (streq(name, "KeepAliveIntervalUSec")) + return bus_set_transient_usec(u, name, &s->keep_alive_interval, message, flags, 
error); + + if (streq(name, "DeferAcceptUSec")) + return bus_set_transient_usec(u, name, &s->defer_accept, message, flags, error); + + if (streq(name, "TriggerLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &s->trigger_limit.interval, message, flags, error); + + if (streq(name, "SmackLabel")) + return bus_set_transient_string(u, name, &s->smack, message, flags, error); + + if (streq(name, "SmackLabelIPin")) + return bus_set_transient_string(u, name, &s->smack_ip_in, message, flags, error); + + if (streq(name, "SmackLabelIPOut")) + return bus_set_transient_string(u, name, &s->smack_ip_out, message, flags, error); + + if (streq(name, "TCPCongestion")) + return bus_set_transient_string(u, name, &s->tcp_congestion, message, flags, error); + + if (streq(name, "FileDescriptorName")) + return bus_set_transient_fdname(u, name, &s->fdname, message, flags, error); + + if (streq(name, "SocketUser")) + return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error); + + if (streq(name, "SocketGroup")) + return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error); + + if (streq(name, "BindIPv6Only")) + return bus_set_transient_bind_ipv6_only(u, name, &s->bind_ipv6_only, message, flags, error); + + if (streq(name, "ReceiveBuffer")) + return bus_set_transient_size_t_check_truncation(u, name, &s->receive_buffer, message, flags, error); + + if (streq(name, "SendBuffer")) + return bus_set_transient_size_t_check_truncation(u, name, &s->send_buffer, message, flags, error); + + if (streq(name, "PipeSize")) + return bus_set_transient_size_t_check_truncation(u, name, &s->pipe_size, message, flags, error); + + if (streq(name, "BindToDevice")) + return bus_set_transient_ifname(u, name, &s->bind_to_device, message, flags, error); + + if (streq(name, "IPTOS")) + return bus_set_transient_ip_tos(u, name, &s->ip_tos, message, flags, error); + + if (streq(name, "SocketProtocol")) + return bus_set_transient_socket_protocol(u, name, 
&s->socket_protocol, message, flags, error); + + ci = socket_exec_command_from_string(name); + if (ci >= 0) + return bus_set_transient_exec_command(u, name, + &s->exec_command[ci], + message, flags, error); + + if (streq(name, "Symlinks")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Symlink path is not absolute: %s", *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + s->symlinks = strv_free(s->symlinks); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=", name); + } else { + _cleanup_free_ char *joined = NULL; + + r = strv_extend_strv(&s->symlinks, l, true); + if (r < 0) + return -ENOMEM; + + joined = strv_join(l, " "); + if (!joined) + return -ENOMEM; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined); + } + } + + return 1; + + } else if (streq(name, "Listen")) { + const char *t, *a; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &t, &a)) > 0) { + _cleanup_free_ SocketPort *p = NULL; + + p = new(SocketPort, 1); + if (!p) + return log_oom(); + + *p = (SocketPort) { + .fd = -1, + .socket = s, + }; + + p->type = socket_port_type_from_string(t); + if (p->type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown Socket type: %s", t); + + if (p->type != SOCKET_SOCKET) { + if (!path_is_valid(p->path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid socket path: %s", t); + + p->path = strdup(a); + if (!p->path) + return log_oom(); + + path_simplify(p->path, false); + + } else if (streq(t, "Netlink")) { + r = socket_address_parse_netlink(&p->address, a); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid netlink 
address: %s", a); + + } else { + r = socket_address_parse(&p->address, a); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address: %s", a); + + p->address.type = socket_address_type_from_string(t); + if (p->address.type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address type: %s", t); + + if (socket_address_family(&p->address) != AF_LOCAL && p->address.type == SOCK_SEQPACKET) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Address family not supported: %s", a); + } + + empty = false; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + LIST_APPEND(port, s->ports, TAKE_PTR(p)); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Listen%s=%s", t, a); + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + socket_free_ports(s); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ListenStream="); + } + + return 1; + } + + return 0; +} + +int bus_socket_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(name); + assert(message); + + assert(s); + assert(name); + assert(message); + + r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); + if (r != 0) + return r; + + if (u->transient && u->load_state == UNIT_STUB) { + /* This is a transient unit, let's load a little more */ + + r = bus_socket_set_transient_property(s, name, message, flags, error); + if (r != 0) + return r; + + r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error); + if (r != 0) + return r; + + r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error); + if (r != 0) + return r; + } + + return 0; +} + +int bus_socket_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + 
return 0; +} diff --git a/src/core/dbus-socket.h b/src/core/dbus-socket.h new file mode 100644 index 0000000..f9f36a2 --- /dev/null +++ b/src/core/dbus-socket.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_socket_vtable[]; + +int bus_socket_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_socket_commit_properties(Unit *u); diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c new file mode 100644 index 0000000..0fa8dd1 --- /dev/null +++ b/src/core/dbus-swap.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "bus-get-properties.h" +#include "dbus-cgroup.h" +#include "dbus-execute.h" +#include "dbus-swap.h" +#include "string-util.h" +#include "swap.h" +#include "unit.h" + +static int swap_get_priority(Swap *s) { + assert(s); + + if (s->from_proc_swaps && s->parameters_proc_swaps.priority_set) + return s->parameters_proc_swaps.priority; + + if (s->from_fragment && s->parameters_fragment.priority_set) + return s->parameters_fragment.priority; + + return -1; +} + +static const char *swap_get_options(Swap *s) { + assert(s); + + if (s->from_fragment) + return s->parameters_fragment.options; + + return NULL; +} + +static BUS_DEFINE_PROPERTY_GET(property_get_priority, "i", Swap, swap_get_priority); +static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Swap, swap_get_options); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, swap_result, SwapResult); + +const sd_bus_vtable bus_swap_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("What", "s", NULL, offsetof(Swap, what), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Priority", "i", property_get_priority, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options", "s", property_get_options, 0, 
SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +int bus_swap_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Swap *s = SWAP(u); + + assert(s); + assert(name); + assert(message); + + return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); +} + +int bus_swap_commit_properties(Unit *u) { + assert(u); + + unit_realize_cgroup(u); + + return 0; +} diff --git a/src/core/dbus-swap.h b/src/core/dbus-swap.h new file mode 100644 index 0000000..9d651b5 --- /dev/null +++ b/src/core/dbus-swap.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_swap_vtable[]; + +int bus_swap_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_swap_commit_properties(Unit *u); diff --git a/src/core/dbus-target.c b/src/core/dbus-target.c new file mode 100644 index 
0000000..e979fb7 --- /dev/null +++ b/src/core/dbus-target.c @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-target.h" +#include "unit.h" + +const sd_bus_vtable bus_target_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_VTABLE_END +}; diff --git a/src/core/dbus-target.h b/src/core/dbus-target.h new file mode 100644 index 0000000..fedd4a9 --- /dev/null +++ b/src/core/dbus-target.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus-vtable.h" + +extern const sd_bus_vtable bus_target_vtable[]; diff --git a/src/core/dbus-timer.c b/src/core/dbus-timer.c new file mode 100644 index 0000000..8e69c17 --- /dev/null +++ b/src/core/dbus-timer.c @@ -0,0 +1,382 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "dbus-timer.h" +#include "dbus-util.h" +#include "strv.h" +#include "timer.h" +#include "unit.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, timer_result, TimerResult); + +static int property_get_monotonic_timers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = userdata; + TimerValue *v; + int r; + + assert(bus); + assert(reply); + assert(t); + + r = sd_bus_message_open_container(reply, 'a', "(stt)"); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + _cleanup_free_ char *buf = NULL; + const char *s; + size_t l; + + if (v->base == TIMER_CALENDAR) + continue; + + s = timer_base_to_string(v->base); + assert(endswith(s, "Sec")); + + /* s/Sec/USec/ */ + l = strlen(s); + buf = new(char, l+2); + if (!buf) + return -ENOMEM; + + memcpy(buf, s, l-3); + memcpy(buf+l-3, "USec", 5); + + r = sd_bus_message_append(reply, "(stt)", buf, v->value, v->next_elapse); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_calendar_timers( 
+ sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = userdata; + TimerValue *v; + int r; + + assert(bus); + assert(reply); + assert(t); + + r = sd_bus_message_open_container(reply, 'a', "(sst)"); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + _cleanup_free_ char *buf = NULL; + + if (v->base != TIMER_CALENDAR) + continue; + + r = calendar_spec_to_string(v->calendar_spec, &buf); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "(sst)", timer_base_to_string(v->base), buf, v->next_elapse); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_next_elapse_monotonic( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Timer *t = userdata; + + assert(bus); + assert(reply); + assert(t); + + return sd_bus_message_append(reply, "t", + (uint64_t) usec_shift_clock(t->next_elapse_monotonic_or_boottime, + TIMER_MONOTONIC_CLOCK(t), CLOCK_MONOTONIC)); +} + +const sd_bus_vtable bus_timer_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TimersMonotonic", "a(stt)", property_get_monotonic_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("TimersCalendar", "a(sst)", property_get_calendar_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("OnClockChange", "b", bus_property_get_bool, offsetof(Timer, on_clock_change), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnTimezoneChange", "b", bus_property_get_bool, offsetof(Timer, on_timezone_change), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NextElapseUSecRealtime", "t", bus_property_get_usec, offsetof(Timer, next_elapse_realtime), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + 
SD_BUS_PROPERTY("NextElapseUSecMonotonic", "t", property_get_next_elapse_monotonic, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("LastTriggerUSec", offsetof(Timer, last_trigger), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Timer, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AccuracyUSec", "t", bus_property_get_usec, offsetof(Timer, accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Timer, random_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FixedRandomDelay", "b", bus_property_get_bool, offsetof(Timer, fixed_random_delay), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Persistent", "b", bus_property_get_bool, offsetof(Timer, persistent), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WakeSystem", "b", bus_property_get_bool, offsetof(Timer, wake_system), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemainAfterElapse", "b", bus_property_get_bool, offsetof(Timer, remain_after_elapse), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_VTABLE_END +}; + +static int timer_add_one_monotonic_spec( + Timer *t, + const char *name, + TimerBase base, + UnitWriteFlags flags, + usec_t usec, + sd_bus_error *error) { + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + char ts[FORMAT_TIMESPAN_MAX]; + TimerValue *v; + + unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", + timer_base_to_string(base), + format_timespan(ts, sizeof ts, usec, USEC_PER_MSEC)); + + v = new(TimerValue, 1); + if (!v) + return -ENOMEM; + + *v = (TimerValue) { + .base = base, + .value = usec, + }; + + LIST_PREPEND(value, t->values, v); + } + + return 1; +} + +static int timer_add_one_calendar_spec( + Timer *t, + const char *name, + TimerBase base, + UnitWriteFlags flags, + const char *str, + sd_bus_error *error) { + + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + int r; + + r = 
calendar_spec_from_string(str, &c); + if (r == -EINVAL) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec"); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", timer_base_to_string(base), str); + + TimerValue *v = new(TimerValue, 1); + if (!v) + return -ENOMEM; + + *v = (TimerValue) { + .base = base, + .calendar_spec = TAKE_PTR(c), + }; + + LIST_PREPEND(value, t->values, v); + } + + return 1; +}; + +static int bus_timer_set_transient_property( + Timer *t, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + Unit *u = UNIT(t); + int r; + + assert(t); + assert(name); + assert(message); + + flags |= UNIT_PRIVATE; + + if (streq(name, "AccuracyUSec")) + return bus_set_transient_usec(u, name, &t->accuracy_usec, message, flags, error); + + if (streq(name, "AccuracySec")) { + log_notice("Client is using obsolete AccuracySec= transient property, please use AccuracyUSec= instead."); + return bus_set_transient_usec(u, "AccuracyUSec", &t->accuracy_usec, message, flags, error); + } + + if (streq(name, "RandomizedDelayUSec")) + return bus_set_transient_usec(u, name, &t->random_usec, message, flags, error); + + if (streq(name, "FixedRandomDelay")) + return bus_set_transient_bool(u, name, &t->fixed_random_delay, message, flags, error); + + if (streq(name, "WakeSystem")) + return bus_set_transient_bool(u, name, &t->wake_system, message, flags, error); + + if (streq(name, "Persistent")) + return bus_set_transient_bool(u, name, &t->persistent, message, flags, error); + + if (streq(name, "RemainAfterElapse")) + return bus_set_transient_bool(u, name, &t->remain_after_elapse, message, flags, error); + + if (streq(name, "OnTimezoneChange")) + return bus_set_transient_bool(u, name, &t->on_timezone_change, message, flags, error); + + if (streq(name, "OnClockChange")) + return bus_set_transient_bool(u, name, 
&t->on_clock_change, message, flags, error); + + if (streq(name, "TimersMonotonic")) { + const char *base_name; + usec_t usec; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(st)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(st)", &base_name, &usec)) > 0) { + TimerBase b; + + b = timer_base_from_string(base_name); + if (b < 0 || b == TIMER_CALENDAR) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid timer base: %s", base_name); + + r = timer_add_one_monotonic_spec(t, name, b, flags, usec, error); + if (r < 0) + return r; + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + timer_free_values(t); + unit_write_setting(u, flags, name, "OnActiveSec="); + } + + return 1; + + } else if (streq(name, "TimersCalendar")) { + const char *base_name, *str; + bool empty = true; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &base_name, &str)) > 0) { + TimerBase b; + + b = timer_base_from_string(base_name); + if (b != TIMER_CALENDAR) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid timer base: %s", base_name); + + r = timer_add_one_calendar_spec(t, name, b, flags, str, error); + if (r < 0) + return r; + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + timer_free_values(t); + unit_write_setting(u, flags, name, "OnCalendar="); + } + + return 1; + + } else if (STR_IN_SET(name, + "OnActiveSec", + "OnBootSec", + "OnStartupSec", + "OnUnitActiveSec", + "OnUnitInactiveSec")) { + + TimerBase b; + usec_t usec; + + log_notice("Client is using obsolete %s= transient property, please use TimersMonotonic= instead.", name); + + b = timer_base_from_string(name); + if 
(b < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown timer base"); + + r = sd_bus_message_read(message, "t", &usec); + if (r < 0) + return r; + + return timer_add_one_monotonic_spec(t, name, b, flags, usec, error); + + } else if (streq(name, "OnCalendar")) { + + const char *str; + + log_notice("Client is using obsolete %s= transient property, please use TimersCalendar= instead.", name); + + r = sd_bus_message_read(message, "s", &str); + if (r < 0) + return r; + + return timer_add_one_calendar_spec(t, name, TIMER_CALENDAR, flags, str, error); + } + + return 0; +} + +int bus_timer_set_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags mode, + sd_bus_error *error) { + + Timer *t = TIMER(u); + + assert(t); + assert(name); + assert(message); + + if (u->transient && u->load_state == UNIT_STUB) + return bus_timer_set_transient_property(t, name, message, mode, error); + + return 0; +} diff --git a/src/core/dbus-timer.h b/src/core/dbus-timer.h new file mode 100644 index 0000000..ac436f1 --- /dev/null +++ b/src/core/dbus-timer.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-bus-vtable.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_timer_vtable[]; + +int bus_timer_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c new file mode 100644 index 0000000..427152a --- /dev/null +++ b/src/core/dbus-unit.c @@ -0,0 +1,2475 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "condition.h" +#include "dbus-job.h" +#include "dbus-unit.h" +#include "dbus-util.h" +#include "dbus.h" +#include "fd-util.h" +#include "install.h" +#include 
"locale-util.h" +#include "log.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-access.h" +#include "signal-util.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" +#include "web-util.h" + +static bool unit_can_start_refuse_manual(Unit *u) { + return unit_can_start(u) && !u->refuse_manual_start; +} + +static bool unit_can_stop_refuse_manual(Unit *u) { + return unit_can_stop(u) && !u->refuse_manual_stop; +} + +static bool unit_can_isolate_refuse_manual(Unit *u) { + return unit_can_isolate(u) && !u->refuse_manual_start; +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_collect_mode, collect_mode, CollectMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_load_state, unit_load_state, UnitLoadState); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_job_mode, job_mode, JobMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction); +static BUS_DEFINE_PROPERTY_GET(property_get_description, "s", Unit, unit_description); +static BUS_DEFINE_PROPERTY_GET2(property_get_active_state, "s", Unit, unit_active_state, unit_active_state_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_freezer_state, "s", Unit, unit_freezer_state, freezer_state_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_sub_state, "s", Unit, unit_sub_state_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_unit_file_state, "s", Unit, unit_get_unit_file_state, unit_file_state_to_string); +static BUS_DEFINE_PROPERTY_GET(property_get_can_reload, "b", Unit, unit_can_reload); +static BUS_DEFINE_PROPERTY_GET(property_get_can_start, "b", Unit, unit_can_start_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_stop, "b", Unit, unit_can_stop_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_isolate, "b", Unit, unit_can_isolate_refuse_manual); +static BUS_DEFINE_PROPERTY_GET(property_get_can_freeze, "b", Unit, 
unit_can_freeze); +static BUS_DEFINE_PROPERTY_GET(property_get_need_daemon_reload, "b", Unit, unit_need_daemon_reload); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_strv, "as", 0); + +static int property_get_can_clean( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + ExecCleanMask mask; + int r; + + assert(bus); + assert(reply); + + r = unit_can_clean(u, &mask); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!FLAGS_SET(mask, 1U << t)) + continue; + + r = sd_bus_message_append(reply, "s", exec_resource_type_to_string(t)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_names( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + const char *t; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", u->id); + if (r < 0) + return r; + + SET_FOREACH(t, u->aliases) { + r = sd_bus_message_append(reply, "s", t); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_following( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *f; + + assert(bus); + assert(reply); + assert(u); + + f = unit_following(u); + return sd_bus_message_append(reply, "s", f ? 
f->id : NULL); +} + +static int property_get_dependencies( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Hashmap **h = userdata; + Unit *u; + void *v; + int r; + + assert(bus); + assert(reply); + assert(h); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(v, u, *h) { + r = sd_bus_message_append(reply, "s", u->id); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_requires_mounts_for( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Hashmap **h = userdata; + const char *p; + void *v; + int r; + + assert(bus); + assert(reply); + assert(h); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(v, p, *h) { + r = sd_bus_message_append(reply, "s", p); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_unit_file_preset( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_unit_file_preset(u); + + return sd_bus_message_append(reply, "s", + r < 0 ? NULL: + r > 0 ? 
"enabled" : "disabled"); +} + +static int property_get_job( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Job **j = userdata; + + assert(bus); + assert(reply); + assert(j); + + if (!*j) + return sd_bus_message_append(reply, "(uo)", 0, "/"); + + p = job_dbus_path(*j); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(uo)", (*j)->id, p); +} + +static int property_get_conditions( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const char *(*to_string)(ConditionType type) = NULL; + Condition **list = userdata, *c; + int r; + + assert(bus); + assert(reply); + assert(list); + + to_string = streq(property, "Asserts") ? assert_type_to_string : condition_type_to_string; + + r = sd_bus_message_open_container(reply, 'a', "(sbbsi)"); + if (r < 0) + return r; + + LIST_FOREACH(conditions, c, *list) { + int tristate; + + tristate = + c->result == CONDITION_UNTESTED ? 0 : + c->result == CONDITION_SUCCEEDED ? 
1 : -1; + + r = sd_bus_message_append(reply, "(sbbsi)", + to_string(c->type), + c->trigger, c->negate, + c->parameter, tristate); + if (r < 0) + return r; + + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_load_error( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = bus_unit_validate_load_state(u, &e); + if (r < 0) + return sd_bus_message_append(reply, "(ss)", e.name, e.message); + + return sd_bus_message_append(reply, "(ss)", NULL, NULL); +} + +static int bus_verify_manage_units_async_full( + Unit *u, + const char *verb, + int capability, + const char *polkit_message, + bool interactive, + sd_bus_message *call, + sd_bus_error *error) { + + const char *details[9] = { + "unit", u->id, + "verb", verb, + }; + + if (polkit_message) { + details[4] = "polkit.message"; + details[5] = polkit_message; + details[6] = "polkit.gettext_domain"; + details[7] = GETTEXT_PACKAGE; + } + + return bus_verify_polkit_async( + call, + capability, + "org.freedesktop.systemd1.manage-units", + details, + interactive, + UID_INVALID, + &u->manager->polkit_registry, + error); +} + +static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = { + [JOB_START] = N_("Authentication is required to start '$(unit)'."), + [JOB_STOP] = N_("Authentication is required to stop '$(unit)'."), + [JOB_RELOAD] = N_("Authentication is required to reload '$(unit)'."), + [JOB_RESTART] = N_("Authentication is required to restart '$(unit)'."), + [JOB_TRY_RESTART] = N_("Authentication is required to restart '$(unit)'."), +}; + +int bus_unit_method_start_generic( + sd_bus_message *message, + Unit *u, + JobType job_type, + bool reload_if_possible, + sd_bus_error *error) { + + const char *smode, *verb; + JobMode mode; + int r; 
+ + assert(message); + assert(u); + assert(job_type >= 0 && job_type < _JOB_TYPE_MAX); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(job_type), + error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &smode); + if (r < 0) + return r; + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode); + + if (reload_if_possible) + verb = strjoina("reload-or-", job_type_to_string(job_type)); + else + verb = job_type_to_string(job_type); + + r = bus_verify_manage_units_async_full( + u, + verb, + CAP_SYS_ADMIN, + polkit_message_for_job[job_type], + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + return bus_unit_queue_job(message, u, job_type, mode, + reload_if_possible ? BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE : 0, error); +} + +static int method_start(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_START, false, error); +} + +static int method_stop(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_STOP, false, error); +} + +static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RELOAD, false, error); +} + +static int method_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_RESTART, false, error); +} + +static int method_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, false, error); +} + +static int method_reload_or_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + 
return bus_unit_method_start_generic(message, userdata, JOB_RESTART, true, error); +} + +static int method_reload_or_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, true, error); +} + +int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error) { + BusUnitQueueFlags flags = BUS_UNIT_QUEUE_VERBOSE_REPLY; + const char *jtype, *smode; + Unit *u = userdata; + JobType type; + JobMode mode; + int r; + + assert(message); + assert(u); + + r = sd_bus_message_read(message, "ss", &jtype, &smode); + if (r < 0) + return r; + + /* Parse the two magic reload types "reload-or-…" manually */ + if (streq(jtype, "reload-or-restart")) { + type = JOB_RESTART; + flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE; + } else if (streq(jtype, "reload-or-try-restart")) { + type = JOB_TRY_RESTART; + flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE; + } else { + /* And the rest generically */ + type = job_type_from_string(jtype); + if (type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job type %s invalid", jtype); + } + + mode = job_mode_from_string(smode); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(type), + error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + jtype, + CAP_SYS_ADMIN, + polkit_message_for_job[type], + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + return bus_unit_queue_job(message, u, type, mode, flags, error); +} + +int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + const char *swho; + int32_t signo; + KillWho who; + int r; + + assert(message); + assert(u); + + r = 
mac_selinux_unit_access_check(u, message, "stop", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "si", &swho, &signo); + if (r < 0) + return r; + + if (isempty(swho)) + who = KILL_ALL; + else { + who = kill_who_from_string(swho); + if (who < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid who argument %s", swho); + } + + if (!SIGNAL_VALID(signo)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Signal number out of range."); + + r = bus_verify_manage_units_async_full( + u, + "kill", + CAP_KILL, + N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_kill(u, who, signo, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "reload", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "reset-failed", + CAP_SYS_ADMIN, + N_("Authentication is required to reset the \"failed\" state of '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + unit_reset_failed(u); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int runtime, r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + + r = 
bus_verify_manage_units_async_full( + u, + "set-property", + CAP_SYS_ADMIN, + N_("Authentication is required to set properties on '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = bus_unit_set_properties(u, message, runtime ? UNIT_RUNTIME : UNIT_PERSISTENT, true, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "ref", + CAP_SYS_ADMIN, + NULL, + false, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = bus_unit_track_add_sender(u, message); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = bus_unit_track_remove_sender(u, message); + if (r == -EUNATCH) + return sd_bus_error_setf(error, BUS_ERROR_NOT_REFERENCED, "Unit has not been referenced yet."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error) { + ExecCleanMask mask = 0; + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = mac_selinux_unit_access_check(u, message, "stop", error); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *i; + + r = sd_bus_message_read(message, "s", &i); + if (r < 0) + 
return r; + if (r == 0) + break; + + if (streq(i, "all")) + mask |= EXEC_CLEAN_ALL; + else { + ExecDirectoryType t; + + t = exec_resource_type_from_string(i); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid resource type: %s", i); + + mask |= 1U << t; + } + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + "clean", + CAP_DAC_OVERRIDE, + N_("Authentication is required to delete files and directories associated with '$(unit)'."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = unit_clean(u, mask); + if (r == -EOPNOTSUPP) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not supporting cleaning.", u->id); + if (r == -EUNATCH) + return sd_bus_error_setf(error, BUS_ERROR_NOTHING_TO_CLEAN, "No matching resources found."); + if (r == -EBUSY) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_BUSY, "Unit is not inactive or has pending job."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) { + const char* perm; + int (*method)(Unit*); + Unit *u = userdata; + bool reply_no_delay = false; + int r; + + assert(message); + assert(u); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + if (action == FREEZER_FREEZE) { + perm = "stop"; + method = unit_freeze; + } else { + perm = "start"; + method = unit_thaw; + } + + r = mac_selinux_unit_access_check(u, message, perm, error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async_full( + u, + perm, + CAP_SYS_ADMIN, + N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."), + true, + message, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No 
authorization for now, but the async polkit stuff will call us again when it has it */ + + r = method(u); + if (r == -EOPNOTSUPP) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id); + if (r == -EBUSY) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job."); + if (r == -EHOSTDOWN) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive."); + if (r == -EALREADY) + return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id); + if (r < 0) + return r; + if (r == 0) + reply_no_delay = true; + + assert(!u->pending_freezer_message); + + r = sd_bus_message_new_method_return(message, &u->pending_freezer_message); + if (r < 0) + return r; + + if (reply_no_delay) { + r = bus_unit_send_pending_freezer_message(u); + if (r < 0) + return r; + } + + return 1; +} + +int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_THAW); +} + +int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_FREEZE); +} + +static int property_get_refs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + const char *i; + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (i = sd_bus_track_first(u->bus_track); i; i = sd_bus_track_next(u->bus_track)) { + int c, k; + + c = sd_bus_track_count_name(u->bus_track, i); + if (c < 0) + return c; + + /* Add the item multiple times if the ref count for each is above 1 */ + for (k = 0; k < c; k++) { + r = sd_bus_message_append(reply, "s", i); + if (r < 0) + return r; + } 
+ } + + return sd_bus_message_close_container(reply); +} + +const sd_bus_vtable bus_unit_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Unit, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Names", "as", property_get_names, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Following", "s", property_get_following, 0, 0), + SD_BUS_PROPERTY("Requires", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUIRES]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Requisite", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUISITE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Wants", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_WANTS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BindsTo", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_BINDS_TO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PartOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_PART_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiredBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUIRED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequisiteOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_REQUISITE_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WantedBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_WANTED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("BoundBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_BOUND_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConsistsOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_CONSISTS_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Conflicts", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_CONFLICTS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConflictedBy", "as", property_get_dependencies, offsetof(Unit, 
dependencies[UNIT_CONFLICTED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Before", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_BEFORE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("After", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_AFTER]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailure", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_ON_FAILURE]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Triggers", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_TRIGGERS]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TriggeredBy", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_TRIGGERED_BY]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PropagatesReloadTo", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_PROPAGATES_RELOAD_TO]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ReloadPropagatedFrom", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_RELOAD_PROPAGATED_FROM]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, offsetof(Unit, dependencies[UNIT_JOINS_NAMESPACE_OF]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LoadState", "s", property_get_load_state, offsetof(Unit, load_state), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ActiveState", "s", property_get_active_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FreezerState", "s", property_get_freezer_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("SubState", "s", property_get_sub_state, 0, 
SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FragmentPath", "s", NULL, offsetof(Unit, fragment_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Unit, source_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DropInPaths", "as", NULL, offsetof(Unit, dropin_paths), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UnitFileState", "s", property_get_unit_file_state, 0, 0), + SD_BUS_PROPERTY("UnitFilePreset", "s", property_get_unit_file_preset, 0, 0), + BUS_PROPERTY_DUAL_TIMESTAMP("StateChangeTimestamp", offsetof(Unit, state_change_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("InactiveExitTimestamp", offsetof(Unit, inactive_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ActiveEnterTimestamp", offsetof(Unit, active_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ActiveExitTimestamp", offsetof(Unit, active_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("InactiveEnterTimestamp", offsetof(Unit, inactive_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CanStart", "b", property_get_can_start, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanStop", "b", property_get_can_stop, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanReload", "b", property_get_can_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanIsolate", "b", property_get_can_isolate, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanClean", "as", property_get_can_clean, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanFreeze", "b", property_get_can_freeze, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Job", "(uo)", property_get_job, offsetof(Unit, job), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("StopWhenUnneeded", "b", bus_property_get_bool, offsetof(Unit, stop_when_unneeded), SD_BUS_VTABLE_PROPERTY_CONST), + 
SD_BUS_PROPERTY("RefuseManualStart", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_start), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IgnoreOnIsolate", "b", bus_property_get_bool, offsetof(Unit, ignore_on_isolate), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NeedDaemonReload", "b", property_get_need_daemon_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_timeout), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobRunningTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_running_timeout), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutAction", "s", property_get_emergency_action, offsetof(Unit, job_timeout_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("JobTimeoutRebootArgument", "s", NULL, offsetof(Unit, job_timeout_reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ConditionResult", "b", bus_property_get_bool, offsetof(Unit, condition_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AssertResult", "b", bus_property_get_bool, offsetof(Unit, assert_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("ConditionTimestamp", offsetof(Unit, condition_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_PROPERTY_DUAL_TIMESTAMP("AssertTimestamp", offsetof(Unit, assert_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Conditions", "a(sbbsi)", 
property_get_conditions, offsetof(Unit, conditions), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("Asserts", "a(sbbsi)", property_get_conditions, offsetof(Unit, asserts), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("LoadError", "(ss)", property_get_load_error, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Transient", "b", bus_property_get_bool, offsetof(Unit, transient), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Perpetual", "b", bus_property_get_bool, offsetof(Unit, perpetual), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FailureActionExitStatus", "i", bus_property_get_int, offsetof(Unit, failure_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessAction", "s", property_get_emergency_action, offsetof(Unit, success_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SuccessActionExitStatus", "i", bus_property_get_int, offsetof(Unit, success_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0), + + 
SD_BUS_METHOD_WITH_NAMES("Start", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_start, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("Stop", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_stop, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("Reload", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_reload, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("Restart", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("TryRestart", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_try_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ReloadOrRestart", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_reload_or_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("ReloadOrTryRestart", + "s", + SD_BUS_PARAM(mode), + "o", + SD_BUS_PARAM(job), + method_reload_or_try_restart, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("EnqueueJob", + "ss", + SD_BUS_PARAM(job_type) + SD_BUS_PARAM(job_mode), + "uososa(uosos)", + SD_BUS_PARAM(job_id) + SD_BUS_PARAM(job_path) + SD_BUS_PARAM(unit_id) + SD_BUS_PARAM(unit_path) + SD_BUS_PARAM(job_type) + SD_BUS_PARAM(affected_jobs), + bus_unit_method_enqueue_job, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("Kill", + "si", + SD_BUS_PARAM(whom) + SD_BUS_PARAM(signal), + NULL,, + bus_unit_method_kill, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ResetFailed", + NULL, + NULL, + bus_unit_method_reset_failed, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("SetProperties", + "ba(sv)", + SD_BUS_PARAM(runtime) + SD_BUS_PARAM(properties), + NULL,, + bus_unit_method_set_properties, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Ref", + NULL, + NULL, + bus_unit_method_ref, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unref", + NULL, + NULL, + bus_unit_method_unref, + 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_NAMES("Clean", + "as", + SD_BUS_PARAM(mask), + NULL,, + bus_unit_method_clean, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Freeze", + NULL, + NULL, + bus_unit_method_freeze, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Thaw", + NULL, + NULL, + bus_unit_method_thaw, + SD_BUS_VTABLE_UNPRIVILEGED), + + /* For dependency types we don't support anymore always return an empty array */ + SD_BUS_PROPERTY("RequiresOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequisiteOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequiredByOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("RequisiteOfOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN), + /* Obsolete alias names */ + SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + + SD_BUS_VTABLE_END +}; + +static int property_get_slice( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + + assert(bus); + assert(reply); + assert(u); + + return sd_bus_message_append(reply, "s", unit_slice_name(u)); +} + +static int property_get_current_memory( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t sz = (uint64_t) -1; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_memory_current(u, &sz); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get memory.usage_in_bytes 
attribute: %m"); + + return sd_bus_message_append(reply, "t", sz); +} + +static int property_get_current_tasks( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t cn = (uint64_t) -1; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_tasks_current(u, &cn); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get pids.current attribute: %m"); + + return sd_bus_message_append(reply, "t", cn); +} + +static int property_get_cpu_usage( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + nsec_t ns = (nsec_t) -1; + Unit *u = userdata; + int r; + + assert(bus); + assert(reply); + assert(u); + + r = unit_get_cpu_usage(u, &ns); + if (r < 0 && r != -ENODATA) + log_unit_warning_errno(u, r, "Failed to get cpuacct.usage attribute: %m"); + + return sd_bus_message_append(reply, "t", ns); +} + +static int property_get_cpuset_cpus( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + _cleanup_(cpu_set_reset) CPUSet cpus = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(u); + + (void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective"); + (void) cpu_set_to_dbus(&cpus, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_cpuset_mems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + _cleanup_(cpu_set_reset) CPUSet mems = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(u); + + 
(void) unit_get_cpuset(u, &mems, "cpuset.mems.effective"); + (void) cpu_set_to_dbus(&mems, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_cgroup( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + const char *t = NULL; + + assert(bus); + assert(reply); + assert(u); + + /* Three cases: a) u->cgroup_path is NULL, in which case the + * unit has no control group, which we report as the empty + * string. b) u->cgroup_path is the empty string, which + * indicates the root cgroup, which we report as "/". c) all + * other cases we report as-is. */ + + if (u->cgroup_path) + t = empty_to_root(u->cgroup_path); + + return sd_bus_message_append(reply, "s", t); +} + +static int append_process(sd_bus_message *reply, const char *p, pid_t pid, Set *pids) { + _cleanup_free_ char *buf = NULL, *cmdline = NULL; + int r; + + assert(reply); + assert(pid > 0); + + r = set_put(pids, PID_TO_PTR(pid)); + if (IN_SET(r, 0, -EEXIST)) + return 0; + if (r < 0) + return r; + + if (!p) { + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &buf); + if (r == -ESRCH) + return 0; + if (r < 0) + return r; + + p = buf; + } + + (void) get_process_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &cmdline); + + return sd_bus_message_append(reply, + "(sus)", + p, + (uint32_t) pid, + cmdline); +} + +static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(reply); + assert(p); + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, p, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + pid_t pid; + + r = cg_read_pid(f, &pid); + if (r < 0) + return r; + if (r == 0) + break; + + if (is_kernel_thread(pid) > 0) + continue; + + r = append_process(reply, p, pid, pids); + if (r 
< 0) + return r; + } + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, p, &d); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *g = NULL, *j = NULL; + + r = cg_read_subgroup(d, &g); + if (r < 0) + return r; + if (r == 0) + break; + + j = path_join(empty_to_root(p), g); + if (!j) + return -ENOMEM; + + r = append_cgroup(reply, j, pids); + if (r < 0) + return r; + } + + return 0; +} + +int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_set_free_ Set *pids = NULL; + Unit *u = userdata; + pid_t pid; + int r; + + assert(message); + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + pids = set_new(NULL); + if (!pids) + return -ENOMEM; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(sus)"); + if (r < 0) + return r; + + if (u->cgroup_path) { + r = append_cgroup(reply, u->cgroup_path, pids); + if (r < 0) + return r; + } + + /* The main and control pids might live outside of the cgroup, hence fetch them separately */ + pid = unit_main_pid(u); + if (pid > 0) { + r = append_process(reply, NULL, pid, pids); + if (r < 0) + return r; + } + + pid = unit_control_pid(u); + if (pid > 0) { + r = append_process(reply, NULL, pid, pids); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int property_get_ip_counter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + static const char *const table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "IPIngressBytes", + [CGROUP_IP_EGRESS_BYTES] = "IPEgressBytes", + [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets", + 
[CGROUP_IP_EGRESS_PACKETS] = "IPEgressPackets", + }; + + uint64_t value = UINT64_MAX; + Unit *u = userdata; + ssize_t metric; + + assert(bus); + assert(reply); + assert(property); + assert(u); + + assert_se((metric = string_table_lookup(table, ELEMENTSOF(table), property)) >= 0); + (void) unit_get_ip_accounting(u, metric, &value); + return sd_bus_message_append(reply, "t", value); +} + +static int property_get_io_counter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + static const char *const table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "IOReadBytes", + [CGROUP_IO_WRITE_BYTES] = "IOWriteBytes", + [CGROUP_IO_READ_OPERATIONS] = "IOReadOperations", + [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations", + }; + + uint64_t value = UINT64_MAX; + Unit *u = userdata; + ssize_t metric; + + assert(bus); + assert(reply); + assert(property); + assert(u); + + assert_se((metric = string_table_lookup(table, ELEMENTSOF(table), property)) >= 0); + (void) unit_get_io_accounting(u, metric, false, &value); + return sd_bus_message_append(reply, "t", value); +} + +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_set_free_ Set *pids = NULL; + Unit *u = userdata; + const char *path; + int r; + + assert(message); + + /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a + * specified cgroup path. Obviously this only works for units that actually maintain a cgroup + * representation. If a process is already in the cgroup no operation is executed – in this case the specified + * subcgroup path has no effect! 
*/ + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + path = empty_to_null(path); + if (path) { + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path); + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path); + } + + if (!unit_cgroup_delegate(u)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units."); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing."); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + for (;;) { + uid_t process_uid, sender_uid; + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + /* Filter out duplicates */ + if (set_contains(pids, PID_TO_PTR(pid))) + continue; + + /* Check if this process is suitable for attaching to this unit */ + r = unit_pid_attachable(u, pid, error); + if (r < 0) + return r; + + /* Let's query the sender's UID, so that we can make our security decisions */ + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + return r; + + /* Let's validate security: if the sender is root, then all is OK. 
If the sender is any other unit, + * then the process' UID and the target unit's UID have to match the sender's UID */ + if (sender_uid != 0 && sender_uid != getuid()) { + r = get_process_uid(pid, &process_uid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m"); + + if (process_uid != sender_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid); + if (process_uid != u->ref_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid); + } + + if (!pids) { + pids = set_new(NULL); + if (!pids) + return -ENOMEM; + } + + r = set_put(pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = unit_attach_pids_to_cgroup(u, pids, path); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m"); + + return sd_bus_reply_method_return(message, NULL); +} + +const sd_bus_vtable bus_unit_cgroup_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), + SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0), + SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0), + SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), + SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0), + SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), + SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IOReadBytes", "t", 
property_get_io_counter, 0, 0), + SD_BUS_PROPERTY("IOReadOperations", "t", property_get_io_counter, 0, 0), + SD_BUS_PROPERTY("IOWriteBytes", "t", property_get_io_counter, 0, 0), + SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0), + + SD_BUS_METHOD_WITH_NAMES("GetProcesses", + NULL,, + "a(sus)", + SD_BUS_PARAM(processes), + bus_unit_method_get_processes, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_METHOD_WITH_NAMES("AttachProcesses", + "sau", + SD_BUS_PARAM(subcgroup) + SD_BUS_PARAM(pids), + NULL,, + bus_unit_method_attach_processes, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +static int send_new_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(u); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitNew"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "so", u->id, p); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +static int send_changed_signal(sd_bus *bus, void *userdata) { + _cleanup_free_ char *p = NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(u); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + /* Send a properties changed signal. First for the specific + * type, then for the generic unit. The clients may rely on + * this order to get atomic behavior if needed. 
*/ + + r = sd_bus_emit_properties_changed_strv( + bus, p, + unit_dbus_interface_from_type(u->type), + NULL); + if (r < 0) + return r; + + return sd_bus_emit_properties_changed_strv( + bus, p, + "org.freedesktop.systemd1.Unit", + NULL); +} + +void bus_unit_send_change_signal(Unit *u) { + int r; + assert(u); + + if (u->in_dbus_queue) { + LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u); + u->in_dbus_queue = false; + } + + if (!u->id) + return; + + r = bus_foreach_bus(u->manager, u->bus_track, u->sent_dbus_new_signal ? send_changed_signal : send_new_signal, u); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to send unit change signal for %s: %m", u->id); + + u->sent_dbus_new_signal = true; +} + +void bus_unit_send_pending_change_signal(Unit *u, bool including_new) { + + /* Sends out any pending change signals, but only if they really are pending. This call is used when we are + * about to change state in order to force out a PropertiesChanged signal beforehand if there was one pending + * so that clients can follow the full state transition */ + + if (!u->in_dbus_queue) /* If not enqueued, don't bother */ + return; + + if (!u->sent_dbus_new_signal && !including_new) /* If the unit was never announced, don't bother, it's fine if + * the unit appears in the new state right-away (except if the + * caller explicitly asked us to send it anyway) */ + return; + + if (MANAGER_IS_RELOADING(u->manager)) /* Don't generate unnecessary PropertiesChanged signals for the same unit + * when we are reloading. 
*/ + return; + + bus_unit_send_change_signal(u); +} + +int bus_unit_send_pending_freezer_message(Unit *u) { + int r; + + assert(u); + + if (!u->pending_freezer_message) + return 0; + + r = sd_bus_send(NULL, u->pending_freezer_message, NULL); + if (r < 0) + log_warning_errno(r, "Failed to send queued message, ignoring: %m"); + + u->pending_freezer_message = sd_bus_message_unref(u->pending_freezer_message); + + return 0; +} + +static int send_removed_signal(sd_bus *bus, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *p = NULL; + Unit *u = userdata; + int r; + + assert(bus); + assert(u); + + p = unit_dbus_path(u); + if (!p) + return -ENOMEM; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "UnitRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "so", u->id, p); + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +void bus_unit_send_removed_signal(Unit *u) { + int r; + assert(u); + + if (!u->sent_dbus_new_signal || u->in_dbus_queue) + bus_unit_send_change_signal(u); + + if (!u->id) + return; + + r = bus_foreach_bus(u->manager, u->bus_track, send_removed_signal, u); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to send unit remove signal for %s: %m", u->id); +} + +int bus_unit_queue_job( + sd_bus_message *message, + Unit *u, + JobType type, + JobMode mode, + BusUnitQueueFlags flags, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *job_path = NULL, *unit_path = NULL; + _cleanup_set_free_ Set *affected = NULL; + Job *j, *a; + int r; + + assert(message); + assert(u); + assert(type >= 0 && type < _JOB_TYPE_MAX); + assert(mode >= 0 && mode < _JOB_MODE_MAX); + + r = mac_selinux_unit_access_check( + u, message, + job_type_to_access_method(type), + error); + if (r < 0) + return r; + + if (FLAGS_SET(flags, BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE) && 
unit_can_reload(u)) { + if (type == JOB_RESTART) + type = JOB_RELOAD_OR_START; + else if (type == JOB_TRY_RESTART) + type = JOB_TRY_RELOAD; + } + + if (type == JOB_STOP && + IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_ERROR, UNIT_BAD_SETTING) && + unit_active_state(u) == UNIT_INACTIVE) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", u->id); + + if ((type == JOB_START && u->refuse_manual_start) || + (type == JOB_STOP && u->refuse_manual_stop) || + (IN_SET(type, JOB_RESTART, JOB_TRY_RESTART) && (u->refuse_manual_start || u->refuse_manual_stop)) || + (type == JOB_RELOAD_OR_START && job_type_collapse(type, u) == JOB_START && u->refuse_manual_start)) + return sd_bus_error_setf(error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, unit %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id); + + if (FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) { + affected = set_new(NULL); + if (!affected) + return -ENOMEM; + } + + r = manager_add_job(u->manager, type, u, mode, affected, error, &j); + if (r < 0) + return r; + + r = bus_job_track_sender(j, message); + if (r < 0) + return r; + + /* Before we send the method reply, force out the announcement JobNew for this job */ + bus_job_send_pending_change_signal(j, true); + + job_path = job_dbus_path(j); + if (!job_path) + return -ENOMEM; + + /* The classic response is just a job object path */ + if (!FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) + return sd_bus_reply_method_return(message, "o", job_path); + + /* In verbose mode respond with the anchor job plus everything that has been affected */ + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + unit_path = unit_dbus_path(j->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "uosos", + j->id, job_path, + j->unit->id, unit_path, + job_type_to_string(j->type)); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', 
"(uosos)"); + if (r < 0) + return r; + + SET_FOREACH(a, affected) { + + if (a->id == j->id) + continue; + + /* Free paths from previous iteration */ + job_path = mfree(job_path); + unit_path = mfree(unit_path); + + job_path = job_dbus_path(a); + if (!job_path) + return -ENOMEM; + + unit_path = unit_dbus_path(a->unit); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(uosos)", + a->id, job_path, + a->unit->id, unit_path, + job_type_to_string(a->type)); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int bus_unit_set_live_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int r; + + assert(u); + assert(name); + assert(message); + + /* Handles setting properties both "live" (i.e. at any time during runtime), and during creation (for transient + * units that are being created). */ + + if (streq(name, "Description")) { + const char *d; + + r = sd_bus_message_read(message, "s", &d); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_set_description(u, d); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Description=%s", d); + } + + return 1; + } + + return 0; +} + +static int bus_set_transient_emergency_action( + Unit *u, + const char *name, + EmergencyAction *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *s; + EmergencyAction v; + int r; + bool system; + + assert(p); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + system = MANAGER_IS_SYSTEM(u->manager); + r = parse_emergency_action(s, system, &v); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + r == -EOPNOTSUPP ? 
"%s setting invalid for manager type: %s" + : "Invalid %s setting: %s", + name, s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, + "%s=%s", name, s); + } + + return 1; +} + +static int bus_set_transient_exit_status( + Unit *u, + const char *name, + int *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int32_t k; + int r; + + assert(p); + + r = sd_bus_message_read(message, "i", &k); + if (r < 0) + return r; + + if (k > 255) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Exit status must be in range 0…255 or negative."); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = k < 0 ? -1 : k; + + if (k < 0) + unit_write_settingf(u, flags, name, "%s=", name); + else + unit_write_settingf(u, flags, name, "%s=%i", name, k); + } + + return 1; +} + +static BUS_DEFINE_SET_TRANSIENT_PARSE(collect_mode, CollectMode, collect_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(job_mode, JobMode, job_mode_from_string); + +static int bus_set_transient_conditions( + Unit *u, + const char *name, + Condition **list, + bool is_condition, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *type_name, *param; + int trigger, negate, r; + bool empty = true; + + assert(list); + + r = sd_bus_message_enter_container(message, 'a', "(sbbs)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(sbbs)", &type_name, &trigger, &negate, ¶m)) > 0) { + ConditionType t; + + t = is_condition ? 
condition_type_from_string(type_name) : assert_type_from_string(type_name); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid condition type: %s", type_name); + + if (isempty(param)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Condition parameter in %s is empty", type_name); + + if (condition_takes_path(t) && !path_is_absolute(param)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in condition %s is not absolute: %s", type_name, param); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + Condition *c; + + c = condition_new(t, param, trigger, negate); + if (!c) + return -ENOMEM; + + LIST_PREPEND(conditions, *list, c); + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s%s%s", type_name, + trigger ? "|" : "", negate ? "!" : "", param); + } + + empty = false; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { + *list = condition_free_list(*list); + unit_write_settingf(u, flags, name, "%sNull=", is_condition ? "Condition" : "Assert"); + } + + return 1; +} + +static int bus_unit_set_transient_property( + Unit *u, + const char *name, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + UnitDependency d = _UNIT_DEPENDENCY_INVALID; + int r; + + assert(u); + assert(name); + assert(message); + + /* Handles settings when transient units are created. This settings cannot be altered anymore after the unit + * has been created. 
*/ + + if (streq(name, "SourcePath")) + return bus_set_transient_path(u, name, &u->source_path, message, flags, error); + + if (streq(name, "StopWhenUnneeded")) + return bus_set_transient_bool(u, name, &u->stop_when_unneeded, message, flags, error); + + if (streq(name, "RefuseManualStart")) + return bus_set_transient_bool(u, name, &u->refuse_manual_start, message, flags, error); + + if (streq(name, "RefuseManualStop")) + return bus_set_transient_bool(u, name, &u->refuse_manual_stop, message, flags, error); + + if (streq(name, "AllowIsolate")) + return bus_set_transient_bool(u, name, &u->allow_isolate, message, flags, error); + + if (streq(name, "DefaultDependencies")) + return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error); + + if (streq(name, "OnFailureJobMode")) + return bus_set_transient_job_mode(u, name, &u->on_failure_job_mode, message, flags, error); + + if (streq(name, "IgnoreOnIsolate")) + return bus_set_transient_bool(u, name, &u->ignore_on_isolate, message, flags, error); + + if (streq(name, "JobTimeoutUSec")) { + r = bus_set_transient_usec_fix_0(u, name, &u->job_timeout, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags) && !u->job_running_timeout_set) + u->job_running_timeout = u->job_timeout; + } + + if (streq(name, "JobRunningTimeoutUSec")) { + r = bus_set_transient_usec_fix_0(u, name, &u->job_running_timeout, message, flags, error); + if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) + u->job_running_timeout_set = true; + + return r; + } + + if (streq(name, "JobTimeoutAction")) + return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error); + + if (streq(name, "JobTimeoutRebootArgument")) + return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error); + + if (streq(name, "StartLimitIntervalUSec")) + return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error); + + if (streq(name, "StartLimitBurst")) + 
return bus_set_transient_unsigned(u, name, &u->start_ratelimit.burst, message, flags, error); + + if (streq(name, "StartLimitAction")) + return bus_set_transient_emergency_action(u, name, &u->start_limit_action, message, flags, error); + + if (streq(name, "FailureAction")) + return bus_set_transient_emergency_action(u, name, &u->failure_action, message, flags, error); + + if (streq(name, "SuccessAction")) + return bus_set_transient_emergency_action(u, name, &u->success_action, message, flags, error); + + if (streq(name, "FailureActionExitStatus")) + return bus_set_transient_exit_status(u, name, &u->failure_action_exit_status, message, flags, error); + + if (streq(name, "SuccessActionExitStatus")) + return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error); + + if (streq(name, "RebootArgument")) + return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error); + + if (streq(name, "CollectMode")) + return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error); + + if (streq(name, "Conditions")) + return bus_set_transient_conditions(u, name, &u->conditions, true, message, flags, error); + + if (streq(name, "Asserts")) + return bus_set_transient_conditions(u, name, &u->asserts, false, message, flags, error); + + if (streq(name, "Documentation")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + if (!documentation_url_is_valid(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid URL in %s: %s", name, *p); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (strv_isempty(l)) { + u->documentation = strv_free(u->documentation); + unit_write_settingf(u, flags, name, "%s=", name); + } else { + strv_extend_strv(&u->documentation, l, false); + + STRV_FOREACH(p, l) + unit_write_settingf(u, flags, name, "%s=%s", name, *p); + } + } + + return 1; + + } else if (streq(name, 
"Slice")) { + Unit *slice; + const char *s; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups."); + if (u->type == UNIT_SLICE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units."); + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope"); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!unit_name_is_valid(s, UNIT_NAME_PLAIN)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name '%s'", s); + + /* Note that we do not dispatch the load queue here yet, as we don't want our own transient unit to be + * loaded while we are still setting it up. Or in other words, we use manager_load_unit_prepare() + * instead of manager_load_unit() on purpose, here. */ + r = manager_load_unit_prepare(u->manager, s, NULL, error, &slice); + if (r < 0) + return r; + + if (slice->type != UNIT_SLICE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit name '%s' is not a slice", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_set_slice(u, slice); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_PRIVATE, name, "Slice=%s", s); + } + + return 1; + + } else if (streq(name, "RequiresMountsFor")) { + _cleanup_strv_free_ char **l = NULL; + char **p; + + r = sd_bus_message_read_strv(message, &l); + if (r < 0) + return r; + + STRV_FOREACH(p, l) { + path_simplify(*p, true); + + if (!path_is_absolute(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not absolute: %s", name, *p); + + if (!path_is_valid(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s has invalid length: %s", name, *p); + + if (!path_is_normalized(*p)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path 
specified in %s is not normalized: %s", name, *p); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p); + + unit_write_settingf(u, flags, name, "%s=%s", name, *p); + } + } + + return 1; + } + + if (streq(name, "RequiresOverridable")) + d = UNIT_REQUIRES; /* redirect for obsolete unit dependency type */ + else if (streq(name, "RequisiteOverridable")) + d = UNIT_REQUISITE; /* same here */ + else + d = unit_dependency_from_string(name); + + if (d >= 0) { + const char *other; + + if (!IN_SET(d, + UNIT_REQUIRES, + UNIT_REQUISITE, + UNIT_WANTS, + UNIT_BINDS_TO, + UNIT_PART_OF, + UNIT_CONFLICTS, + UNIT_BEFORE, + UNIT_AFTER, + UNIT_ON_FAILURE, + UNIT_PROPAGATES_RELOAD_TO, + UNIT_RELOAD_PROPAGATED_FROM, + UNIT_JOINS_NAMESPACE_OF)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Dependency type %s may not be created transiently.", unit_dependency_to_string(d)); + + r = sd_bus_message_enter_container(message, 'a', "s"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "s", &other)) > 0) { + if (!unit_name_is_valid(other, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name %s", other); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *label = NULL; + + r = unit_add_dependency_by_name(u, d, other, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + label = strjoin(name, "-", other); + if (!label) + return -ENOMEM; + + unit_write_settingf(u, flags, label, "%s=%s", unit_dependency_to_string(d), other); + } + + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + return 1; + + } else if (streq(name, "AddRef")) { + + int b; + + /* Why is this called "AddRef" rather than just "Ref", or "Reference"? 
/* Applies an array of (name, value) property assignments read from 'message' to unit 'u'.
 *
 * The message must be positioned at an "a(sv)" array. Each entry is offered, in this order, to the
 * unit type's bus_set_property() hook, then (only while the unit is transient and still a stub) to
 * bus_unit_set_transient_property(), then to bus_unit_set_live_property(). The first handler returning
 * non-zero wins; if all return 0 the call fails with SD_BUS_ERROR_PROPERTY_READ_ONLY.
 *
 * If 'commit' is true and at least one property was applied, the type's bus_commit_properties() hook
 * (if any) is invoked afterwards.
 *
 * Returns the number of properties applied during the "for real" pass (0 if 'flags' is a no-op, since
 * then only the validation pass runs), or a negative errno-style error. */
int bus_unit_set_properties(
                Unit *u,
                sd_bus_message *message,
                UnitWriteFlags flags,
                bool commit,
                sd_bus_error *error) {

        bool for_real = false;
        unsigned n = 0;
        int r;

        assert(u);
        assert(message);

        /* We iterate through the array twice. First run we just check
         * if all passed data is valid, second run actually applies
         * it. This is to implement transaction-like behaviour without
         * actually providing full transactions. */

        r = sd_bus_message_enter_container(message, 'a', "(sv)");
        if (r < 0)
                return r;

        for (;;) {
                const char *name;
                UnitWriteFlags f;

                r = sd_bus_message_enter_container(message, 'r', "sv");
                if (r < 0)
                        return r;
                if (r == 0) {
                        /* End of the array: stop if this already was the applying pass, or if the caller
                         * only asked for validation in the first place. */
                        if (for_real || UNIT_WRITE_FLAGS_NOOP(flags))
                                break;

                        /* Reached EOF. Let's try again, and this time for realz... */
                        r = sd_bus_message_rewind(message, false);
                        if (r < 0)
                                return r;

                        for_real = true;
                        continue;
                }

                r = sd_bus_message_read(message, "s", &name);
                if (r < 0)
                        return r;

                if (!UNIT_VTABLE(u)->bus_set_property)
                        return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY, "Objects of this type do not support setting properties.");

                r = sd_bus_message_enter_container(message, 'v', NULL);
                if (r < 0)
                        return r;

                /* If not for real, then mask out the two target flags */
                /* NOTE(review): presumably clearing both flags is what makes UNIT_WRITE_FLAGS_NOOP() true
                 * for the handlers, so they only validate -- the macro is not visible here, confirm. */
                f = for_real ? flags : (flags & ~(UNIT_RUNTIME|UNIT_PERSISTENT));

                /* Handlers return 0 for "not my property", which passes the entry on to the next one */
                r = UNIT_VTABLE(u)->bus_set_property(u, name, message, f, error);
                if (r == 0 && u->transient && u->load_state == UNIT_STUB)
                        r = bus_unit_set_transient_property(u, name, message, f, error);
                if (r == 0)
                        r = bus_unit_set_live_property(u, name, message, f, error);
                if (r < 0)
                        return r;

                if (r == 0)
                        return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY, "Cannot set property %s, or unknown property.", name);

                /* Leave the variant ... */
                r = sd_bus_message_exit_container(message);
                if (r < 0)
                        return r;

                /* ... and the (sv) struct */
                r = sd_bus_message_exit_container(message);
                if (r < 0)
                        return r;

                /* Only entries handled in the applying pass are counted */
                n += for_real;
        }

        r = sd_bus_message_exit_container(message);
        if (r < 0)
                return r;

        if (commit && n > 0 && UNIT_VTABLE(u)->bus_commit_properties)
                UNIT_VTABLE(u)->bus_commit_properties(u);

        return n;
}
*/ + + switch (u->load_state) { + + case UNIT_LOADED: + return 0; + + case UNIT_NOT_FOUND: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not found.", u->id); + + case UNIT_BAD_SETTING: + return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, "Unit %s has a bad unit file setting.", u->id); + + case UNIT_ERROR: /* Only show .load_error in UNIT_ERROR state */ + return sd_bus_error_set_errnof(error, u->load_error, "Unit %s failed to load properly: %m.", u->id); + + case UNIT_MASKED: + return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit %s is masked.", u->id); + + case UNIT_STUB: + case UNIT_MERGED: + default: + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unexpected load state of unit %s", u->id); + } +} + +static int bus_unit_track_handler(sd_bus_track *t, void *userdata) { + Unit *u = userdata; + + assert(t); + assert(u); + + u->bus_track = sd_bus_track_unref(u->bus_track); /* make sure we aren't called again */ + + /* If the client that tracks us disappeared, then there's reason to believe that the cgroup is empty now too, + * let's see */ + unit_add_to_cgroup_empty_queue(u); + + /* Also add the unit to the GC queue, after all if the client left it might be time to GC this unit */ + unit_add_to_gc_queue(u); + + return 0; +} + +static int bus_unit_allocate_bus_track(Unit *u) { + int r; + + assert(u); + + if (u->bus_track) + return 0; + + r = sd_bus_track_new(u->manager->api_bus, &u->bus_track, bus_unit_track_handler, u); + if (r < 0) + return r; + + r = sd_bus_track_set_recursive(u->bus_track, true); + if (r < 0) { + u->bus_track = sd_bus_track_unref(u->bus_track); + return r; + } + + return 0; +} + +int bus_unit_track_add_name(Unit *u, const char *name) { + int r; + + assert(u); + + r = bus_unit_allocate_bus_track(u); + if (r < 0) + return r; + + return sd_bus_track_add_name(u->bus_track, name); +} + +int bus_unit_track_add_sender(Unit *u, sd_bus_message *m) { + int r; + + assert(u); + + r = 
bus_unit_allocate_bus_track(u); + if (r < 0) + return r; + + return sd_bus_track_add_sender(u->bus_track, m); +} + +int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m) { + assert(u); + + /* If we haven't allocated the bus track object yet, then there's definitely no reference taken yet, return an + * error */ + if (!u->bus_track) + return -EUNATCH; + + return sd_bus_track_remove_sender(u->bus_track, m); +} diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h new file mode 100644 index 0000000..1da3cfe --- /dev/null +++ b/src/core/dbus-unit.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "unit.h" + +extern const sd_bus_vtable bus_unit_vtable[]; +extern const sd_bus_vtable bus_unit_cgroup_vtable[]; + +void bus_unit_send_change_signal(Unit *u); +void bus_unit_send_pending_change_signal(Unit *u, bool including_new); +int bus_unit_send_pending_freezer_message(Unit *u); +void bus_unit_send_removed_signal(Unit *u); + +int bus_unit_method_start_generic(sd_bus_message *message, Unit *u, JobType job_type, bool reload_if_possible, sd_bus_error *error); +int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error); +int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_unref(sd_bus_message *message, void *userdata, 
sd_bus_error *error); +int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error); + +typedef enum BusUnitQueueFlags { + BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE = 1 << 0, + BUS_UNIT_QUEUE_VERBOSE_REPLY = 1 << 1, +} BusUnitQueueFlags; + +int bus_unit_queue_job(sd_bus_message *message, Unit *u, JobType type, JobMode mode, BusUnitQueueFlags flags, sd_bus_error *error); +int bus_unit_validate_load_state(Unit *u, sd_bus_error *error); + +int bus_unit_track_add_name(Unit *u, const char *name); +int bus_unit_track_add_sender(Unit *u, sd_bus_message *m); +int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m); diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c new file mode 100644 index 0000000..d6223db --- /dev/null +++ b/src/core/dbus-util.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-util.h" +#include "dbus-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "unit-printf.h" +#include "user-util.h" +#include "unit.h" + +int bus_property_get_triggered_unit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata, *trigger; + + assert(bus); + assert(reply); + assert(u); + + trigger = UNIT_TRIGGER(u); + + return sd_bus_message_append(reply, "s", trigger ? 
trigger->id : NULL); +} + +BUS_DEFINE_SET_TRANSIENT(mode_t, "u", uint32_t, mode_t, "%040o"); +BUS_DEFINE_SET_TRANSIENT(unsigned, "u", uint32_t, unsigned, "%" PRIu32); + +static inline bool valid_user_group_name_or_id_relaxed(const char *u) { + return valid_user_group_name(u, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX); +} + +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed); +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute); + +int bus_set_transient_string( + Unit *u, + const char *name, + char **p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = free_and_strdup(p, empty_to_null(v)); + if (r < 0) + return r; + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "%s=%s", name, strempty(v)); + } + + return 1; +} + +int bus_set_transient_bool( + Unit *u, + const char *name, + bool *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int v, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v; + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v)); + } + + return 1; +} + +int bus_set_transient_percent( + Unit *u, + const char *name, + int *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + r = parse_percent(v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = r; + unit_write_settingf(u, flags, name, "%s=%d%%", name, r); + } + + return 1; +} + +int bus_set_transient_usec_internal( + Unit *u, + const char *name, + usec_t *p, + bool fix_0, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + 
/* Defines bus_set_transient_<function>(), a transient property setter for plain values.
 *
 * The generated function reads one value of D-Bus type 'bus_type' from the message into a local of
 * C type 'type'. Unless the write flags are a no-op, the value is cast to 'cast_type' and stored in
 * *p, and the setting is written out via unit_write_settingf() as "<name>=<value>", with the value
 * formatted by the printf conversion 'fmt'.
 *
 * The generated function returns 1 on success, or the negative result of sd_bus_message_read(). */
#define BUS_DEFINE_SET_TRANSIENT(function, bus_type, type, cast_type, fmt) \
        int bus_set_transient_##function(                               \
                        Unit *u,                                        \
                        const char *name,                               \
                        cast_type *p,                                   \
                        sd_bus_message *message,                        \
                        UnitWriteFlags flags,                           \
                        sd_bus_error *error) {                          \
                                                                        \
                type v;                                                 \
                int r;                                                  \
                                                                        \
                assert(p);                                              \
                                                                        \
                r = sd_bus_message_read(message, bus_type, &v);         \
                if (r < 0)                                              \
                        return r;                                       \
                                                                        \
                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
                        *p = (cast_type) v;                             \
                        unit_write_settingf(u, flags, name,             \
                                            "%s=" fmt, name, v);        \
                }                                                       \
                                                                        \
                return 1;                                               \
        }
(cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=" fmt, name, v); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_TO_STRING(function, bus_type, type, cast_type, fmt, to_string) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + s = to_string(v); \ + if (!s) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(function, bus_type, type, cast_type, fmt, to_string) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + cast_type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + _cleanup_free_ char *s = NULL; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, bus_type, &v); \ + if (r < 0) \ + return r; \ + \ + r = to_string(v, &s); \ + if (r == -EINVAL) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: " fmt, name, v); \ + if (r < 0) \ + return r; \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = (cast_type) v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", \ + name, strempty(s)); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_PARSE(function, type, parse) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &s); \ + if 
(r < 0) \ + return r; \ + \ + v = parse(s); \ + if (v < 0) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, s); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, s); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(function, type, parse) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + type *p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *s; \ + type v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &s); \ + if (r < 0) \ + return r; \ + \ + r = parse(s, &v); \ + if (r < 0) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, s); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + *p = v; \ + unit_write_settingf(u, flags, name, \ + "%s=%s", name, strempty(s)); \ + } \ + \ + return 1; \ + } + +#define BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(function, check) \ + int bus_set_transient_##function( \ + Unit *u, \ + const char *name, \ + char **p, \ + sd_bus_message *message, \ + UnitWriteFlags flags, \ + sd_bus_error *error) { \ + \ + const char *v; \ + int r; \ + \ + assert(p); \ + \ + r = sd_bus_message_read(message, "s", &v); \ + if (r < 0) \ + return r; \ + \ + if (!isempty(v) && !check(v)) \ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \ + "Invalid %s setting: %s", name, v); \ + \ + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \ + r = free_and_strdup(p, empty_to_null(v)); \ + if (r < 0) \ + return r; \ + \ + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, \ + "%s=%s", name, strempty(v)); \ + } \ + \ + return 1; \ + } + +int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, 
UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_percent(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); +} +static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, true, message, flags, error); +} diff --git a/src/core/dbus.c b/src/core/dbus.c new file mode 100644 index 0000000..3e435c9 --- /dev/null +++ b/src/core/dbus.c @@ -0,0 +1,1250 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <sys/epoll.h> +#include <unistd.h> + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "dbus-automount.h" +#include "dbus-cgroup.h" +#include "dbus-device.h" +#include "dbus-execute.h" +#include "dbus-job.h" +#include "dbus-kill.h" +#include "dbus-manager.h" +#include 
"dbus-mount.h" +#include "dbus-path.h" +#include "dbus-scope.h" +#include "dbus-service.h" +#include "dbus-slice.h" +#include "dbus-socket.h" +#include "dbus-swap.h" +#include "dbus-target.h" +#include "dbus-timer.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "mkdir.h" +#include "process-util.h" +#include "selinux-access.h" +#include "serialize.h" +#include "service.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "user-util.h" + +#define CONNECTIONS_MAX 4096 + +static void destroy_bus(Manager *m, sd_bus **bus); + +int bus_send_pending_reload_message(Manager *m) { + int r; + + assert(m); + + if (!m->pending_reload_message) + return 0; + + /* If we cannot get rid of this message we won't dispatch any D-Bus messages, so that we won't end up wanting + * to queue another message. */ + + r = sd_bus_send(NULL, m->pending_reload_message, NULL); + if (r < 0) + log_warning_errno(r, "Failed to send queued message, ignoring: %m"); + + m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message); + + return 0; +} + +int bus_forward_agent_released(Manager *m, const char *path) { + int r; + + assert(m); + assert(path); + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (!m->system_bus) + return 0; + + /* If we are running a system instance we forward the agent message on the system bus, so that the user + * instances get notified about this, too */ + + r = sd_bus_emit_signal(m->system_bus, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + "s", path); + if (r < 0) + return log_debug_errno(r, "Failed to propagate agent release message: %m"); + + return 1; +} + +static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Manager *m = userdata; + const char *cgroup; + uid_t sender_uid; + int r; + + 
assert(message); + assert(m); + + /* only accept org.freedesktop.systemd1.Agent from UID=0 */ + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0 || sender_uid != 0) + return 0; + + /* parse 'cgroup-empty' notification */ + r = sd_bus_message_read(message, "s", &cgroup); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + manager_notify_cgroup_empty(m, cgroup); + return 0; +} + +static int signal_disconnected(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + sd_bus *bus; + + assert(message); + assert(m); + assert_se(bus = sd_bus_message_get_bus(message)); + + if (bus == m->api_bus) + bus_done_api(m); + if (bus == m->system_bus) + bus_done_system(m); + + if (set_remove(m->private_buses, bus)) { + log_debug("Got disconnect on private connection."); + destroy_bus(m, &bus); + } + + return 0; +} + +static int signal_activation_request(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SERVICE) || + manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SOCKET)) { + r = sd_bus_error_setf(&error, BUS_ERROR_SHUTTING_DOWN, "Refusing activation, D-Bus is shutting down."); + goto failed; + } + + r = manager_load_unit(m, name, NULL, &error, &u); + if (r < 0) + goto failed; + + if (u->refuse_manual_start) { + r = sd_bus_error_setf(&error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id); + goto failed; + } + + r = 
manager_add_job(m, JOB_START, u, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto failed; + + /* Successfully queued, that's it for us */ + return 0; + +failed: + if (!sd_bus_error_is_set(&error)) + sd_bus_error_set_errno(&error, r); + + log_debug("D-Bus activation failed for %s: %s", name, bus_error_message(&error, r)); + + r = sd_bus_message_new_signal(sd_bus_message_get_bus(message), &reply, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Activator", "ActivationFailure"); + if (r < 0) { + bus_log_create_error(r); + return 0; + } + + r = sd_bus_message_append(reply, "sss", name, error.name, error.message); + if (r < 0) { + bus_log_create_error(r); + return 0; + } + + r = sd_bus_send_to(NULL, reply, "org.freedesktop.DBus", NULL); + if (r < 0) + return log_error_errno(r, "Failed to respond with to bus activation request: %m"); + + return 0; +} + +#if HAVE_SELINUX +static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *verb, *path; + Unit *u = NULL; + Job *j; + int r; + + assert(message); + + /* Our own method calls are all protected individually with + * selinux checks, but the built-in interfaces need to be + * protected too. 
*/ + + if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", "Set")) + verb = "reload"; + else if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Introspectable", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.ObjectManager", NULL) || + sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Peer", NULL)) + verb = "status"; + else + return 0; + + path = sd_bus_message_get_path(message); + + if (object_path_startswith("/org/freedesktop/systemd1", path)) { + r = mac_selinux_access_check(message, verb, error); + if (r < 0) + return r; + + return 0; + } + + if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return 0; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return 0; + + u = manager_get_unit_by_pid(m, pid); + } else { + r = manager_get_job_from_dbus_path(m, path, &j); + if (r >= 0) + u = j->unit; + else + manager_load_unit_from_dbus_path(m, path, NULL, &u); + } + if (!u) + return 0; + + r = mac_selinux_unit_access_check(u, message, verb, error); + if (r < 0) + return r; + + return 0; +} +#endif + +static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_bus_error *error) { + Unit *u = NULL; /* just to appease gcc, initialization is not really necessary */ + int r; + + assert(m); + assert(bus); + assert(path); + + if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + sd_bus_message *message; + pid_t pid; + + message = sd_bus_get_current_message(bus); + if (!message) + return 0; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u 
= manager_get_unit_by_pid(m, pid); + if (!u) + return 0; + } else { + r = manager_load_unit_from_dbus_path(m, path, error, &u); + if (r < 0) + return 0; + assert(u); + } + + *unit = u; + return 1; +} + +static int bus_unit_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + return find_unit(m, bus, path, (Unit**) found, error); +} + +static int bus_unit_interface_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + *found = u; + return 1; +} + +static int bus_unit_cgroup_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + *found = u; + return 1; +} + +static int bus_cgroup_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + CGroupContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + *found = c; + return 
1; +} + +static int bus_exec_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + ExecContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_exec_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_kill_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = userdata; + KillContext *c; + Unit *u; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + assert(m); + + r = find_unit(m, bus, path, &u, error); + if (r <= 0) + return r; + + if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type))) + return 0; + + c = unit_get_kill_context(u); + if (!c) + return 0; + + *found = c; + return 1; +} + +static int bus_unit_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + unsigned k = 0; + Unit *u; + + l = new0(char*, hashmap_size(m->units)+1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(u, m->units) { + l[k] = unit_dbus_path(u); + if (!l[k]) + return -ENOMEM; + + k++; + } + + *nodes = TAKE_PTR(l); + + return k; +} + +static const BusObjectImplementation unit_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Unit", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_unit_vtable, bus_unit_find }), + .node_enumerator = bus_unit_enumerate, +}; + +static const BusObjectImplementation bus_automount_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Automount", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_automount_vtable, bus_unit_interface_find }), +}; 
/* Bus object description for mount units. Besides the mount-specific vtable it attaches the unit
 * cgroup, cgroup context, exec context and kill context vtables as fallback vtables, each paired
 * with the finder callback that resolves the object path to the data that vtable operates on.
 * NOTE(review): the two leading strings are presumably the object path prefix and the interface
 * name -- the BusObjectImplementation definition is not visible here, confirm. */
static const BusObjectImplementation bus_mount_object = {
        "/org/freedesktop/systemd1/unit",
        "org.freedesktop.systemd1.Mount",
        .fallback_vtables = BUS_FALLBACK_VTABLES(
                        { bus_mount_vtable,       bus_unit_interface_find },
                        { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
                        { bus_cgroup_vtable,      bus_cgroup_context_find },
                        { bus_exec_vtable,        bus_exec_context_find },
                        { bus_kill_vtable,        bus_kill_context_find }),
};
bus_cgroup_context_find }), +}; + +static const BusObjectImplementation bus_socket_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Socket", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_socket_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_exec_vtable, bus_exec_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_swap_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Swap", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_swap_vtable, bus_unit_interface_find }, + { bus_unit_cgroup_vtable, bus_unit_cgroup_find }, + { bus_cgroup_vtable, bus_cgroup_context_find }, + { bus_exec_vtable, bus_exec_context_find }, + { bus_kill_vtable, bus_kill_context_find }), +}; + +static const BusObjectImplementation bus_target_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Target", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_target_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_timer_object = { + "/org/freedesktop/systemd1/unit", + "org.freedesktop.systemd1.Timer", + .fallback_vtables = BUS_FALLBACK_VTABLES( + { bus_timer_vtable, bus_unit_interface_find }), +}; + +static const BusObjectImplementation bus_manager_object = { + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + .vtables = BUS_VTABLES(bus_manager_vtable), + .children = BUS_IMPLEMENTATIONS( + &job_object, + &unit_object, + &bus_automount_object, + &bus_device_object, + &bus_mount_object, + &bus_path_object, + &bus_scope_object, + &bus_service_object, + &bus_slice_object, + &bus_socket_object, + &bus_swap_object, + &bus_target_object, + &bus_timer_object), +}; + +static const BusObjectImplementation manager_log_control_object = { + "/org/freedesktop/LogControl1", + "org.freedesktop.LogControl1", + .vtables = 
BUS_VTABLES(bus_manager_log_control_vtable), +}; + +int bus_manager_introspect_implementations(FILE *out, const char *pattern) { + return bus_introspect_implementations( + out, + pattern, + BUS_IMPLEMENTATIONS(&bus_manager_object, + &manager_log_control_object)); +} + +static int bus_setup_api_vtables(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + +#if HAVE_SELINUX + r = sd_bus_add_filter(bus, NULL, mac_selinux_filter, m); + if (r < 0) + return log_error_errno(r, "Failed to add SELinux access filter: %m"); +#endif + + r = bus_add_implementation(bus, &bus_manager_object, m); + if (r < 0) + return r; + + return bus_add_implementation(bus, &manager_log_control_object, m); +} + +static int bus_setup_disconnected_match(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.DBus.Local", + "/org/freedesktop/DBus/Local", + "org.freedesktop.DBus.Local", + "Disconnected", + signal_disconnected, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for Disconnected message: %m"); + + return 0; +} + +static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int nfd = -1; + Manager *m = userdata; + sd_id128_t id; + int r; + + assert(s); + assert(m); + + nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (nfd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + log_warning_errno(errno, "Failed to accept private connection, ignoring: %m"); + return 0; + } + + if (set_size(m->private_buses) >= CONNECTIONS_MAX) { + log_warning("Too many concurrent connections, refusing"); + return 0; + } + + r = set_ensure_allocated(&m->private_buses, NULL); + if (r < 0) { + log_oom(); + return 0; + } + + r = sd_bus_new(&bus); + if (r < 0) { + log_warning_errno(r, "Failed to allocate new private connection bus: %m"); + return 0; + } + + (void) 
sd_bus_set_description(bus, "private-bus-connection"); + + r = sd_bus_set_fd(bus, nfd, nfd); + if (r < 0) { + log_warning_errno(r, "Failed to set fd on new connection bus: %m"); + return 0; + } + + nfd = -1; + + r = bus_check_peercred(bus); + if (r < 0) { + log_warning_errno(r, "Incoming private connection from unprivileged client, refusing: %m"); + return 0; + } + + assert_se(sd_id128_randomize(&id) >= 0); + + r = sd_bus_set_server(bus, 1, id); + if (r < 0) { + log_warning_errno(r, "Failed to enable server support for new connection bus: %m"); + return 0; + } + + r = sd_bus_negotiate_creds(bus, 1, + SD_BUS_CREDS_PID|SD_BUS_CREDS_UID| + SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT); + if (r < 0) { + log_warning_errno(r, "Failed to enable credentials for new connection: %m"); + return 0; + } + + r = sd_bus_set_sender(bus, "org.freedesktop.systemd1"); + if (r < 0) { + log_warning_errno(r, "Failed to set direct connection sender: %m"); + return 0; + } + + r = sd_bus_start(bus); + if (r < 0) { + log_warning_errno(r, "Failed to start new connection bus: %m"); + return 0; + } + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) { + log_warning_errno(r, "Failed to attach new connection bus to event loop: %m"); + return 0; + } + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return 0; + + r = bus_setup_api_vtables(m, bus); + if (r < 0) { + log_warning_errno(r, "Failed to set up API vtables on new connection bus: %m"); + return 0; + } + + r = set_put(m->private_buses, bus); + if (r < 0) { + log_warning_errno(r, "Failed to add new connection bus to set: %m"); + return 0; + } + + bus = NULL; + + log_debug("Accepted new private connection."); + + return 0; +} + +static int bus_setup_api(Manager *m, sd_bus *bus) { + char *name; + Unit *u; + int r; + + assert(m); + assert(bus); + + /* Let's make sure we have enough credential bits so that we can make security and selinux decisions */ + r = 
sd_bus_negotiate_creds(bus, 1, + SD_BUS_CREDS_PID|SD_BUS_CREDS_UID| + SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT); + if (r < 0) + log_warning_errno(r, "Failed to enable credential passing, ignoring: %m"); + + r = bus_setup_api_vtables(m, bus); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(u, name, m->watch_bus) { + r = unit_install_bus_match(u, bus, name); + if (r < 0) + log_error_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name); + } + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.systemd1.Activator", + "ActivationRequest", + signal_activation_request, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to subscribe to activation signal: %m"); + + /* Allow replacing of our name, to ease implementation of reexecution, where we keep the old connection open + * until after the new connection is set up and the name installed to allow clients to synchronously wait for + * reexecution to finish */ + r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.systemd1", SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_ALLOW_REPLACEMENT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + log_debug("Successfully connected to API bus."); + + return 0; +} + +int bus_init_api(Manager *m) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + if (m->api_bus) + return 0; + + /* The API and system bus is the same if we are running in system mode */ + if (MANAGER_IS_SYSTEM(m) && m->system_bus) + bus = sd_bus_ref(m->system_bus); + else { + if (MANAGER_IS_SYSTEM(m)) + r = sd_bus_open_system_with_description(&bus, "bus-api-system"); + else + r = sd_bus_open_user_with_description(&bus, "bus-api-user"); + if (r < 0) + return log_error_errno(r, "Failed to connect to API bus: %m"); + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to 
attach API bus to event loop: %m"); + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return r; + } + + r = bus_setup_api(m, bus); + if (r < 0) + return log_error_errno(r, "Failed to set up API bus: %m"); + + m->api_bus = TAKE_PTR(bus); + + return 0; +} + +static int bus_setup_system(Manager *m, sd_bus *bus) { + int r; + + assert(m); + assert(bus); + + /* if we are a user instance we get the Released message via the system bus */ + if (MANAGER_IS_USER(m)) { + r = sd_bus_match_signal_async( + bus, + NULL, + NULL, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + signal_agent_released, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to request Released match on system bus: %m"); + } + + log_debug("Successfully connected to system bus."); + return 0; +} + +int bus_init_system(Manager *m) { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + if (m->system_bus) + return 0; + + /* The API and system bus is the same if we are running in system mode */ + if (MANAGER_IS_SYSTEM(m) && m->api_bus) + bus = sd_bus_ref(m->api_bus); + else { + r = sd_bus_open_system_with_description(&bus, "bus-system"); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach system bus to event loop: %m"); + + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return r; + } + + r = bus_setup_system(m, bus); + if (r < 0) + return log_error_errno(r, "Failed to set up system bus: %m"); + + m->system_bus = TAKE_PTR(bus); + + return 0; +} + +int bus_init_private(Manager *m) { + _cleanup_close_ int fd = -1; + union sockaddr_union sa; + socklen_t sa_len; + sd_event_source *s; + int r; + + assert(m); + + if (m->private_listen_fd >= 0) + return 0; + + if (MANAGER_IS_SYSTEM(m)) { + + /* We want the private bus only when running as init */ + if (getpid_cached() != 1) + return 0; + + 
r = sockaddr_un_set_path(&sa.un, "/run/systemd/private"); + } else { + const char *e, *joined; + + e = secure_getenv("XDG_RUNTIME_DIR"); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "XDG_RUNTIME_DIR is not set, refusing."); + + joined = strjoina(e, "/systemd/private"); + r = sockaddr_un_set_path(&sa.un, joined); + } + if (r < 0) + return log_error_errno(r, "Can't set path for AF_UNIX socket to bind to: %m"); + sa_len = r; + + (void) mkdir_parents_label(sa.un.sun_path, 0755); + (void) sockaddr_un_unlink(&sa.un); + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate private socket: %m"); + + r = bind(fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "Failed to bind private socket: %m"); + + r = listen(fd, SOMAXCONN); + if (r < 0) + return log_error_errno(errno, "Failed to make private socket listening: %m"); + + /* Generate an inotify event in case somebody waits for this socket to appear using inotify() */ + (void) touch(sa.un.sun_path); + + r = sd_event_add_io(m->event, &s, fd, EPOLLIN, bus_on_connection, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source: %m"); + + (void) sd_event_source_set_description(s, "bus-connection"); + + m->private_listen_fd = TAKE_FD(fd); + m->private_listen_event_source = s; + + log_debug("Successfully created private D-Bus server."); + + return 0; +} + +static void destroy_bus(Manager *m, sd_bus **bus) { + Unit *u; + Job *j; + + assert(m); + assert(bus); + + if (!*bus) + return; + + /* Make sure all bus slots watching names are released. 
*/ + HASHMAP_FOREACH(u, m->watch_bus) { + if (u->match_bus_slot && sd_bus_slot_get_bus(u->match_bus_slot) == *bus) + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + if (u->get_name_owner_slot && sd_bus_slot_get_bus(u->get_name_owner_slot) == *bus) + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); + } + + /* Get rid of tracked clients on this bus */ + if (m->subscribed && sd_bus_track_get_bus(m->subscribed) == *bus) + m->subscribed = sd_bus_track_unref(m->subscribed); + + HASHMAP_FOREACH(j, m->jobs) + if (j->bus_track && sd_bus_track_get_bus(j->bus_track) == *bus) + j->bus_track = sd_bus_track_unref(j->bus_track); + + HASHMAP_FOREACH(u, m->units) { + if (u->bus_track && sd_bus_track_get_bus(u->bus_track) == *bus) + u->bus_track = sd_bus_track_unref(u->bus_track); + + /* Get rid of pending freezer messages on this bus */ + if (u->pending_freezer_message && sd_bus_message_get_bus(u->pending_freezer_message) == *bus) + u->pending_freezer_message = sd_bus_message_unref(u->pending_freezer_message); + } + + /* Get rid of queued message on this bus */ + if (m->pending_reload_message && sd_bus_message_get_bus(m->pending_reload_message) == *bus) + m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message); + + /* Possibly flush unwritten data, but only if we are + * unprivileged, since we don't want to sync here */ + if (!MANAGER_IS_SYSTEM(m)) + sd_bus_flush(*bus); + + /* And destroy the object */ + *bus = sd_bus_close_unref(*bus); +} + +void bus_done_api(Manager *m) { + destroy_bus(m, &m->api_bus); +} + +void bus_done_system(Manager *m) { + destroy_bus(m, &m->system_bus); +} + +void bus_done_private(Manager *m) { + sd_bus *b; + + assert(m); + + while ((b = set_steal_first(m->private_buses))) + destroy_bus(m, &b); + + m->private_buses = set_free(m->private_buses); + + m->private_listen_event_source = sd_event_source_unref(m->private_listen_event_source); + m->private_listen_fd = safe_close(m->private_listen_fd); +} + +void 
bus_done(Manager *m) { + assert(m); + + bus_done_api(m); + bus_done_system(m); + bus_done_private(m); + + assert(!m->subscribed); + + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + bus_verify_polkit_async_registry_free(m->polkit_registry); +} + +int bus_fdset_add_all(Manager *m, FDSet *fds) { + sd_bus *b; + int fd; + + assert(m); + assert(fds); + + /* When we are about to reexecute we add all D-Bus fds to the + * set to pass over to the newly executed systemd. They won't + * be used there however, except that they are closed at the + * very end of deserialization, thus making it possible for + * clients to synchronously wait for systemd to reexec by + * simply waiting for disconnection */ + + if (m->api_bus) { + fd = sd_bus_get_fd(m->api_bus); + if (fd >= 0) { + fd = fdset_put_dup(fds, fd); + if (fd < 0) + return fd; + } + } + + SET_FOREACH(b, m->private_buses) { + fd = sd_bus_get_fd(b); + if (fd >= 0) { + fd = fdset_put_dup(fds, fd); + if (fd < 0) + return fd; + } + } + + /* We don't offer any APIs on the system bus (well, unless it + * is the same as the API bus) hence we don't bother with it + * here */ + + return 0; +} + +/* Invokes send_message() on each private bus for which sd_bus_is_ready() is positive, and on the API bus + * when sd_bus_track_count() is positive for m->subscribed or subscribed2. Returns 0, or the most recent + * negative value returned by send_message(). */ +int bus_foreach_bus( + Manager *m, + sd_bus_track *subscribed2, + int (*send_message)(sd_bus *bus, void *userdata), + void *userdata) { + + sd_bus *b; + int r, ret = 0; + + /* Send to all direct buses, unconditionally */ + SET_FOREACH(b, m->private_buses) { + + /* Don't bother with enqueuing these messages to clients that haven't started yet */ + if (sd_bus_is_ready(b) <= 0) + continue; + + r = send_message(b, userdata); + if (r < 0) + ret = r; + } + + /* Send to API bus, but only if somebody is subscribed */ + if (m->api_bus && + (sd_bus_track_count(m->subscribed) > 0 || + sd_bus_track_count(subscribed2) > 0)) { + r = send_message(m->api_bus, userdata); + if (r < 0) + ret = r; + } + + return ret; +} + +void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) { + const char *n; + + assert(f); + 
assert(prefix); + + for (n = sd_bus_track_first(t); n; n = sd_bus_track_next(t)) { + int c, j; + + c = sd_bus_track_count_name(t, n); + for (j = 0; j < c; j++) + (void) serialize_item(f, prefix, n); + } +} + +int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) { + int r; + + assert(m); + assert(t); + + if (strv_isempty(l)) + return 0; + + if (!m->api_bus) + return 0; + + if (!*t) { + r = sd_bus_track_new(m->api_bus, t, NULL, NULL); + if (r < 0) + return r; + } + + r = sd_bus_track_set_recursive(*t, recursive); + if (r < 0) + return r; + + return bus_track_add_name_many(*t, l); +} + +int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { + return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error); +} + +uint64_t manager_bus_n_queued_write(Manager *m) { + uint64_t c = 0; + sd_bus *b; + int r; + + /* Returns the total number of messages queued for writing on all our direct and API buses. 
*/ + + SET_FOREACH(b, m->private_buses) { + uint64_t k; + + r = sd_bus_get_n_queued_write(b, &k); + if (r < 0) + log_debug_errno(r, "Failed to query queued messages for private bus: %m"); + else + c += k; + } + + if (m->api_bus) { + uint64_t k; + + r = sd_bus_get_n_queued_write(m->api_bus, &k); + if (r < 0) + log_debug_errno(r, "Failed to query queued messages for API bus: %m"); + else + c += k; + } + + return c; +} + +static void vtable_dump_bus_properties(FILE *f, const sd_bus_vtable *table) { + const sd_bus_vtable *i; + + for (i = table; i->type != _SD_BUS_VTABLE_END; i++) { + if (!IN_SET(i->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY) || + (i->flags & (SD_BUS_VTABLE_DEPRECATED | SD_BUS_VTABLE_HIDDEN)) != 0) + continue; + + fprintf(f, "%s\n", i->x.property.member); + } +} + +void dump_bus_properties(FILE *f) { + assert(f); + + vtable_dump_bus_properties(f, bus_automount_vtable); + vtable_dump_bus_properties(f, bus_cgroup_vtable); + vtable_dump_bus_properties(f, bus_device_vtable); + vtable_dump_bus_properties(f, bus_exec_vtable); + vtable_dump_bus_properties(f, bus_job_vtable); + vtable_dump_bus_properties(f, bus_kill_vtable); + vtable_dump_bus_properties(f, bus_manager_vtable); + vtable_dump_bus_properties(f, bus_mount_vtable); + vtable_dump_bus_properties(f, bus_path_vtable); + vtable_dump_bus_properties(f, bus_scope_vtable); + vtable_dump_bus_properties(f, bus_service_vtable); + vtable_dump_bus_properties(f, bus_slice_vtable); + vtable_dump_bus_properties(f, bus_socket_vtable); + vtable_dump_bus_properties(f, bus_swap_vtable); + vtable_dump_bus_properties(f, bus_target_vtable); + vtable_dump_bus_properties(f, bus_timer_vtable); + vtable_dump_bus_properties(f, bus_unit_vtable); + vtable_dump_bus_properties(f, bus_unit_cgroup_vtable); +} diff --git a/src/core/dbus.h b/src/core/dbus.h new file mode 100644 index 0000000..369d9f5 --- /dev/null +++ b/src/core/dbus.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once 
+ +#include "sd-bus.h" + +#include "manager.h" + +int bus_send_pending_reload_message(Manager *m); + +int bus_init_private(Manager *m); +int bus_init_api(Manager *m); +int bus_init_system(Manager *m); + +void bus_done_private(Manager *m); +void bus_done_api(Manager *m); +void bus_done_system(Manager *m); +void bus_done(Manager *m); + +int bus_fdset_add_all(Manager *m, FDSet *fds); + +void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix); +int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l); + +int bus_foreach_bus(Manager *m, sd_bus_track *subscribed2, int (*send_message)(sd_bus *bus, void *userdata), void *userdata); + +int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); +int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error); + +int bus_forward_agent_released(Manager *m, const char *path); + +uint64_t manager_bus_n_queued_write(Manager *m); + +void dump_bus_properties(FILE *f); +int bus_manager_introspect_implementations(FILE *out, const char *pattern); diff --git a/src/core/device.c b/src/core/device.c new file mode 100644 index 0000000..9a1d882 --- /dev/null +++ b/src/core/device.c @@ -0,0 +1,1121 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <sys/epoll.h> + +#include "alloc-util.h" +#include "bus-error.h" +#include "dbus-device.h" +#include "dbus-unit.h" +#include "device-private.h" +#include "device-util.h" +#include "device.h" +#include "log.h" +#include "parse-util.h" +#include "path-util.h" +#include "serialize.h" +#include "stat-util.h" +#include "string-util.h" +#include "swap.h" +#include "udev-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState 
state_translation_table[_DEVICE_STATE_MAX] = { + [DEVICE_DEAD] = UNIT_INACTIVE, + [DEVICE_TENTATIVE] = UNIT_ACTIVATING, + [DEVICE_PLUGGED] = UNIT_ACTIVE, +}; + +static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata); +static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask); + +static void device_unset_sysfs(Device *d) { + Hashmap *devices; + Device *first; + + assert(d); + + if (!d->sysfs) + return; + + /* Remove this unit from the chain of devices which share the + * same sysfs path. */ + devices = UNIT(d)->manager->devices_by_sysfs; + first = hashmap_get(devices, d->sysfs); + LIST_REMOVE(same_sysfs, first, d); + + if (first) + hashmap_remove_and_replace(devices, d->sysfs, first->sysfs, first); + else + hashmap_remove(devices, d->sysfs); + + d->sysfs = mfree(d->sysfs); +} + +static int device_set_sysfs(Device *d, const char *sysfs) { + _cleanup_free_ char *copy = NULL; + Device *first; + int r; + + assert(d); + + if (streq_ptr(d->sysfs, sysfs)) + return 0; + + r = hashmap_ensure_allocated(&UNIT(d)->manager->devices_by_sysfs, &path_hash_ops); + if (r < 0) + return r; + + copy = strdup(sysfs); + if (!copy) + return -ENOMEM; + + device_unset_sysfs(d); + + first = hashmap_get(UNIT(d)->manager->devices_by_sysfs, sysfs); + LIST_PREPEND(same_sysfs, first, d); + + r = hashmap_replace(UNIT(d)->manager->devices_by_sysfs, copy, first); + if (r < 0) { + LIST_REMOVE(same_sysfs, first, d); + return r; + } + + d->sysfs = TAKE_PTR(copy); + unit_add_to_dbus_queue(UNIT(d)); + + return 0; +} + +static void device_init(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + assert(UNIT(d)->load_state == UNIT_STUB); + + /* In contrast to all other unit types we timeout jobs waiting + * for devices by default. This is because they otherwise wait + * indefinitely for plugged in devices, something which cannot + * happen for the other units since their operations time out + * anyway. 
*/ + u->job_running_timeout = u->manager->default_timeout_start_usec; + + u->ignore_on_isolate = true; + + d->deserialized_state = _DEVICE_STATE_INVALID; +} + +static void device_done(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + + device_unset_sysfs(d); + d->wants_property = strv_free(d->wants_property); +} + +static int device_load(Unit *u) { + int r; + + r = unit_load_fragment_and_dropin(u, false); + if (r < 0) + return r; + + if (!u->description) { + /* Generate a description based on the path, to be used until the + device is initialized properly */ + r = unit_name_to_path(u->id, &u->description); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to unescape name: %m"); + } + + return 0; +} + +static void device_set_state(Device *d, DeviceState state) { + DeviceState old_state; + assert(d); + + if (d->state != state) + bus_unit_send_pending_change_signal(UNIT(d), false); + + old_state = d->state; + d->state = state; + + if (state == DEVICE_DEAD) + device_unset_sysfs(d); + + if (state != old_state) + log_unit_debug(UNIT(d), "Changed %s -> %s", device_state_to_string(old_state), device_state_to_string(state)); + + unit_notify(UNIT(d), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int device_coldplug(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + assert(d->state == DEVICE_DEAD); + + /* First, let's put the deserialized state and found mask into effect, if we have it. 
*/ + + if (d->deserialized_state < 0 || + (d->deserialized_state == d->state && + d->deserialized_found == d->found)) + return 0; + + d->found = d->deserialized_found; + device_set_state(d, d->deserialized_state); + return 0; +} + +static void device_catchup(Unit *u) { + Device *d = DEVICE(u); + + assert(d); + + /* Second, let's update the state with the enumerated state if it's different */ + if (d->enumerated_found == d->found) + return; + + device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK); +} + +static const struct { + DeviceFound flag; + const char *name; +} device_found_map[] = { + { DEVICE_FOUND_UDEV, "found-udev" }, + { DEVICE_FOUND_MOUNT, "found-mount" }, + { DEVICE_FOUND_SWAP, "found-swap" }, +}; + +static int device_found_to_string_many(DeviceFound flags, char **ret) { + _cleanup_free_ char *s = NULL; + unsigned i; + + assert(ret); + + for (i = 0; i < ELEMENTSOF(device_found_map); i++) { + if (!FLAGS_SET(flags, device_found_map[i].flag)) + continue; + + if (!strextend_with_separator(&s, ",", device_found_map[i].name, NULL)) + return -ENOMEM; + } + + *ret = TAKE_PTR(s); + + return 0; +} + +static int device_found_from_string_many(const char *name, DeviceFound *ret) { + DeviceFound flags = 0; + int r; + + assert(ret); + + for (;;) { + _cleanup_free_ char *word = NULL; + DeviceFound f = 0; + unsigned i; + + r = extract_first_word(&name, &word, ",", 0); + if (r < 0) + return r; + if (r == 0) + break; + + for (i = 0; i < ELEMENTSOF(device_found_map); i++) + if (streq(word, device_found_map[i].name)) { + f = device_found_map[i].flag; + break; + } + + if (f == 0) + return -EINVAL; + + flags |= f; + } + + *ret = flags; + return 0; +} + +static int device_serialize(Unit *u, FILE *f, FDSet *fds) { + _cleanup_free_ char *s = NULL; + Device *d = DEVICE(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", device_state_to_string(d->state)); + + if (device_found_to_string_many(d->found, &s) >= 0) + (void) 
serialize_item(f, "found", s); + + return 0; +} + +static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Device *d = DEVICE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + DeviceState state; + + state = device_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value, ignoring: %s", value); + else + d->deserialized_state = state; + + } else if (streq(key, "found")) { + r = device_found_from_string_many(value, &d->deserialized_found); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse found value '%s', ignoring: %m", value); + + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void device_dump(Unit *u, FILE *f, const char *prefix) { + Device *d = DEVICE(u); + _cleanup_free_ char *s = NULL; + + assert(d); + + (void) device_found_to_string_many(d->found, &s); + + fprintf(f, + "%sDevice State: %s\n" + "%sSysfs Path: %s\n" + "%sFound: %s\n", + prefix, device_state_to_string(d->state), + prefix, strna(d->sysfs), + prefix, strna(s)); + + if (!strv_isempty(d->wants_property)) { + char **i; + + STRV_FOREACH(i, d->wants_property) + fprintf(f, "%sudev SYSTEMD_WANTS: %s\n", + prefix, *i); + } +} + +_pure_ static UnitActiveState device_active_state(Unit *u) { + assert(u); + + return state_translation_table[DEVICE(u)->state]; +} + +_pure_ static const char *device_sub_state_to_string(Unit *u) { + assert(u); + + return device_state_to_string(DEVICE(u)->state); +} + +static int device_update_description(Unit *u, sd_device *dev, const char *path) { + _cleanup_free_ char *j = NULL; + const char *model, *label, *desc; + int r; + + assert(u); + assert(path); + + desc = path; + + if (dev && + (sd_device_get_property_value(dev, "ID_MODEL_FROM_DATABASE", &model) >= 0 || + sd_device_get_property_value(dev, "ID_MODEL", &model) >= 0)) { + desc = model; + + /* Try to concatenate the device model string with 
a label, if there is one */ + if (sd_device_get_property_value(dev, "ID_FS_LABEL", &label) >= 0 || + sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &label) >= 0 || + sd_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER", &label) >= 0) { + + desc = j = strjoin(model, " ", label); + if (!j) + return log_oom(); + } + } + + r = unit_set_description(u, desc); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set device description: %m"); + + return 0; +} + +static int device_add_udev_wants(Unit *u, sd_device *dev) { + _cleanup_strv_free_ char **added = NULL; + const char *wants, *property; + Device *d = DEVICE(u); + int r; + + assert(d); + assert(dev); + + property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS"; + + r = sd_device_get_property_value(dev, property, &wants); + if (r < 0) + return 0; + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&wants, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to parse property %s with value %s: %m", property, wants); + + if (unit_name_is_valid(word, UNIT_NAME_TEMPLATE) && d->sysfs) { + _cleanup_free_ char *escaped = NULL; + + /* If the unit name is specified as template, then automatically fill in the sysfs path of the + * device as instance name, properly escaped. */ + + r = unit_name_path_escape(d->sysfs, &escaped); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to escape %s: %m", d->sysfs); + + r = unit_name_replace_instance(word, escaped, &k); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to build %s instance of template %s: %m", escaped, word); + } else { + /* If this is not a template, then let's mangle it so, that it becomes a valid unit name. 
*/ + + r = unit_name_mangle(word, UNIT_NAME_MANGLE_WARN, &k); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to mangle unit name \"%s\": %m", word); + } + + r = unit_add_dependency_by_name(u, UNIT_WANTS, k, true, UNIT_DEPENDENCY_UDEV); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to add Wants= dependency: %m"); + + r = strv_push(&added, k); + if (r < 0) + return log_oom(); + + k = NULL; + } + + if (d->state != DEVICE_DEAD) { + char **i; + + /* So here's a special hack, to compensate for the fact that the udev database's reload cycles are not + * synchronized with our own reload cycles: when we detect that the SYSTEMD_WANTS property of a device + * changes while the device unit is already up, let's manually trigger any new units listed in it not + * seen before. This typically happens during the boot-time switch root transition, as udev devices + * will generally already be up in the initrd, but SYSTEMD_WANTS properties get then added through udev + * rules only available on the host system, and thus only when the initial udev coldplug trigger runs. + * + * We do this only if the device has been up already when we parse this, as otherwise the usual + * dependency logic that is run from the dead → plugged transition will trigger these deps. */ + + STRV_FOREACH(i, added) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (strv_contains(d->wants_property, *i)) /* Was this unit already listed before? 
*/ + continue; + + r = manager_add_job_by_name(u->manager, JOB_START, *i, JOB_FAIL, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue SYSTEMD_WANTS= job, ignoring: %s", bus_error_message(&error, r)); + } + } + + return strv_free_and_replace(d->wants_property, added); +} + +static bool device_is_bound_by_mounts(Device *d, sd_device *dev) { + const char *bound_by; + int r; + + assert(d); + assert(dev); + + if (sd_device_get_property_value(dev, "SYSTEMD_MOUNT_DEVICE_BOUND", &bound_by) >= 0) { + r = parse_boolean(bound_by); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_MOUNT_DEVICE_BOUND='%s' udev property, ignoring: %m", bound_by); + + d->bind_mounts = r > 0; + } else + d->bind_mounts = false; + + return d->bind_mounts; +} + +static void device_upgrade_mount_deps(Unit *u) { + Unit *other; + void *v; + int r; + + /* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */ + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REQUIRED_BY]) { + if (other->type != UNIT_MOUNT) + continue; + + r = unit_add_dependency(other, UNIT_BINDS_TO, u, true, UNIT_DEPENDENCY_UDEV); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to add BindsTo= dependency between device and mount unit, ignoring: %m"); + } +} + +static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool main) { + _cleanup_free_ char *e = NULL; + const char *sysfs = NULL; + Unit *u = NULL; + bool delete; + int r; + + assert(m); + assert(path); + + if (dev) { + r = sd_device_get_syspath(dev, &sysfs); + if (r < 0) { + log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m"); + return 0; + } + } + + r = unit_name_from_path(path, ".device", &e); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to generate unit name from device path: %m"); + + u = manager_get_unit(m, e); + if (u) { + /* The device unit can still be present even if the device was unplugged: a mount unit 
can reference it + * hence preventing the GC to have garbaged it. That's desired since the device unit may have a + * dependency on the mount unit which was added during the loading of the later. When the device is + * plugged the sysfs might not be initialized yet, as we serialize the device's state but do not + * serialize the sysfs path across reloads/reexecs. Hence, when coming back from a reload/restart we + * might have the state valid, but not the sysfs path. Hence, let's filter out conflicting devices, but + * let's accept devices in any state with no sysfs path set. */ + + if (DEVICE(u)->state == DEVICE_PLUGGED && + DEVICE(u)->sysfs && + sysfs && + !path_equal(DEVICE(u)->sysfs, sysfs)) { + log_unit_debug(u, "Device %s appeared twice with different sysfs paths %s and %s, ignoring the latter.", + e, DEVICE(u)->sysfs, sysfs); + return -EEXIST; + } + + delete = false; + + /* Let's remove all dependencies generated due to udev properties. We'll re-add whatever is configured + * now below. */ + unit_remove_dependencies(u, UNIT_DEPENDENCY_UDEV); + } else { + delete = true; + + r = unit_new_for_name(m, sizeof(Device), e, &u); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to allocate device unit %s: %m", e); + goto fail; + } + + unit_add_to_load_queue(u); + } + + /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be + * initialized. Hence initialize it if necessary. */ + if (sysfs) { + r = device_set_sysfs(DEVICE(u), sysfs); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs); + goto fail; + } + + /* The additional systemd udev properties we only interpret for the main object */ + if (main) + (void) device_add_udev_wants(u, dev); + } + + (void) device_update_description(u, dev, path); + + /* So the user wants the mount units to be bound to the device but a mount unit might have been seen by systemd + * before the device appears on its radar. 
In this case the device unit is partially initialized and includes + * the deps on the mount unit but at that time the "bind mounts" flag wasn't present. Fix this up now. */ + if (dev && device_is_bound_by_mounts(DEVICE(u), dev)) + device_upgrade_mount_deps(u); + + return 0; + +fail: + if (delete) + unit_free(u); + + return r; +} + +static int device_process_new(Manager *m, sd_device *dev) { + const char *sysfs, *dn, *alias; + dev_t devnum; + int r; + + assert(m); + + if (sd_device_get_syspath(dev, &sysfs) < 0) + return 0; + + /* Add the main unit named after the sysfs path */ + r = device_setup_unit(m, dev, sysfs, true); + if (r < 0) + return r; + + /* Add an additional unit for the device node */ + if (sd_device_get_devname(dev, &dn) >= 0) + (void) device_setup_unit(m, dev, dn, false); + + /* Add additional units for all symlinks */ + if (sd_device_get_devnum(dev, &devnum) >= 0) { + const char *p; + + FOREACH_DEVICE_DEVLINK(dev, p) { + struct stat st; + + if (PATH_STARTSWITH_SET(p, "/dev/block/", "/dev/char/")) + continue; + + /* Verify that the symlink in the FS actually belongs + * to this device. This is useful to deal with + * conflicting devices, e.g. when two disks want the + * same /dev/disk/by-label/xxx link because they have + * the same label. 
We want to make sure that the same + * device that won the symlink wins in systemd, so we + * check the device node major/minor */ + if (stat(p, &st) >= 0 && + ((!S_ISBLK(st.st_mode) && !S_ISCHR(st.st_mode)) || + st.st_rdev != devnum)) + continue; + + (void) device_setup_unit(m, dev, p, false); + } + } + + /* Add additional units for all explicitly configured aliases */ + if (sd_device_get_property_value(dev, "SYSTEMD_ALIAS", &alias) < 0) + return 0; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&alias, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_ALIAS property: %m"); + + if (!path_is_absolute(word)) + log_device_warning(dev, "SYSTEMD_ALIAS is not an absolute path, ignoring: %s", word); + else if (!path_is_normalized(word)) + log_device_warning(dev, "SYSTEMD_ALIAS is not a normalized path, ignoring: %s", word); + else + (void) device_setup_unit(m, dev, word, false); + } + + return 0; +} + +static void device_found_changed(Device *d, DeviceFound previous, DeviceFound now) { + assert(d); + + /* Didn't exist before, but does now? if so, generate a new invocation ID for it */ + if (previous == DEVICE_NOT_FOUND && now != DEVICE_NOT_FOUND) + (void) unit_acquire_invocation_id(UNIT(d)); + + if (FLAGS_SET(now, DEVICE_FOUND_UDEV)) + /* When the device is known to udev we consider it plugged. */ + device_set_state(d, DEVICE_PLUGGED); + else if (now != DEVICE_NOT_FOUND && !FLAGS_SET(previous, DEVICE_FOUND_UDEV)) + /* If the device has not been seen by udev yet, but is now referenced by the kernel, then we assume the + * kernel knows it now, and udev might soon too. */ + device_set_state(d, DEVICE_TENTATIVE); + else + /* If nobody sees the device, or if the device was previously seen by udev and now is only referenced + * from the kernel, then we consider the device is gone, the kernel just hasn't noticed it yet. 
*/ + device_set_state(d, DEVICE_DEAD); +} + +static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask) { + Manager *m; + + assert(d); + + m = UNIT(d)->manager; + + if (MANAGER_IS_RUNNING(m) && (m->honor_device_enumeration || MANAGER_IS_USER(m))) { + DeviceFound n, previous; + + /* When we are already running, then apply the new mask right-away, and trigger state changes + * right-away */ + + n = (d->found & ~mask) | (found & mask); + if (n == d->found) + return; + + previous = d->found; + d->found = n; + + device_found_changed(d, previous, n); + } else + /* We aren't running yet, let's apply the new mask to the shadow variable instead, which we'll apply as + * soon as we catch-up with the state. */ + d->enumerated_found = (d->enumerated_found & ~mask) | (found & mask); +} + +static void device_update_found_by_sysfs(Manager *m, const char *sysfs, DeviceFound found, DeviceFound mask) { + Device *d, *l, *n; + + assert(m); + assert(sysfs); + + if (mask == 0) + return; + + l = hashmap_get(m->devices_by_sysfs, sysfs); + LIST_FOREACH_SAFE(same_sysfs, d, n, l) + device_update_found_one(d, found, mask); +} + +static int device_update_found_by_name(Manager *m, const char *path, DeviceFound found, DeviceFound mask) { + _cleanup_free_ char *e = NULL; + Unit *u; + int r; + + assert(m); + assert(path); + + if (mask == 0) + return 0; + + r = unit_name_from_path(path, ".device", &e); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name from device path: %m"); + + u = manager_get_unit(m, e); + if (!u) + return 0; + + device_update_found_one(DEVICE(u), found, mask); + return 0; +} + +static bool device_is_ready(sd_device *dev) { + const char *ready; + + assert(dev); + + if (device_is_renaming(dev) > 0) + return false; + + /* Is it really tagged as 'systemd' right now? 
*/ + if (sd_device_has_current_tag(dev, "systemd") <= 0) + return false; + + if (sd_device_get_property_value(dev, "SYSTEMD_READY", &ready) < 0) + return true; + + return parse_boolean(ready) != 0; +} + +static Unit *device_following(Unit *u) { + Device *d = DEVICE(u); + Device *other, *first = NULL; + + assert(d); + + if (startswith(u->id, "sys-")) + return NULL; + + /* Make everybody follow the unit that's named after the sysfs path */ + LIST_FOREACH_AFTER(same_sysfs, other, d) + if (startswith(UNIT(other)->id, "sys-")) + return UNIT(other); + + LIST_FOREACH_BEFORE(same_sysfs, other, d) { + if (startswith(UNIT(other)->id, "sys-")) + return UNIT(other); + + first = other; + } + + return UNIT(first); +} + +static int device_following_set(Unit *u, Set **_set) { + Device *d = DEVICE(u), *other; + _cleanup_set_free_ Set *set = NULL; + int r; + + assert(d); + assert(_set); + + if (LIST_JUST_US(same_sysfs, d)) { + *_set = NULL; + return 0; + } + + set = set_new(NULL); + if (!set) + return -ENOMEM; + + LIST_FOREACH_AFTER(same_sysfs, other, d) { + r = set_put(set, other); + if (r < 0) + return r; + } + + LIST_FOREACH_BEFORE(same_sysfs, other, d) { + r = set_put(set, other); + if (r < 0) + return r; + } + + *_set = TAKE_PTR(set); + return 1; +} + +static void device_shutdown(Manager *m) { + assert(m); + + m->device_monitor = sd_device_monitor_unref(m->device_monitor); + m->devices_by_sysfs = hashmap_free(m->devices_by_sysfs); +} + +static void device_enumerate(Manager *m) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *dev; + int r; + + assert(m); + + if (!m->device_monitor) { + r = sd_device_monitor_new(&m->device_monitor); + if (r < 0) { + log_error_errno(r, "Failed to allocate device monitor: %m"); + goto fail; + } + + /* This will fail if we are unprivileged, but that + * should not matter much, as user instances won't run + * during boot. 
*/ + (void) sd_device_monitor_set_receive_buffer_size(m->device_monitor, 128*1024*1024); + + r = sd_device_monitor_filter_add_match_tag(m->device_monitor, "systemd"); + if (r < 0) { + log_error_errno(r, "Failed to add udev tag match: %m"); + goto fail; + } + + r = sd_device_monitor_attach_event(m->device_monitor, m->event); + if (r < 0) { + log_error_errno(r, "Failed to attach event to device monitor: %m"); + goto fail; + } + + r = sd_device_monitor_start(m->device_monitor, device_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to start device monitor: %m"); + goto fail; + } + } + + r = sd_device_enumerator_new(&e); + if (r < 0) { + log_error_errno(r, "Failed to allocate device enumerator: %m"); + goto fail; + } + + r = sd_device_enumerator_add_match_tag(e, "systemd"); + if (r < 0) { + log_error_errno(r, "Failed to set tag for device enumeration: %m"); + goto fail; + } + + FOREACH_DEVICE(e, dev) { + const char *sysfs; + + if (!device_is_ready(dev)) + continue; + + (void) device_process_new(m, dev); + + if (sd_device_get_syspath(dev, &sysfs) < 0) + continue; + + device_update_found_by_sysfs(m, sysfs, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV); + } + + return; + +fail: + device_shutdown(m); +} + +static void device_propagate_reload_by_sysfs(Manager *m, const char *sysfs) { + Device *d, *l, *n; + int r; + + assert(m); + assert(sysfs); + + l = hashmap_get(m->devices_by_sysfs, sysfs); + LIST_FOREACH_SAFE(same_sysfs, d, n, l) { + if (d->state == DEVICE_DEAD) + continue; + + r = manager_propagate_reload(m, UNIT(d), JOB_REPLACE, NULL); + if (r < 0) + log_warning_errno(r, "Failed to propagate reload, ignoring: %m"); + } +} + +static int device_remove_old(Manager *m, sd_device *dev) { + _cleanup_free_ char *syspath_old = NULL, *e = NULL; + const char *devpath_old; + int r; + + r = sd_device_get_property_value(dev, "DEVPATH_OLD", &devpath_old); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to get DEVPATH_OLD= property on 'move' uevent, ignoring: %m"); + 
return 0; + } + + syspath_old = path_join("/sys", devpath_old); + if (!syspath_old) + return log_oom(); + + r = unit_name_from_path(syspath_old, ".device", &e); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to generate unit name from old device path: %m"); + + device_update_found_by_sysfs(m, syspath_old, 0, DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP); + return 0; +} + +static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata) { + Manager *m = userdata; + DeviceAction action; + const char *sysfs; + int r; + + assert(m); + assert(dev); + + r = sd_device_get_syspath(dev, &sysfs); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to get device sys path: %m"); + return 0; + } + + r = device_get_action(dev, &action); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to get udev action: %m"); + return 0; + } + + if (!IN_SET(action, DEVICE_ACTION_ADD, DEVICE_ACTION_REMOVE, DEVICE_ACTION_MOVE)) + device_propagate_reload_by_sysfs(m, sysfs); + + if (action == DEVICE_ACTION_MOVE) + (void) device_remove_old(m, dev); + + /* A change event can signal that a device is becoming ready, in particular if the device is using + * the SYSTEMD_READY logic in udev so we need to reach the else block of the following if, even for + * change events */ + if (action == DEVICE_ACTION_REMOVE) { + r = swap_process_device_remove(m, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process swap device remove event, ignoring: %m"); + + /* If we get notified that a device was removed by udev, then it's completely gone, hence + * unset all found bits */ + device_update_found_by_sysfs(m, sysfs, 0, DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP); + + } else if (device_is_ready(dev)) { + + (void) device_process_new(m, dev); + + r = swap_process_device_new(m, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process swap device new event, ignoring: %m"); + + manager_dispatch_load_queue(m); + + /* 
The device is found now, set the udev found bit */ + device_update_found_by_sysfs(m, sysfs, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV); + + } else + /* The device is nominally around, but not ready for us. Hence unset the udev bit, but leave + * the rest around. */ + device_update_found_by_sysfs(m, sysfs, 0, DEVICE_FOUND_UDEV); + + return 0; +} + +static bool device_supported(void) { + static int read_only = -1; + + /* If /sys is read-only we don't support device units, and any + * attempts to start one should fail immediately. */ + + if (read_only < 0) + read_only = path_is_read_only_fs("/sys"); + + return read_only <= 0; +} + +static int validate_node(Manager *m, const char *node, sd_device **ret) { + struct stat st; + int r; + + assert(m); + assert(node); + assert(ret); + + /* Validates a device node that showed up in /proc/swaps or /proc/self/mountinfo if it makes sense for us to + * track. Note that this validator is fine with missing device nodes, but not with badly set up ones! */ + + if (!path_startswith(node, "/dev")) { + *ret = NULL; + return 0; /* bad! */ + } + + if (stat(node, &st) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to stat() device node file %s: %m", node); + + *ret = NULL; + return 1; /* good! (though missing) */ + + } else { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + r = device_new_from_stat_rdev(&dev, &st); + if (r == -ENOENT) { + *ret = NULL; + return 1; /* good! (though missing) */ + } else if (r == -ENOTTY) { + *ret = NULL; + return 0; /* bad! (not a device node but some other kind of file system node) */ + } else if (r < 0) + return log_error_errno(r, "Failed to get udev device from devnum %u:%u: %m", major(st.st_rdev), minor(st.st_rdev)); + + *ret = TAKE_PTR(dev); + return 1; /* good! 
*/ + } +} + +void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask) { + int r; + + assert(m); + assert(node); + + if (!device_supported()) + return; + + if (mask == 0) + return; + + /* This is called whenever we find a device referenced in /proc/swaps or /proc/self/mounts. Such a device might + * be mounted/enabled at a time where udev has not finished probing it yet, and we thus haven't learned about + * it yet. In this case we will set the device unit to "tentative" state. + * + * This takes a pair of DeviceFound flags parameters. The 'mask' parameter is a bit mask that indicates which + * bits of 'found' to copy into the per-device DeviceFound flags field. Thus, this function may be used to set + * and unset individual bits in a single call, while merging partially with previous state. */ + + if ((found & mask) != 0) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + /* If the device is known in the kernel and newly appeared, then we'll create a device unit for it, + * under the name referenced in /proc/swaps or /proc/self/mountinfo. But first, let's validate if + * everything is alright with the device node. */ + + r = validate_node(m, node, &dev); + if (r <= 0) + return; /* Don't create a device unit for this if the device node is borked. 
*/ + + (void) device_setup_unit(m, dev, node, false); + } + + /* Update the device unit's state, should it exist */ + (void) device_update_found_by_name(m, node, found, mask); +} + +bool device_shall_be_bound_by(Unit *device, Unit *u) { + assert(device); + assert(u); + + if (u->type != UNIT_MOUNT) + return false; + + return DEVICE(device)->bind_mounts; +} + +const UnitVTable device_vtable = { + .object_size = sizeof(Device), + .sections = + "Unit\0" + "Device\0" + "Install\0", + + .gc_jobs = true, + + .init = device_init, + .done = device_done, + .load = device_load, + + .coldplug = device_coldplug, + .catchup = device_catchup, + + .serialize = device_serialize, + .deserialize_item = device_deserialize_item, + + .dump = device_dump, + + .active_state = device_active_state, + .sub_state_to_string = device_sub_state_to_string, + + .following = device_following, + .following_set = device_following_set, + + .enumerate = device_enumerate, + .shutdown = device_shutdown, + .supported = device_supported, + + .status_message_formats = { + .starting_stopping = { + [0] = "Expecting device %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Found device %s.", + [JOB_TIMEOUT] = "Timed out waiting for device %s.", + }, + }, +}; diff --git a/src/core/device.h b/src/core/device.h new file mode 100644 index 0000000..dfe8a13 --- /dev/null +++ b/src/core/device.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +typedef struct Device Device; + +/* A mask specifying where we have seen the device currently. This is a bitmask because the device might show up + * asynchronously from each other at various places. For example, in very common case a device might already be mounted + * before udev finished probing it (think: a script setting up a loopback block device, formatting it and mounting it + * in quick succession). Hence we need to track precisely where it is already visible and where not. 
*/ +typedef enum DeviceFound { + DEVICE_NOT_FOUND = 0, + DEVICE_FOUND_UDEV = 1 << 0, /* The device has shown up in the udev database */ + DEVICE_FOUND_MOUNT = 1 << 1, /* The device has shown up in /proc/self/mountinfo */ + DEVICE_FOUND_SWAP = 1 << 2, /* The device has shown up in /proc/swaps */ + DEVICE_FOUND_MASK = DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP, +} DeviceFound; + +struct Device { + Unit meta; + + char *sysfs; + + /* In order to be able to distinguish dependencies on different device nodes we might end up creating multiple + * devices for the same sysfs path. We chain them up here. */ + LIST_FIELDS(struct Device, same_sysfs); + + DeviceState state, deserialized_state; + DeviceFound found, deserialized_found, enumerated_found; + + bool bind_mounts; + + /* The SYSTEMD_WANTS udev property for this device the last time we saw it */ + char **wants_property; +}; + +extern const UnitVTable device_vtable; + +void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask); +bool device_shall_be_bound_by(Unit *device, Unit *u); + +DEFINE_CAST(DEVICE, Device); diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c new file mode 100644 index 0000000..7da87fd --- /dev/null +++ b/src/core/dynamic-user.c @@ -0,0 +1,825 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "clean-ipc.h" +#include "dynamic-user.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "io-util.h" +#include "nscd-flush.h" +#include "parse-util.h" +#include "random-util.h" +#include "serialize.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-record.h" +#include "user-util.h" + +/* Takes a value generated randomly or by hashing and turns it into a UID in the right range */ +#define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - 
DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN) + +DEFINE_PRIVATE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user); + +static DynamicUser* dynamic_user_free(DynamicUser *d) { + if (!d) + return NULL; + + if (d->manager) + (void) hashmap_remove(d->manager->dynamic_users, d->name); + + safe_close_pair(d->storage_socket); + return mfree(d); +} + +static int dynamic_user_add(Manager *m, const char *name, int storage_socket[static 2], DynamicUser **ret) { + DynamicUser *d; + int r; + + assert(m); + assert(name); + assert(storage_socket); + + r = hashmap_ensure_allocated(&m->dynamic_users, &string_hash_ops); + if (r < 0) + return r; + + d = malloc0(offsetof(DynamicUser, name) + strlen(name) + 1); + if (!d) + return -ENOMEM; + + strcpy(d->name, name); + + d->storage_socket[0] = storage_socket[0]; + d->storage_socket[1] = storage_socket[1]; + + r = hashmap_put(m->dynamic_users, d->name, d); + if (r < 0) { + free(d); + return r; + } + + d->manager = m; + + if (ret) + *ret = d; + + return 0; +} + +static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) { + _cleanup_close_pair_ int storage_socket[2] = { -1, -1 }; + DynamicUser *d; + int r; + + assert(m); + assert(name); + + /* Return the DynamicUser structure for a specific user name. Note that this won't actually allocate a UID for + * it, but just prepare the data structure for it. The UID is allocated only on demand, when it's really + * needed, and in the child process we fork off, since allocation involves NSS checks which are not OK to do + * from PID 1. To allow the children and PID 1 share information about allocated UIDs we use an anonymous + * AF_UNIX/SOCK_DGRAM socket (called the "storage socket") that contains at most one datagram with the + * allocated UID number, plus an fd referencing the lock file for the UID + * (i.e. /run/systemd/dynamic-uid/$UID). Why involve the socket pair? 
So that PID 1 and all its children can + * share the same storage for the UID and lock fd, simply by inheriting the storage socket fds. The socket pair + * may exist in three different states: + * + * a) no datagram stored. This is the initial state. In this case the dynamic user was never realized. + * + * b) a datagram containing a UID stored, but no lock fd attached to it. In this case there was already a + * statically assigned UID by the same name, which we are reusing. + * + * c) a datagram containing a UID stored, and a lock fd is attached to it. In this case we allocated a dynamic + * UID and locked it in the file system, using the lock fd. + * + * As PID 1 and various children might access the socket pair simultaneously, and pop the datagram or push it + * back in any time, we also maintain a lock on the socket pair. Note one peculiarity regarding locking here: + * the UID lock on disk is protected via a BSD file lock (i.e. an fd-bound lock), so that the lock is kept in + * place as long as there's a reference to the fd open. The lock on the storage socket pair however is a POSIX + * file lock (i.e. a process-bound lock), as all users share the same fd of this (after all it is anonymous, + * nobody else could get any access to it except via our own fd) and we want to synchronize access between all + * processes that have access to it. 
*/ + + d = hashmap_get(m->dynamic_users, name); + if (d) { + if (ret) { + /* We already have a structure for the dynamic user, let's increase the ref count and reuse it */ + d->n_ref++; + *ret = d; + } + return 0; + } + + if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC)) + return -EINVAL; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, storage_socket) < 0) + return -errno; + + r = dynamic_user_add(m, name, storage_socket, &d); + if (r < 0) + return r; + + storage_socket[0] = storage_socket[1] = -1; + + if (ret) { + d->n_ref++; + *ret = d; + } + + return 1; +} + +static int make_uid_symlinks(uid_t uid, const char *name, bool b) { + + char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1]; + const char *path2; + int r = 0, k; + + /* Add direct additional symlinks for direct lookups of dynamic UIDs and their names by userspace code. The + * only reason we have this is because dbus-daemon cannot use D-Bus for resolving users and groups (since it + * would be its own client then). We hence keep these world-readable symlinks in place, so that the + * unprivileged dbus user can read the mappings when it needs them via these symlinks instead of having to go + * via the bus. Ideally, we'd use the lock files we keep for this anyway, but we can't since we use BSD locks + * on them and as those may be taken by any user with read access we can't make them world-readable. 
*/ + + xsprintf(path1, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid); + if (unlink(path1) < 0 && errno != ENOENT) + r = -errno; + + if (b && symlink(name, path1) < 0) { + k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path1); + if (r == 0) + r = k; + } + + path2 = strjoina("/run/systemd/dynamic-uid/direct:", name); + if (unlink(path2) < 0 && errno != ENOENT) { + k = -errno; + if (r == 0) + r = k; + } + + if (b && symlink(path1 + STRLEN("/run/systemd/dynamic-uid/direct:"), path2) < 0) { + k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path2); + if (r == 0) + r = k; + } + + return r; +} + +static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) { + + /* Find a suitable free UID. We use the following strategy to find a suitable UID: + * + * 1. Initially, we try to read the UID of a number of specified paths. If any of these UIDs works, we use + * them. We do this in order to increase the chance of UID reuse, if StateDirectory=, CacheDirectory= or + * LogsDirectory= are used, as reusing the UID these directories are owned by saves us from having to + * recursively chown() them to new users. + * + * 2. If that didn't yield a currently unused UID, we hash the user name, and try to use that. This should be + * pretty good, as the user is by default derived from the unit name, and hence the same service and same + * user should usually get the same UID as long as our hashing doesn't clash. + * + * 3. Finally, if that didn't work, we randomly pick UIDs, until we find one that is empty. + * + * Since the dynamic UID space is relatively small we'll stop trying after 100 iterations, giving up. 
*/ + + enum { + PHASE_SUGGESTED, /* the first phase, reusing directory ownership UIDs */ + PHASE_HASHED, /* the second phase, deriving a UID from the username by hashing */ + PHASE_RANDOM, /* the last phase, randomly picking UIDs */ + } phase = PHASE_SUGGESTED; + + static const uint8_t hash_key[] = { + 0x37, 0x53, 0x7e, 0x31, 0xcf, 0xce, 0x48, 0xf5, + 0x8a, 0xbb, 0x39, 0x57, 0x8d, 0xd9, 0xec, 0x59 + }; + + unsigned n_tries = 100, current_suggested = 0; + int r; + + (void) mkdir("/run/systemd/dynamic-uid", 0755); + + for (;;) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_close_ int lock_fd = -1; + uid_t candidate; + ssize_t l; + + if (--n_tries <= 0) /* Give up retrying eventually */ + return -EBUSY; + + switch (phase) { + + case PHASE_SUGGESTED: { + struct stat st; + + if (!suggested_paths || !suggested_paths[current_suggested]) { + /* We reached the end of the suggested paths list, let's try by hashing the name */ + phase = PHASE_HASHED; + continue; + } + + if (stat(suggested_paths[current_suggested++], &st) < 0) + continue; /* We can't read the UID of this path, but that doesn't matter, just try the next */ + + candidate = st.st_uid; + break; + } + + case PHASE_HASHED: + /* A static user by this name does not exist yet. Let's find a free ID then, and use that. We + * start with a UID generated as hash from the user name. */ + candidate = UID_CLAMP_INTO_RANGE(siphash24(name, strlen(name), hash_key)); + + /* If this one fails, we should proceed with random tries */ + phase = PHASE_RANDOM; + break; + + case PHASE_RANDOM: + + /* Pick another random UID, and see if that works for us. 
*/ + random_bytes(&candidate, sizeof(candidate)); + candidate = UID_CLAMP_INTO_RANGE(candidate); + break; + + default: + assert_not_reached("unknown phase"); + } + + /* Make sure whatever we picked here actually is in the right range */ + if (!uid_is_dynamic(candidate)) + continue; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, candidate); + + for (;;) { + struct stat st; + + lock_fd = open(lock_path, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600); + if (lock_fd < 0) + return -errno; + + r = flock(lock_fd, LOCK_EX|LOCK_NB); /* Try to get a BSD file lock on the UID lock file */ + if (r < 0) { + if (IN_SET(errno, EBUSY, EAGAIN)) + goto next; /* already in use */ + + return -errno; + } + + if (fstat(lock_fd, &st) < 0) + return -errno; + if (st.st_nlink > 0) + break; + + /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and + * got the lock. Close it, and try again. */ + lock_fd = safe_close(lock_fd); + } + + /* Some superficial check whether this UID/GID might already be taken by some static user */ + if (getpwuid(candidate) || + getgrgid((gid_t) candidate) || + search_ipc(candidate, (gid_t) candidate) != 0) { + (void) unlink(lock_path); + continue; + } + + /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */ + l = pwritev(lock_fd, + (struct iovec[2]) { + IOVEC_INIT_STRING(name), + IOVEC_INIT((char[1]) { '\n' }, 1), + }, 2, 0); + if (l < 0) { + r = -errno; + (void) unlink(lock_path); + return r; + } + + (void) ftruncate(lock_fd, l); + (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */ + + *ret_uid = candidate; + return TAKE_FD(lock_fd); + + next: + ; + } +} + +static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) { + uid_t uid = UID_INVALID; + struct iovec iov = IOVEC_INIT(&uid, sizeof(uid)); + int lock_fd; + ssize_t k; + + assert(d); + assert(ret_uid); + assert(ret_lock_fd); + + /* Read the UID 
and lock fd that is stored in the storage AF_UNIX socket. This should be called with the lock + * on the socket taken. */ + + k = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &lock_fd); + if (k < 0) + return (int) k; + + *ret_uid = uid; + *ret_lock_fd = lock_fd; + + return 0; +} + +static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) { + struct iovec iov = IOVEC_INIT(&uid, sizeof(uid)); + + assert(d); + + /* Store the UID and lock_fd in the storage socket. This should be called with the socket pair lock taken. */ + return send_one_fd_iov(d->storage_socket[1], lock_fd, &iov, 1, MSG_DONTWAIT); +} + +static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + + if (lock_fd < 0) + return; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); + (void) unlink(lock_path); + + (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */ +} + +static int lockfp(int fd, int *fd_lock) { + if (lockf(fd, F_LOCK, 0) < 0) + return -errno; + *fd_lock = fd; + return 0; +} + +static void unlockfp(int *fd_lock) { + if (*fd_lock < 0) + return; + lockf(*fd_lock, F_ULOCK, 0); + *fd_lock = -1; +} + +static int dynamic_user_realize( + DynamicUser *d, + char **suggested_dirs, + uid_t *ret_uid, gid_t *ret_gid, + bool is_user) { + + _cleanup_(unlockfp) int storage_socket0_lock = -1; + _cleanup_close_ int uid_lock_fd = -1; + _cleanup_close_ int etc_passwd_lock_fd = -1; + uid_t num = UID_INVALID; /* a uid if is_user, and a gid otherwise */ + gid_t gid = GID_INVALID; /* a gid if is_user, ignored otherwise */ + bool flush_cache = false; + int r; + + assert(d); + assert(is_user == !!ret_uid); + assert(ret_gid); + + /* Acquire a UID for the user name. This will allocate a UID for the user name if the user doesn't exist + * yet. If it already exists its existing UID/GID will be reused. 
*/ + + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) + return r; + + r = dynamic_user_pop(d, &num, &uid_lock_fd); + if (r < 0) { + int new_uid_lock_fd; + uid_t new_uid; + + if (r != -EAGAIN) + return r; + + /* OK, nothing stored yet, let's try to find something useful. While we are working on this release the + * lock however, so that nobody else blocks on our NSS lookups. */ + unlockfp(&storage_socket0_lock); + + /* Let's see if a proper, static user or group by this name exists. Try to take the lock on + * /etc/passwd, if that fails with EROFS then /etc is read-only. In that case it's fine if we don't + * take the lock, given that users can't be added there anyway in this case. */ + etc_passwd_lock_fd = take_etc_passwd_lock(NULL); + if (etc_passwd_lock_fd < 0 && etc_passwd_lock_fd != -EROFS) + return etc_passwd_lock_fd; + + /* First, let's parse this as numeric UID */ + r = parse_uid(d->name, &num); + if (r < 0) { + struct passwd *p; + struct group *g; + + if (is_user) { + /* OK, this is not a numeric UID. Let's see if there's a user by this name */ + p = getpwnam(d->name); + if (p) { + num = p->pw_uid; + gid = p->pw_gid; + } else { + /* if the user does not exist but the group with the same name exists, refuse operation */ + g = getgrnam(d->name); + if (g) + return -EILSEQ; + } + } else { + /* Let's see if there's a group by this name */ + g = getgrnam(d->name); + if (g) + num = (uid_t) g->gr_gid; + else { + /* if the group does not exist but the user with the same name exists, refuse operation */ + p = getpwnam(d->name); + if (p) + return -EILSEQ; + } + } + } + + if (num == UID_INVALID) { + /* No static UID assigned yet, excellent. Let's pick a new dynamic one, and lock it. */ + + uid_lock_fd = pick_uid(suggested_dirs, d->name, &num); + if (uid_lock_fd < 0) + return uid_lock_fd; + } + + /* So, we found a working UID/lock combination. Let's see if we actually still need it. 
*/ + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) { + unlink_uid_lock(uid_lock_fd, num, d->name); + return r; + } + + r = dynamic_user_pop(d, &new_uid, &new_uid_lock_fd); + if (r < 0) { + if (r != -EAGAIN) { + /* OK, something bad happened, let's get rid of the bits we acquired. */ + unlink_uid_lock(uid_lock_fd, num, d->name); + return r; + } + + /* Great! Nothing is stored here, still. Store our newly acquired data. */ + flush_cache = true; + } else { + /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we + * acquired, and use what's stored now. */ + + unlink_uid_lock(uid_lock_fd, num, d->name); + safe_close(uid_lock_fd); + + num = new_uid; + uid_lock_fd = new_uid_lock_fd; + } + } else if (is_user && !uid_is_dynamic(num)) { + struct passwd *p; + + /* Statically allocated user may have different uid and gid. So, let's obtain the gid. */ + errno = 0; + p = getpwuid(num); + if (!p) + return errno_or_else(ESRCH); + + gid = p->pw_gid; + } + + /* If the UID/GID was already allocated dynamically, push the data we popped out back in. If it was already + * allocated statically, push the UID back too, but do not push the lock fd in. If we allocated the UID + * dynamically right here, push that in along with the lock fd for it. */ + r = dynamic_user_push(d, num, uid_lock_fd); + if (r < 0) + return r; + + if (flush_cache) { + /* If we allocated a new dynamic UID, refresh nscd, so that it forgets about potentially cached + * negative entries. But let's do so after we release the /etc/passwd lock, so that there's no + * potential for nscd wanting to lock that for completing the invalidation. */ + etc_passwd_lock_fd = safe_close(etc_passwd_lock_fd); + (void) nscd_flush_cache(STRV_MAKE("passwd", "group")); + } + + if (is_user) { + *ret_uid = num; + *ret_gid = gid != GID_INVALID ? 
gid : num; + } else + *ret_gid = num; + + return 0; +} + +int dynamic_user_current(DynamicUser *d, uid_t *ret) { + _cleanup_(unlockfp) int storage_socket0_lock = -1; + _cleanup_close_ int lock_fd = -1; + uid_t uid; + int r; + + assert(d); + + /* Get the currently assigned UID for the user, if there's any. This simply pops the data from the storage socket, and pushes it back in right-away. */ + + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) + return r; + + r = dynamic_user_pop(d, &uid, &lock_fd); + if (r < 0) + return r; + + r = dynamic_user_push(d, uid, lock_fd); + if (r < 0) + return r; + + if (ret) + *ret = uid; + + return 0; +} + +static DynamicUser* dynamic_user_unref(DynamicUser *d) { + if (!d) + return NULL; + + /* Note that this doesn't actually release any resources itself. If a dynamic user should be fully destroyed + * and its UID released, use dynamic_user_destroy() instead. NB: the dynamic user table may contain entries + * with no references, which is commonly the case right before a daemon reload. */ + + assert(d->n_ref > 0); + d->n_ref--; + + return NULL; +} + +static int dynamic_user_close(DynamicUser *d) { + _cleanup_(unlockfp) int storage_socket0_lock = -1; + _cleanup_close_ int lock_fd = -1; + uid_t uid; + int r; + + /* Release the user ID, by releasing the lock on it, and emptying the storage socket. After this the user is + * unrealized again, much like it was after it the DynamicUser object was first allocated. */ + + r = lockfp(d->storage_socket[0], &storage_socket0_lock); + if (r < 0) + return r; + + r = dynamic_user_pop(d, &uid, &lock_fd); + if (r == -EAGAIN) + /* User wasn't realized yet, nothing to do. */ + return 0; + if (r < 0) + return r; + + /* This dynamic user was realized and dynamically allocated. In this case, let's remove the lock file. 
*/ + unlink_uid_lock(lock_fd, uid, d->name); + + (void) nscd_flush_cache(STRV_MAKE("passwd", "group")); + return 1; +} + +static DynamicUser* dynamic_user_destroy(DynamicUser *d) { + if (!d) + return NULL; + + /* Drop a reference to a DynamicUser object, and destroy the user completely if this was the last + * reference. This is called whenever a service is shut down and wants its dynamic UID gone. Note that + * dynamic_user_unref() is what is called whenever a service is simply freed, for example during a reload + * cycle, where the dynamic users should not be destroyed, but our datastructures should. */ + + dynamic_user_unref(d); + + if (d->n_ref > 0) + return NULL; + + (void) dynamic_user_close(d); + return dynamic_user_free(d); +} + +int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds) { + DynamicUser *d; + + assert(m); + assert(f); + assert(fds); + + /* Dump the dynamic user database into the manager serialization, to deal with daemon reloads. */ + + HASHMAP_FOREACH(d, m->dynamic_users) { + int copy0, copy1; + + copy0 = fdset_put_dup(fds, d->storage_socket[0]); + if (copy0 < 0) + return log_error_errno(copy0, "Failed to add dynamic user storage fd to serialization: %m"); + + copy1 = fdset_put_dup(fds, d->storage_socket[1]); + if (copy1 < 0) + return log_error_errno(copy1, "Failed to add dynamic user storage fd to serialization: %m"); + + (void) serialize_item_format(f, "dynamic-user", "%s %i %i", d->name, copy0, copy1); + } + + return 0; +} + +void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds) { + _cleanup_free_ char *name = NULL, *s0 = NULL, *s1 = NULL; + int r, fd0, fd1; + + assert(m); + assert(value); + assert(fds); + + /* Parse the serialization again, after a daemon reload */ + + r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL); + if (r != 3 || !isempty(value)) { + log_debug("Unable to parse dynamic user line."); + return; + } + + if (safe_atoi(s0, &fd0) < 0 || !fdset_contains(fds, fd0)) { + log_debug("Unable 
to process dynamic user fd specification."); + return; + } + + if (safe_atoi(s1, &fd1) < 0 || !fdset_contains(fds, fd1)) { + log_debug("Unable to process dynamic user fd specification."); + return; + } + + r = dynamic_user_add(m, name, (int[]) { fd0, fd1 }, NULL); + if (r < 0) { + log_debug_errno(r, "Failed to add dynamic user: %m"); + return; + } + + (void) fdset_remove(fds, fd0); + (void) fdset_remove(fds, fd1); +} + +void dynamic_user_vacuum(Manager *m, bool close_user) { + DynamicUser *d; + + assert(m); + + /* Empty the dynamic user database, optionally cleaning up orphaned dynamic users, i.e. destroy and free users + * to which no references exist. This is called after a daemon reload finished, in order to destroy users which + * might not be referenced anymore. */ + + HASHMAP_FOREACH(d, m->dynamic_users) { + if (d->n_ref > 0) + continue; + + if (close_user) { + log_debug("Removing orphaned dynamic user %s", d->name); + (void) dynamic_user_close(d); + } + + dynamic_user_free(d); + } +} + +int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret) { + char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_free_ char *user = NULL; + uid_t check_uid; + int r; + + assert(m); + assert(ret); + + /* A friendly way to translate a dynamic user's UID into a name. 
*/ + if (!uid_is_dynamic(uid)) + return -ESRCH; + + xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); + r = read_one_line_file(lock_path, &user); + if (IN_SET(r, -ENOENT, 0)) + return -ESRCH; + if (r < 0) + return r; + + /* The lock file might be stale, hence let's verify the data before we return it */ + r = dynamic_user_lookup_name(m, user, &check_uid); + if (r < 0) + return r; + if (check_uid != uid) /* lock file doesn't match our own idea */ + return -ESRCH; + + *ret = TAKE_PTR(user); + + return 0; +} + +int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) { + DynamicUser *d; + int r; + + assert(m); + assert(name); + + /* A friendly call for translating a dynamic user's name into its UID */ + + d = hashmap_get(m->dynamic_users, name); + if (!d) + return -ESRCH; + + r = dynamic_user_current(d, ret); + if (r == -EAGAIN) /* not realized yet? */ + return -ESRCH; + + return r; +} + +int dynamic_creds_acquire(DynamicCreds *creds, Manager *m, const char *user, const char *group) { + bool acquired = false; + int r; + + assert(creds); + assert(m); + + /* A DynamicUser object encapsulates an allocation of both a UID and a GID for a specific name. However, some + * services use different user and groups. For cases like that there's DynamicCreds containing a pair of user + * and group. This call allocates a pair. 
*/ + + if (!creds->user && user) { + r = dynamic_user_acquire(m, user, &creds->user); + if (r < 0) + return r; + + acquired = true; + } + + if (!creds->group) { + + if (creds->user && (!group || streq_ptr(user, group))) + creds->group = dynamic_user_ref(creds->user); + else if (group) { + r = dynamic_user_acquire(m, group, &creds->group); + if (r < 0) { + if (acquired) + creds->user = dynamic_user_unref(creds->user); + return r; + } + } + } + + return 0; +} + +int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid) { + uid_t u = UID_INVALID; + gid_t g = GID_INVALID; + int r; + + assert(creds); + assert(uid); + assert(gid); + + /* Realize both the referenced user and group */ + + if (creds->user) { + r = dynamic_user_realize(creds->user, suggested_paths, &u, &g, true); + if (r < 0) + return r; + } + + if (creds->group && creds->group != creds->user) { + r = dynamic_user_realize(creds->group, suggested_paths, NULL, &g, false); + if (r < 0) + return r; + } + + *uid = u; + *gid = g; + return 0; +} + +void dynamic_creds_unref(DynamicCreds *creds) { + assert(creds); + + creds->user = dynamic_user_unref(creds->user); + creds->group = dynamic_user_unref(creds->group); +} + +void dynamic_creds_destroy(DynamicCreds *creds) { + assert(creds); + + creds->user = dynamic_user_destroy(creds->user); + creds->group = dynamic_user_destroy(creds->group); +} diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h new file mode 100644 index 0000000..847ef47 --- /dev/null +++ b/src/core/dynamic-user.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct DynamicUser DynamicUser; + +typedef struct DynamicCreds { + /* A combination of a dynamic user and group */ + DynamicUser *user; + DynamicUser *group; +} DynamicCreds; + +#include "manager.h" + +/* Note that this object always allocates a pair of user and group under the same name, even if one of them isn't + * used. 
This means, if you want to allocate a group and user pair, and they might have two different names, then you + * need to allocate two of these objects. DynamicCreds above makes that easy. */ +struct DynamicUser { + Manager *manager; + unsigned n_ref; + + /* An AF_UNIX socket pair that contains a datagram containing both the numeric ID assigned, as well as a lock + * file fd locking the user ID we picked. */ + int storage_socket[2]; + + char name[]; +}; + +int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds); +void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds); +void dynamic_user_vacuum(Manager *m, bool close_user); + +int dynamic_user_current(DynamicUser *d, uid_t *ret); +int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret); +int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret); + +int dynamic_creds_acquire(DynamicCreds *creds, Manager *m, const char *user, const char *group); +int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid); + +void dynamic_creds_unref(DynamicCreds *creds); +void dynamic_creds_destroy(DynamicCreds *creds); diff --git a/src/core/efi-random.c b/src/core/efi-random.c new file mode 100644 index 0000000..2bc74fa --- /dev/null +++ b/src/core/efi-random.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "chattr-util.h" +#include "efi-random.h" +#include "efivars.h" +#include "fd-util.h" +#include "fs-util.h" +#include "random-util.h" +#include "strv.h" + +/* If a random seed was passed by the boot loader in the LoaderRandomSeed EFI variable, let's credit it to + * the kernel's random pool, but only once per boot. If this is run very early during initialization we can + * instantly boot up with a filled random pool. 
+ * + * This makes no judgement on the entropy passed, it's the job of the boot loader to only pass us a seed that + * is suitably validated. */ + +static void lock_down_efi_variables(void) { + const char *p; + int r; + + /* Paranoia: let's restrict access modes of these a bit, so that unprivileged users can't use them to + * identify the system or gain too much insight into what we might have credited to the entropy + * pool. */ + FOREACH_STRING(p, + "/sys/firmware/efi/efivars/LoaderRandomSeed-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f", + "/sys/firmware/efi/efivars/LoaderSystemToken-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f") { + + r = chattr_path(p, 0, FS_IMMUTABLE_FL, NULL); + if (r == -ENOENT) + continue; + if (r < 0) + log_warning_errno(r, "Failed to drop FS_IMMUTABLE_FL from %s, ignoring: %m", p); + + if (chmod(p, 0600) < 0) + log_warning_errno(errno, "Failed to reduce access mode of %s, ignoring: %m", p); + } +} + +int efi_take_random_seed(void) { + _cleanup_free_ void *value = NULL; + _cleanup_close_ int random_fd = -1; + size_t size; + int r; + + /* Paranoia comes first. */ + lock_down_efi_variables(); + + if (access("/run/systemd/efi-random-seed-taken", F_OK) < 0) { + if (errno != ENOENT) { + log_warning_errno(errno, "Failed to determine whether we already used the random seed token, not using it."); + return 0; + } + + /* ENOENT means we haven't used it yet. 
*/ + } else { + log_debug("EFI random seed already used, not using again."); + return 0; + } + + r = efi_get_variable(EFI_VENDOR_LOADER, "LoaderRandomSeed", NULL, &value, &size); + if (r == -EOPNOTSUPP) { + log_debug_errno(r, "System lacks EFI support, not initializing random seed from EFI variable."); + return 0; + } + if (r == -ENOENT) { + log_debug_errno(r, "Boot loader did not pass LoaderRandomSeed EFI variable, not crediting any entropy."); + return 0; + } + if (r < 0) + return log_warning_errno(r, "Failed to read LoaderRandomSeed EFI variable, ignoring: %m"); + + if (size == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Random seed passed from boot loader has zero size? Ignoring."); + + random_fd = open("/dev/urandom", O_WRONLY|O_CLOEXEC|O_NOCTTY); + if (random_fd < 0) + return log_warning_errno(errno, "Failed to open /dev/urandom for writing, ignoring: %m"); + + /* Before we use the seed, let's mark it as used, so that we never credit it twice. Also, it's a nice + * way to let users know that we successfully acquired entropy from the boot loader. 
*/ + r = touch("/run/systemd/efi-random-seed-taken"); + if (r < 0) + return log_warning_errno(r, "Unable to mark EFI random seed as used, not using it: %m"); + + r = random_write_entropy(random_fd, value, size, true); + if (r < 0) + return log_warning_errno(errno, "Failed to credit entropy, ignoring: %m"); + + log_info("Successfully credited entropy passed from boot loader."); + return 1; +} diff --git a/src/core/efi-random.h b/src/core/efi-random.h new file mode 100644 index 0000000..7d20fff --- /dev/null +++ b/src/core/efi-random.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int efi_take_random_seed(void); diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c new file mode 100644 index 0000000..9e8c79e --- /dev/null +++ b/src/core/emergency-action.c @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/reboot.h> + +#include "bus-error.h" +#include "bus-util.h" +#include "emergency-action.h" +#include "raw-reboot.h" +#include "reboot-util.h" +#include "special.h" +#include "string-table.h" +#include "terminal-util.h" +#include "virt.h" + +static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = { + [EMERGENCY_ACTION_NONE] = "none", + [EMERGENCY_ACTION_REBOOT] = "reboot", + [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force", + [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate", + [EMERGENCY_ACTION_POWEROFF] = "poweroff", + [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force", + [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate", + [EMERGENCY_ACTION_EXIT] = "exit", + [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force", +}; + +static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) { + log_full(warn ? LOG_WARNING : LOG_DEBUG, "%s: %s", message, reason); + if (warn) + manager_status_printf(m, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED " !! 
" ANSI_NORMAL, + "%s: %s", message, reason); +} + +void emergency_action( + Manager *m, + EmergencyAction action, + EmergencyActionFlags options, + const char *reboot_arg, + int exit_status, + const char *reason) { + + Unit *u; + + assert(m); + assert(action >= 0); + assert(action < _EMERGENCY_ACTION_MAX); + + /* Is the special shutdown target active or queued? If so, we are in shutdown state */ + if (IN_SET(action, EMERGENCY_ACTION_REBOOT, EMERGENCY_ACTION_POWEROFF, EMERGENCY_ACTION_EXIT)) { + u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET); + if (u && unit_active_or_pending(u)) { + log_notice("Shutdown is already active. Skipping emergency action request %s.", + emergency_action_table[action]); + return; + } + } + + if (action == EMERGENCY_ACTION_NONE) + return; + + if (FLAGS_SET(options, EMERGENCY_ACTION_IS_WATCHDOG) && !m->service_watchdogs) { + log_warning("Watchdog disabled! Not acting on: %s", reason); + return; + } + + bool warn = FLAGS_SET(options, EMERGENCY_ACTION_WARN); + + switch (action) { + + case EMERGENCY_ACTION_REBOOT: + log_and_status(m, warn, "Rebooting", reason); + + (void) update_reboot_parameter_and_warn(reboot_arg, true); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_REBOOT_FORCE: + log_and_status(m, warn, "Forcibly rebooting", reason); + + (void) update_reboot_parameter_and_warn(reboot_arg, true); + m->objective = MANAGER_REBOOT; + + break; + + case EMERGENCY_ACTION_REBOOT_IMMEDIATE: + log_and_status(m, warn, "Rebooting immediately", reason); + + sync(); + + if (!isempty(reboot_arg)) { + log_info("Rebooting with argument '%s'.", reboot_arg); + (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, reboot_arg); + log_warning_errno(errno, "Failed to reboot with parameter, retrying without: %m"); + } + + log_info("Rebooting."); + (void) reboot(RB_AUTOBOOT); + break; + + case EMERGENCY_ACTION_EXIT: + + if (exit_status >= 0) + m->return_value = 
exit_status; + + if (MANAGER_IS_USER(m) || detect_container() > 0) { + log_and_status(m, warn, "Exiting", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + } + + log_notice("Doing \"poweroff\" action instead of an \"exit\" emergency action."); + _fallthrough_; + + case EMERGENCY_ACTION_POWEROFF: + log_and_status(m, warn, "Powering off", reason); + (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL); + break; + + case EMERGENCY_ACTION_EXIT_FORCE: + + if (exit_status >= 0) + m->return_value = exit_status; + + if (MANAGER_IS_USER(m) || detect_container() > 0) { + log_and_status(m, warn, "Exiting immediately", reason); + m->objective = MANAGER_EXIT; + break; + } + + log_notice("Doing \"poweroff-force\" action instead of an \"exit-force\" emergency action."); + _fallthrough_; + + case EMERGENCY_ACTION_POWEROFF_FORCE: + log_and_status(m, warn, "Forcibly powering off", reason); + m->objective = MANAGER_POWEROFF; + break; + + case EMERGENCY_ACTION_POWEROFF_IMMEDIATE: + log_and_status(m, warn, "Powering off immediately", reason); + + sync(); + + log_info("Powering off."); + (void) reboot(RB_POWER_OFF); + break; + + default: + assert_not_reached("Unknown emergency action"); + } +} + +DEFINE_STRING_TABLE_LOOKUP(emergency_action, EmergencyAction); + +int parse_emergency_action( + const char *value, + bool system, + EmergencyAction *ret) { + + EmergencyAction x; + + x = emergency_action_from_string(value); + if (x < 0) + return -EINVAL; + + if (!system && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION) + return -EOPNOTSUPP; + + *ret = x; + return 0; +} diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h new file mode 100644 index 0000000..95d49a8 --- /dev/null +++ b/src/core/emergency-action.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef enum 
EmergencyAction { + EMERGENCY_ACTION_NONE, + EMERGENCY_ACTION_REBOOT, + EMERGENCY_ACTION_REBOOT_FORCE, + EMERGENCY_ACTION_REBOOT_IMMEDIATE, + EMERGENCY_ACTION_POWEROFF, + EMERGENCY_ACTION_POWEROFF_FORCE, + EMERGENCY_ACTION_POWEROFF_IMMEDIATE, + EMERGENCY_ACTION_EXIT, + _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT, + EMERGENCY_ACTION_EXIT_FORCE, + _EMERGENCY_ACTION_MAX, + _EMERGENCY_ACTION_INVALID = -1 +} EmergencyAction; + +typedef enum EmergencyActionFlags { + EMERGENCY_ACTION_IS_WATCHDOG = 1 << 0, + EMERGENCY_ACTION_WARN = 1 << 1, +} EmergencyActionFlags; + +#include "macro.h" +#include "manager.h" + +void emergency_action(Manager *m, + EmergencyAction action, EmergencyActionFlags options, + const char *reboot_arg, int exit_status, const char *reason); + +const char* emergency_action_to_string(EmergencyAction i) _const_; +EmergencyAction emergency_action_from_string(const char *s) _pure_; + +int parse_emergency_action(const char *value, bool system, EmergencyAction *ret); diff --git a/src/core/execute.c b/src/core/execute.c new file mode 100644 index 0000000..c992b8d --- /dev/null +++ b/src/core/execute.c @@ -0,0 +1,6512 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/personality.h> +#include <sys/prctl.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/un.h> +#include <unistd.h> +#include <utmpx.h> + +#if HAVE_PAM +#include <security/pam_appl.h> +#endif + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif + +#if HAVE_SECCOMP +#include <seccomp.h> +#endif + +#if HAVE_APPARMOR +#include <sys/apparmor.h> +#endif + +#include "sd-messages.h" + +#include "acl-util.h" +#include "af-list.h" +#include "alloc-util.h" +#if HAVE_APPARMOR +#include "apparmor-util.h" +#endif +#include "async.h" +#include "barrier.h" +#include "cap-list.h" +#include "capability-util.h" 
+#include "cgroup-setup.h" +#include "chown-recursive.h" +#include "cpu-set-util.h" +#include "def.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-list.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "ioprio.h" +#include "label.h" +#include "log.h" +#include "macro.h" +#include "manager.h" +#include "memory-util.h" +#include "missing_fs.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "securebits-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "smack-util.h" +#include "socket-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "unit.h" +#include "user-util.h" +#include "utmp-wtmp.h" + +#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) +#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) + +#define SNDBUF_SIZE (8*1024*1024) + +static int shift_fds(int fds[], size_t n_fds) { + if (n_fds <= 0) + return 0; + + /* Modifies the fds array! (sorts it) */ + + assert(fds); + + for (int start = 0;;) { + int restart_from = -1; + + for (int i = start; i < (int) n_fds; i++) { + int nfd; + + /* Already at right index? */ + if (fds[i] == i+3) + continue; + + nfd = fcntl(fds[i], F_DUPFD, i + 3); + if (nfd < 0) + return -errno; + + safe_close(fds[i]); + fds[i] = nfd; + + /* Hmm, the fd we wanted isn't free? 
Then + * let's remember that and try again from here */ + if (nfd != i+3 && restart_from < 0) + restart_from = i; + } + + if (restart_from < 0) + break; + + start = restart_from; + } + + return 0; +} + +static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) { + size_t n_fds; + int r; + + n_fds = n_socket_fds + n_storage_fds; + if (n_fds <= 0) + return 0; + + assert(fds); + + /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags. + * O_NONBLOCK only applies to socket activation though. */ + + for (size_t i = 0; i < n_fds; i++) { + + if (i < n_socket_fds) { + r = fd_nonblock(fds[i], nonblock); + if (r < 0) + return r; + } + + /* We unconditionally drop FD_CLOEXEC from the fds, + * since after all we want to pass these fds to our + * children */ + + r = fd_cloexec(fds[i], false); + if (r < 0) + return r; + } + + return 0; +} + +static const char *exec_context_tty_path(const ExecContext *context) { + assert(context); + + if (context->stdio_as_fds) + return NULL; + + if (context->tty_path) + return context->tty_path; + + return "/dev/console"; +} + +static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) { + const char *path; + + assert(context); + + path = exec_context_tty_path(context); + + if (context->tty_vhangup) { + if (p && p->stdin_fd >= 0) + (void) terminal_vhangup_fd(p->stdin_fd); + else if (path) + (void) terminal_vhangup(path); + } + + if (context->tty_reset) { + if (p && p->stdin_fd >= 0) + (void) reset_terminal_fd(p->stdin_fd, true); + else if (path) + (void) reset_terminal(path); + } + + if (context->tty_vt_disallocate && path) + (void) vt_disallocate(path); +} + +static bool is_terminal_input(ExecInput i) { + return IN_SET(i, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL); +} + +static bool is_terminal_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE); +} + +static bool 
is_kmsg_output(ExecOutput o) { + return IN_SET(o, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_KMSG_AND_CONSOLE); +} + +static bool exec_context_needs_term(const ExecContext *c) { + assert(c); + + /* Return true if the execution context suggests we should set $TERM to something useful. */ + + if (is_terminal_input(c->std_input)) + return true; + + if (is_terminal_output(c->std_output)) + return true; + + if (is_terminal_output(c->std_error)) + return true; + + return !!c->tty_path; +} + +static int open_null_as(int flags, int nfd) { + int fd; + + assert(nfd >= 0); + + fd = open("/dev/null", flags|O_NOCTTY); + if (fd < 0) + return -errno; + + return move_fd(fd, nfd, false); +} + +static int connect_journal_socket( + int fd, + const char *log_namespace, + uid_t uid, + gid_t gid) { + + union sockaddr_union sa; + socklen_t sa_len; + uid_t olduid = UID_INVALID; + gid_t oldgid = GID_INVALID; + const char *j; + int r; + + j = log_namespace ? + strjoina("/run/systemd/journal.", log_namespace, "/stdout") : + "/run/systemd/journal/stdout"; + r = sockaddr_un_set_path(&sa.un, j); + if (r < 0) + return r; + sa_len = r; + + if (gid_is_valid(gid)) { + oldgid = getgid(); + + if (setegid(gid) < 0) + return -errno; + } + + if (uid_is_valid(uid)) { + olduid = getuid(); + + if (seteuid(uid) < 0) { + r = -errno; + goto restore_gid; + } + } + + r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0; + + /* If we fail to restore the uid or gid, things will likely + fail later on. This should only happen if an LSM interferes. 
*/ + + if (uid_is_valid(uid)) + (void) seteuid(olduid); + + restore_gid: + if (gid_is_valid(gid)) + (void) setegid(oldgid); + + return r; +} + +static int connect_logger_as( + const Unit *unit, + const ExecContext *context, + const ExecParameters *params, + ExecOutput output, + const char *ident, + int nfd, + uid_t uid, + gid_t gid) { + + _cleanup_close_ int fd = -1; + int r; + + assert(context); + assert(params); + assert(output < _EXEC_OUTPUT_MAX); + assert(ident); + assert(nfd >= 0); + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -errno; + + r = connect_journal_socket(fd, context->log_namespace, uid, gid); + if (r < 0) + return r; + + if (shutdown(fd, SHUT_RD) < 0) + return -errno; + + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + if (dprintf(fd, + "%s\n" + "%s\n" + "%i\n" + "%i\n" + "%i\n" + "%i\n" + "%i\n", + context->syslog_identifier ?: ident, + params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "", + context->syslog_priority, + !!context->syslog_level_prefix, + false, + is_kmsg_output(output), + is_terminal_output(output)) < 0) + return -errno; + + return move_fd(TAKE_FD(fd), nfd, false); +} + +static int open_terminal_as(const char *path, int flags, int nfd) { + int fd; + + assert(path); + assert(nfd >= 0); + + fd = open_terminal(path, flags | O_NOCTTY); + if (fd < 0) + return fd; + + return move_fd(fd, nfd, false); +} + +static int acquire_path(const char *path, int flags, mode_t mode) { + union sockaddr_union sa; + socklen_t sa_len; + _cleanup_close_ int fd = -1; + int r; + + assert(path); + + if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR)) + flags |= O_CREAT; + + fd = open(path, flags|O_NOCTTY, mode); + if (fd >= 0) + return TAKE_FD(fd); + + if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */ + return -errno; + + /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. 
*/ + + r = sockaddr_un_set_path(&sa.un, path); + if (r < 0) + return r == -EINVAL ? -ENXIO : r; + sa_len = r; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -errno; + + if (connect(fd, &sa.sa, sa_len) < 0) + return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have + * indication that this wasn't an AF_UNIX socket after all */ + + if ((flags & O_ACCMODE) == O_RDONLY) + r = shutdown(fd, SHUT_WR); + else if ((flags & O_ACCMODE) == O_WRONLY) + r = shutdown(fd, SHUT_RD); + else + r = 0; + if (r < 0) + return -errno; + + return TAKE_FD(fd); +} + +static int fixup_input( + const ExecContext *context, + int socket_fd, + bool apply_tty_stdin) { + + ExecInput std_input; + + assert(context); + + std_input = context->std_input; + + if (is_terminal_input(std_input) && !apply_tty_stdin) + return EXEC_INPUT_NULL; + + if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0) + return EXEC_INPUT_NULL; + + if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0) + return EXEC_INPUT_NULL; + + return std_input; +} + +static int fixup_output(ExecOutput std_output, int socket_fd) { + + if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0) + return EXEC_OUTPUT_INHERIT; + + return std_output; +} + +static int setup_input( + const ExecContext *context, + const ExecParameters *params, + int socket_fd, + const int named_iofds[static 3]) { + + ExecInput i; + + assert(context); + assert(params); + assert(named_iofds); + + if (params->stdin_fd >= 0) { + if (dup2(params->stdin_fd, STDIN_FILENO) < 0) + return -errno; + + /* Try to make this the controlling tty, if it is a tty, and reset it */ + if (isatty(STDIN_FILENO)) { + (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE); + (void) reset_terminal_fd(STDIN_FILENO, true); + } + + return STDIN_FILENO; + } + + i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); + + switch (i) { + + case EXEC_INPUT_NULL: + return 
open_null_as(O_RDONLY, STDIN_FILENO); + + case EXEC_INPUT_TTY: + case EXEC_INPUT_TTY_FORCE: + case EXEC_INPUT_TTY_FAIL: { + int fd; + + fd = acquire_terminal(exec_context_tty_path(context), + i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY : + i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE : + ACQUIRE_TERMINAL_WAIT, + USEC_INFINITY); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + case EXEC_INPUT_SOCKET: + assert(socket_fd >= 0); + + return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; + + case EXEC_INPUT_NAMED_FD: + assert(named_iofds[STDIN_FILENO] >= 0); + + (void) fd_nonblock(named_iofds[STDIN_FILENO], false); + return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; + + case EXEC_INPUT_DATA: { + int fd; + + fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + case EXEC_INPUT_FILE: { + bool rw; + int fd; + + assert(context->stdio_file[STDIN_FILENO]); + + rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) || + (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO])); + + fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? 
O_RDWR : O_RDONLY, 0666 & ~context->umask); + if (fd < 0) + return fd; + + return move_fd(fd, STDIN_FILENO, false); + } + + default: + assert_not_reached("Unknown input type"); + } +} + +static bool can_inherit_stderr_from_stdout( + const ExecContext *context, + ExecOutput o, + ExecOutput e) { + + assert(context); + + /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the + * stderr fd */ + + if (e == EXEC_OUTPUT_INHERIT) + return true; + if (e != o) + return false; + + if (e == EXEC_OUTPUT_NAMED_FD) + return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]); + + if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND)) + return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]); + + return true; +} + +static int setup_output( + const Unit *unit, + const ExecContext *context, + const ExecParameters *params, + int fileno, + int socket_fd, + const int named_iofds[static 3], + const char *ident, + uid_t uid, + gid_t gid, + dev_t *journal_stream_dev, + ino_t *journal_stream_ino) { + + ExecOutput o; + ExecInput i; + int r; + + assert(unit); + assert(context); + assert(params); + assert(ident); + assert(journal_stream_dev); + assert(journal_stream_ino); + + if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) { + + if (dup2(params->stdout_fd, STDOUT_FILENO) < 0) + return -errno; + + return STDOUT_FILENO; + } + + if (fileno == STDERR_FILENO && params->stderr_fd >= 0) { + if (dup2(params->stderr_fd, STDERR_FILENO) < 0) + return -errno; + + return STDERR_FILENO; + } + + i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); + o = fixup_output(context->std_output, socket_fd); + + if (fileno == STDERR_FILENO) { + ExecOutput e; + e = fixup_output(context->std_error, socket_fd); + + /* This expects the input and output are already set up */ + + /* Don't change the stderr file descriptor if we inherit all + * the way and are not on a tty 
*/ + if (e == EXEC_OUTPUT_INHERIT && + o == EXEC_OUTPUT_INHERIT && + i == EXEC_INPUT_NULL && + !is_terminal_input(context->std_input) && + getppid () != 1) + return fileno; + + /* Duplicate from stdout if possible */ + if (can_inherit_stderr_from_stdout(context, o, e)) + return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno; + + o = e; + + } else if (o == EXEC_OUTPUT_INHERIT) { + /* If input got downgraded, inherit the original value */ + if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input)) + return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); + + /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */ + if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA)) + return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; + + /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */ + if (getppid() != 1) + return fileno; + + /* We need to open /dev/null here anew, to get the right access mode. */ + return open_null_as(O_WRONLY, fileno); + } + + switch (o) { + + case EXEC_OUTPUT_NULL: + return open_null_as(O_WRONLY, fileno); + + case EXEC_OUTPUT_TTY: + if (is_terminal_input(i)) + return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; + + /* We don't reset the terminal if this is just about output */ + return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); + + case EXEC_OUTPUT_KMSG: + case EXEC_OUTPUT_KMSG_AND_CONSOLE: + case EXEC_OUTPUT_JOURNAL: + case EXEC_OUTPUT_JOURNAL_AND_CONSOLE: + r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid); + if (r < 0) { + log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr"); + r = open_null_as(O_WRONLY, fileno); + } else { + struct stat st; + + /* If we connected this fd to the journal via a stream, patch the device/inode into the passed + * parameters, but only then. 
This is useful so that we can set $JOURNAL_STREAM that permits + * services to detect whether they are connected to the journal or not. + * + * If both stdout and stderr are connected to a stream then let's make sure to store the data + * about STDERR as that's usually the best way to do logging. */ + + if (fstat(fileno, &st) >= 0 && + (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) { + *journal_stream_dev = st.st_dev; + *journal_stream_ino = st.st_ino; + } + } + return r; + + case EXEC_OUTPUT_SOCKET: + assert(socket_fd >= 0); + + return dup2(socket_fd, fileno) < 0 ? -errno : fileno; + + case EXEC_OUTPUT_NAMED_FD: + assert(named_iofds[fileno] >= 0); + + (void) fd_nonblock(named_iofds[fileno], false); + return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno; + + case EXEC_OUTPUT_FILE: + case EXEC_OUTPUT_FILE_APPEND: { + bool rw; + int fd, flags; + + assert(context->stdio_file[fileno]); + + rw = context->std_input == EXEC_INPUT_FILE && + streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]); + + if (rw) + return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; + + flags = O_WRONLY; + if (o == EXEC_OUTPUT_FILE_APPEND) + flags |= O_APPEND; + + fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask); + if (fd < 0) + return fd; + + return move_fd(fd, fileno, 0); + } + + default: + assert_not_reached("Unknown error type"); + } +} + +static int chown_terminal(int fd, uid_t uid) { + int r; + + assert(fd >= 0); + + /* Before we chown/chmod the TTY, let's ensure this is actually a tty */ + if (isatty(fd) < 1) { + if (IN_SET(errno, EINVAL, ENOTTY)) + return 0; /* not a tty */ + + return -errno; + } + + /* This might fail. What matters are the results. 
*/ + r = fchmod_and_chown(fd, TTY_MODE, uid, -1); + if (r < 0) + return r; + + return 1; +} + +static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) { + _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1; + int r; + + assert(_saved_stdin); + assert(_saved_stdout); + + saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3); + if (saved_stdin < 0) + return -errno; + + saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3); + if (saved_stdout < 0) + return -errno; + + fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC); + if (fd < 0) + return fd; + + r = chown_terminal(fd, getuid()); + if (r < 0) + return r; + + r = reset_terminal_fd(fd, true); + if (r < 0) + return r; + + r = rearrange_stdio(fd, fd, STDERR_FILENO); + fd = -1; + if (r < 0) + return r; + + *_saved_stdin = saved_stdin; + *_saved_stdout = saved_stdout; + + saved_stdin = saved_stdout = -1; + + return 0; +} + +static void write_confirm_error_fd(int err, int fd, const Unit *u) { + assert(err < 0); + + if (err == -ETIMEDOUT) + dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id); + else { + errno = -err; + dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id); + } +} + +static void write_confirm_error(int err, const char *vc, const Unit *u) { + _cleanup_close_ int fd = -1; + + assert(vc); + + fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return; + + write_confirm_error_fd(err, fd, u); +} + +static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) { + int r = 0; + + assert(saved_stdin); + assert(saved_stdout); + + release_terminal(); + + if (*saved_stdin >= 0) + if (dup2(*saved_stdin, STDIN_FILENO) < 0) + r = -errno; + + if (*saved_stdout >= 0) + if (dup2(*saved_stdout, STDOUT_FILENO) < 0) + r = -errno; + + *saved_stdin = safe_close(*saved_stdin); + *saved_stdout = safe_close(*saved_stdout); + + return r; +} + +enum { + CONFIRM_PRETEND_FAILURE = -1, + 
CONFIRM_PRETEND_SUCCESS = 0, + CONFIRM_EXECUTE = 1, +}; + +static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) { + int saved_stdout = -1, saved_stdin = -1, r; + _cleanup_free_ char *e = NULL; + char c; + + /* For any internal errors, assume a positive response. */ + r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout); + if (r < 0) { + write_confirm_error(r, vc, u); + return CONFIRM_EXECUTE; + } + + /* confirm_spawn might have been disabled while we were sleeping. */ + if (manager_is_confirm_spawn_disabled(u->manager)) { + r = 1; + goto restore_stdio; + } + + e = ellipsize(cmdline, 60, 100); + if (!e) { + log_oom(); + r = CONFIRM_EXECUTE; + goto restore_stdio; + } + + for (;;) { + r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e); + if (r < 0) { + write_confirm_error_fd(r, STDOUT_FILENO, u); + r = CONFIRM_EXECUTE; + goto restore_stdio; + } + + switch (c) { + case 'c': + printf("Resuming normal execution.\n"); + manager_disable_confirm_spawn(); + r = 1; + break; + case 'D': + unit_dump(u, stdout, " "); + continue; /* ask again */ + case 'f': + printf("Failing execution.\n"); + r = CONFIRM_PRETEND_FAILURE; + break; + case 'h': + printf(" c - continue, proceed without asking anymore\n" + " D - dump, show the state of the unit\n" + " f - fail, don't execute the command and pretend it failed\n" + " h - help\n" + " i - info, show a short summary of the unit\n" + " j - jobs, show jobs that are in progress\n" + " s - skip, don't execute the command and pretend it succeeded\n" + " y - yes, execute the command\n"); + continue; /* ask again */ + case 'i': + printf(" Description: %s\n" + " Unit: %s\n" + " Command: %s\n", + u->id, u->description, cmdline); + continue; /* ask again */ + case 'j': + manager_dump_jobs(u->manager, stdout, " "); + continue; /* ask again */ + case 'n': + /* 'n' was removed in favor of 'f'. 
*/ + printf("Didn't understand 'n', did you mean 'f'?\n"); + continue; /* ask again */ + case 's': + printf("Skipping execution.\n"); + r = CONFIRM_PRETEND_SUCCESS; + break; + case 'y': + r = CONFIRM_EXECUTE; + break; + default: + assert_not_reached("Unhandled choice"); + } + break; + } + +restore_stdio: + restore_confirm_stdio(&saved_stdin, &saved_stdout); + return r; +} + +static int get_fixed_user(const ExecContext *c, const char **user, + uid_t *uid, gid_t *gid, + const char **home, const char **shell) { + int r; + const char *name; + + assert(c); + + if (!c->user) + return 0; + + /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway + * (i.e. are "/" or "/bin/nologin"). */ + + name = c->user; + r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN); + if (r < 0) + return r; + + *user = name; + return 0; +} + +static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) { + int r; + const char *name; + + assert(c); + + if (!c->group) + return 0; + + name = c->group; + r = get_group_creds(&name, gid, 0); + if (r < 0) + return r; + + *group = name; + return 0; +} + +static int get_supplementary_groups(const ExecContext *c, const char *user, + const char *group, gid_t gid, + gid_t **supplementary_gids, int *ngids) { + char **i; + int r, k = 0; + int ngroups_max; + bool keep_groups = false; + gid_t *groups = NULL; + _cleanup_free_ gid_t *l_gids = NULL; + + assert(c); + + /* + * If user is given, then lookup GID and supplementary groups list. + * We avoid NSS lookups for gid=0. Also we have to initialize groups + * here and as early as possible so we keep the list of supplementary + * groups of the caller. 
+ */ + if (user && gid_is_valid(gid) && gid != 0) { + /* First step, initialize groups from /etc/group */ + if (initgroups(user, gid) < 0) + return -errno; + + keep_groups = true; /* remember that the list set up by initgroups() has to be retained below */ + } + + if (strv_isempty(c->supplementary_groups)) + return 0; + + /* + * If SupplementaryGroups= was passed then NGROUPS_MAX has to + * be positive, otherwise fail. + */ + errno = 0; /* reset, so that errno_or_else() below only sees an error set by sysconf() itself */ + ngroups_max = (int) sysconf(_SC_NGROUPS_MAX); + if (ngroups_max <= 0) + return errno_or_else(EOPNOTSUPP); + + l_gids = new(gid_t, ngroups_max); + if (!l_gids) + return -ENOMEM; + + if (keep_groups) { + /* + * Lookup the list of groups that the user belongs to, we + * avoid NSS lookups here too for gid=0. + */ + k = ngroups_max; /* in: capacity of l_gids[]; out: number of entries getgrouplist() stored */ + if (getgrouplist(user, gid, l_gids, &k) < 0) + return -EINVAL; + } else + k = 0; + + STRV_FOREACH(i, c->supplementary_groups) { + const char *g; + + if (k >= ngroups_max) /* no room left in l_gids[] for another entry */ + return -E2BIG; + + g = *i; + r = get_group_creds(&g, l_gids+k, 0); /* resolved GID is appended right behind the entries collected so far */ + if (r < 0) + return r; + + k++; + } + + /* + * Sets ngids to zero to drop all supplementary groups, happens + * when we are under root and SupplementaryGroups= is empty. 
+ */ + if (k == 0) { + *ngids = 0; + return 0; + } + + /* Otherwise get the final list of supplementary groups */ + groups = memdup(l_gids, sizeof(gid_t) * k); + if (!groups) + return -ENOMEM; + + *supplementary_gids = groups; + *ngids = k; + + groups = NULL; + + return 0; +} + +static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) { + int r; + + /* Handle SupplementaryGroups= if it is not empty */ + if (ngids > 0) { + r = maybe_setgroups(ngids, supplementary_gids); + if (r < 0) + return r; + } + + if (gid_is_valid(gid)) { + /* Then set our gids */ + if (setresgid(gid, gid, gid) < 0) + return -errno; + } + + return 0; +} + +static int set_securebits(int bits, int mask) { + int current, applied; + current = prctl(PR_GET_SECUREBITS); + if (current < 0) + return -errno; + /* Clear all securebits defined in mask and set bits */ + applied = (current & ~mask) | bits; + if (current == applied) + return 0; + if (prctl(PR_SET_SECUREBITS, applied) < 0) + return -errno; + return 1; +} + +static int enforce_user(const ExecContext *context, uid_t uid) { + assert(context); + int r; + + if (!uid_is_valid(uid)) + return 0; + + /* Sets (but doesn't look up) the uid and make sure we keep the + * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is + * required, so we also need keep-caps in this case. + */ + + if (context->capability_ambient_set != 0 || context->secure_bits != 0) { + + /* First step: If we need to keep capabilities but + * drop privileges we need to make sure we keep our + * caps, while we drop privileges. */ + if (uid != 0) { + /* Add KEEP_CAPS to the securebits */ + r = set_securebits(1<<SECURE_KEEP_CAPS, 0); + if (r < 0) + return r; + } + } + + /* Second step: actually set the uids */ + if (setresuid(uid, uid, uid) < 0) + return -errno; + + /* At this point we should have all necessary capabilities but + are otherwise a normal user. 
However, the caps might got + corrupted due to the setresuid() so we need clean them up + later. This is done outside of this call. */ + + return 0; +} + +#if HAVE_PAM + +static int null_conv( + int num_msg, + const struct pam_message **msg, + struct pam_response **resp, + void *appdata_ptr) { + + /* We don't support conversations */ + + return PAM_CONV_ERR; +} + +#endif + +static int setup_pam( + const char *name, + const char *user, + uid_t uid, + gid_t gid, + const char *tty, + char ***env, + const int fds[], size_t n_fds) { /* Opens a PAM session for 'user' under the PAM service 'name' and forks off a child, "(sd-pam)", that tears the session down again once the parent is gone. On success *env is replaced by the environment list PAM hands back. Without HAVE_PAM this is a no-op returning 0. */ + +#if HAVE_PAM + + static const struct pam_conv conv = { + .conv = null_conv, + .appdata_ptr = NULL + }; + + _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; + pam_handle_t *handle = NULL; + sigset_t old_ss; + int pam_code = PAM_SUCCESS, r; + char **nv, **e = NULL; + bool close_session = false; + pid_t pam_pid = 0, parent_pid; + int flags = 0; + + assert(name); + assert(user); + assert(env); + + /* We set up PAM in the parent process, then fork. The child + * will then stay around until killed via PR_SET_PDEATHSIG or + * systemd via the cgroup logic. It will then remove the PAM + * session again. The parent process will exec() the actual + * daemon. We do things this way to ensure that the main PID + * of the daemon is the one we initially fork()ed. */ + + r = barrier_create(&barrier); + if (r < 0) + goto fail; + + if (log_get_max_level() < LOG_DEBUG) + flags |= PAM_SILENT; /* keep the PAM calls quiet unless we log at debug level */ + + pam_code = pam_start(name, user, &conv, &handle); + if (pam_code != PAM_SUCCESS) { + handle = NULL; /* make sure the fail path does not try to end a transaction that never started */ + goto fail; + } + + if (!tty) { + _cleanup_free_ char *q = NULL; + + /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure + * out if that's the case, and read the TTY off it. 
*/ + + if (getttyname_malloc(STDIN_FILENO, &q) >= 0) + tty = strjoina("/dev/", q); + } + + if (tty) { + pam_code = pam_set_item(handle, PAM_TTY, tty); + if (pam_code != PAM_SUCCESS) + goto fail; + } + + STRV_FOREACH(nv, *env) { + pam_code = pam_putenv(handle, *nv); + if (pam_code != PAM_SUCCESS) + goto fail; + } + + pam_code = pam_acct_mgmt(handle, flags); + if (pam_code != PAM_SUCCESS) + goto fail; + + pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags); + if (pam_code != PAM_SUCCESS) + log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code)); + + pam_code = pam_open_session(handle, flags); + if (pam_code != PAM_SUCCESS) + goto fail; + + close_session = true; + + e = pam_getenvlist(handle); + if (!e) { + pam_code = PAM_BUF_ERR; + goto fail; + } + + /* Block SIGTERM, so that we know that it won't get lost in + * the child */ + + assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0); + + parent_pid = getpid_cached(); + + r = safe_fork("(sd-pam)", 0, &pam_pid); + if (r < 0) + goto fail; + if (r == 0) { + int sig, ret = EXIT_PAM; + + /* The child's job is to reset the PAM session on + * termination */ + barrier_set_role(&barrier, BARRIER_CHILD); + + /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds + * are open here that have been opened by PAM. */ + (void) close_many(fds, n_fds); + + /* Drop privileges - we don't need any to pam_close_session + * and this will make PR_SET_PDEATHSIG work in most cases. 
+ * If this fails, ignore the error - but expect sd-pam threads + * to fail to exit normally */ + + r = maybe_setgroups(0, NULL); + if (r < 0) + log_warning_errno(r, "Failed to setgroups() in sd-pam: %m"); + if (setresgid(gid, gid, gid) < 0) + log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); + if (setresuid(uid, uid, uid) < 0) + log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m"); + + (void) ignore_signals(SIGPIPE, -1); + + /* Wait until our parent died. This will only work if + * the above setresuid() succeeds, otherwise the kernel + * will not allow unprivileged parents kill their privileged + * children this way. We rely on the control groups kill logic + * to do the rest for us. */ + if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0) + goto child_finish; + + /* Tell the parent that our setup is done. This is especially + * important regarding dropping privileges. Otherwise, unit + * setup might race against our setresuid(2) call. + * + * If the parent aborted, we'll detect this below, hence ignore + * return failure here. */ + (void) barrier_place(&barrier); + + /* Check if our parent process might already have died? */ + if (getppid() == parent_pid) { + sigset_t ss; + + assert_se(sigemptyset(&ss) >= 0); + assert_se(sigaddset(&ss, SIGTERM) >= 0); + + for (;;) { + if (sigwait(&ss, &sig) < 0) { + if (errno == EINTR) + continue; + + goto child_finish; + } + + assert(sig == SIGTERM); + break; + } + } + + pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags); + if (pam_code != PAM_SUCCESS) + goto child_finish; + + /* If our parent died we'll end the session */ + if (getppid() != parent_pid) { + pam_code = pam_close_session(handle, flags); + if (pam_code != PAM_SUCCESS) + goto child_finish; + } + + ret = 0; + + child_finish: + pam_end(handle, pam_code | flags); + _exit(ret); + } + + barrier_set_role(&barrier, BARRIER_PARENT); + + /* If the child was forked off successfully it will do all the + * cleanups, so forget about the handle here. 
*/ + handle = NULL; + + /* Unblock SIGTERM again in the parent */ + assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0); + + /* We close the log explicitly here, since the PAM modules + * might have opened it, but we don't want this fd around. */ + closelog(); + + /* Synchronously wait for the child to initialize. We don't care for + * errors as we cannot recover. However, warn loudly if it happens. */ + if (!barrier_place_and_sync(&barrier)) + log_error("PAM initialization failed"); + + return strv_free_and_replace(*env, e); + +fail: + if (pam_code != PAM_SUCCESS) { + log_error("PAM failed: %s", pam_strerror(handle, pam_code)); + r = -EPERM; /* PAM errors do not map to errno */ + } else + log_error_errno(r, "PAM failed: %m"); + + if (handle) { + if (close_session) + pam_code = pam_close_session(handle, flags); + + pam_end(handle, pam_code | flags); + } + + strv_free(e); + closelog(); + + return r; +#else + return 0; +#endif +} + +static void rename_process_from_path(const char *path) { + char process_name[11]; + const char *p; + size_t l; + + /* This resulting string must fit in 10 chars (i.e. 
the length + * of "/sbin/init") to look pretty in /bin/ps */ + + p = basename(path); + if (isempty(p)) { + rename_process("(...)"); + return; + } + + l = strlen(p); + if (l > 8) { + /* The end of the process name is usually more + * interesting, since the first bit might just be + * "systemd-" */ + p = p + l - 8; + l = 8; + } + + process_name[0] = '('; + memcpy(process_name+1, p, l); + process_name[1+l] = ')'; + process_name[1+l+1] = 0; + + rename_process(process_name); +} + +static bool context_has_address_families(const ExecContext *c) { + assert(c); + + return c->address_families_allow_list || + !set_isempty(c->address_families); +} + +static bool context_has_syscall_filters(const ExecContext *c) { + assert(c); + + return c->syscall_allow_list || + !hashmap_isempty(c->syscall_filter); +} + +static bool context_has_syscall_logs(const ExecContext *c) { + assert(c); + + return c->syscall_log_allow_list || + !hashmap_isempty(c->syscall_log); +} + +static bool context_has_no_new_privileges(const ExecContext *c) { + assert(c); + + if (c->no_new_privileges) + return true; + + if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ + return false; + + /* We need NNP if we have any form of seccomp and are unprivileged */ + return context_has_address_families(c) || + c->memory_deny_write_execute || + c->restrict_realtime || + c->restrict_suid_sgid || + exec_context_restrict_namespaces_set(c) || + c->protect_clock || + c->protect_kernel_tunables || + c->protect_kernel_modules || + c->protect_kernel_logs || + c->private_devices || + context_has_syscall_filters(c) || + context_has_syscall_logs(c) || + !set_isempty(c->syscall_archs) || + c->lock_personality || + c->protect_hostname; +} + +static bool exec_context_has_credentials(const ExecContext *context) { + + assert(context); + + return !hashmap_isempty(context->set_credentials) || + context->load_credentials; +} + +#if HAVE_SECCOMP + +static bool skip_seccomp_unavailable(const Unit* u, const 
char* msg) { + + if (is_seccomp_available()) + return false; + + log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); + return true; +} + +static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) { + uint32_t negative_action, default_action, action; + int r; + + assert(u); + assert(c); + + if (!context_has_syscall_filters(c)) + return 0; + + if (skip_seccomp_unavailable(u, "SystemCallFilter=")) + return 0; + + negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno); + + if (c->syscall_allow_list) { + default_action = negative_action; + action = SCMP_ACT_ALLOW; + } else { + default_action = SCMP_ACT_ALLOW; + action = negative_action; + } + + if (needs_ambient_hack) { + r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID); + if (r < 0) + return r; + } + + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false); +} + +static int apply_syscall_log(const Unit* u, const ExecContext *c) { +#ifdef SCMP_ACT_LOG + uint32_t default_action, action; +#endif + + assert(u); + assert(c); + + if (!context_has_syscall_logs(c)) + return 0; + +#ifdef SCMP_ACT_LOG + if (skip_seccomp_unavailable(u, "SystemCallLog=")) + return 0; + + if (c->syscall_log_allow_list) { + /* Log nothing but the ones listed */ + default_action = SCMP_ACT_ALLOW; + action = SCMP_ACT_LOG; + } else { + /* Log everything but the ones listed */ + default_action = SCMP_ACT_LOG; + action = SCMP_ACT_ALLOW; + } + + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false); +#else + /* old libseccomp */ + log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog="); + return 0; +#endif +} + +static int apply_syscall_archs(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + if (set_isempty(c->syscall_archs)) + 
return 0; + + if (skip_seccomp_unavailable(u, "SystemCallArchitectures=")) + return 0; + + return seccomp_restrict_archs(c->syscall_archs); +} + +static int apply_address_families(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!context_has_address_families(c)) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) + return 0; + + return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list); +} + +static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->memory_deny_write_execute) + return 0; + + if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) + return 0; + + return seccomp_memory_deny_write_execute(); +} + +static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->restrict_realtime) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictRealtime=")) + return 0; + + return seccomp_restrict_realtime(); +} + +static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->restrict_suid_sgid) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictSUIDSGID=")) + return 0; + + return seccomp_restrict_suid_sgid(); +} + +static int apply_protect_sysctl(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but + * let's protect even those systems where this is left on in the kernel. 
*/ + + if (!c->protect_kernel_tunables) + return 0; + + if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) + return 0; + + return seccomp_protect_sysctl(); +} + +static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + /* Turn off module syscalls on ProtectKernelModules=yes */ + + if (!c->protect_kernel_modules) + return 0; + + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->protect_kernel_logs) + return 0; + + if (skip_seccomp_unavailable(u, "ProtectKernelLogs=")) + return 0; + + return seccomp_protect_syslog(); +} + +static int apply_protect_clock(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + if (!c->protect_clock) + return 0; + + if (skip_seccomp_unavailable(u, "ProtectClock=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_private_devices(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. 
*/ + + if (!c->private_devices) + return 0; + + if (skip_seccomp_unavailable(u, "PrivateDevices=")) + return 0; + + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false); +} + +static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) { + assert(u); + assert(c); + + if (!exec_context_restrict_namespaces_set(c)) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictNamespaces=")) + return 0; + + return seccomp_restrict_namespaces(c->restrict_namespaces); +} + +static int apply_lock_personality(const Unit* u, const ExecContext *c) { + unsigned long personality; + int r; + + assert(u); + assert(c); + + if (!c->lock_personality) + return 0; + + if (skip_seccomp_unavailable(u, "LockPersonality=")) + return 0; + + personality = c->personality; + + /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */ + if (personality == PERSONALITY_INVALID) { + + r = opinionated_personality(&personality); + if (r < 0) + return r; + } + + return seccomp_lock_personality(personality); +} + +#endif + +static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) { + assert(u); + assert(c); + + if (!c->protect_hostname) + return 0; + + if (ns_type_supported(NAMESPACE_UTS)) { + if (unshare(CLONE_NEWUTS) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) { + *ret_exit_status = EXIT_NAMESPACE; + return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m"); + } + + log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup."); + } + } else + log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup."); + +#if HAVE_SECCOMP + int r; + + if (skip_seccomp_unavailable(u, "ProtectHostname=")) + return 0; + + r = 
seccomp_protect_hostname(); + if (r < 0) { + *ret_exit_status = EXIT_SECCOMP; + return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m"); + } +#endif + + return 0; +} + +static void do_idle_pipe_dance(int idle_pipe[static 4]) { + assert(idle_pipe); + + idle_pipe[1] = safe_close(idle_pipe[1]); + idle_pipe[2] = safe_close(idle_pipe[2]); + + if (idle_pipe[0] >= 0) { + int r; + + r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC); + + if (idle_pipe[3] >= 0 && r == 0 /* timeout */) { + ssize_t n; + + /* Signal systemd that we are bored and want to continue. */ + n = write(idle_pipe[3], "x", 1); + if (n > 0) + /* Wait for systemd to react to the signal above. */ + (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC); + } + + idle_pipe[0] = safe_close(idle_pipe[0]); + + } + + idle_pipe[3] = safe_close(idle_pipe[3]); +} + +static const char *exec_directory_env_name_to_string(ExecDirectoryType t); + +static int build_environment( + const Unit *u, + const ExecContext *c, + const ExecParameters *p, + size_t n_fds, + const char *home, + const char *username, + const char *shell, + dev_t journal_stream_dev, + ino_t journal_stream_ino, + char ***ret) { + + _cleanup_strv_free_ char **our_env = NULL; + size_t n_env = 0; + char *x; + + assert(u); + assert(c); + assert(p); + assert(ret); + +#define N_ENV_VARS 16 + our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); + if (!our_env) + return -ENOMEM; + + if (n_fds > 0) { + _cleanup_free_ char *joined = NULL; + + if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + joined = strv_join(p->fd_names, ":"); + if (!joined) + return -ENOMEM; + + x = strjoin("LISTEN_FDNAMES=", joined); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) { + if (asprintf(&x, 
"WATCHDOG_PID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + our_env[n_env++] = x; + + if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0) + return -ENOMEM; + our_env[n_env++] = x; + } + + /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic + * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but + * check the database directly. */ + if (p->flags & EXEC_NSS_BYPASS_BUS) { + x = strdup("SYSTEMD_NSS_BYPASS_BUS=1"); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (home) { + x = strjoin("HOME=", home); + if (!x) + return -ENOMEM; + + path_simplify(x + 5, true); + our_env[n_env++] = x; + } + + if (username) { + x = strjoin("LOGNAME=", username); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + + x = strjoin("USER=", username); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (shell) { + x = strjoin("SHELL=", shell); + if (!x) + return -ENOMEM; + + path_simplify(x + 6, true); + our_env[n_env++] = x; + } + + if (!sd_id128_is_null(u->invocation_id)) { + if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (exec_context_needs_term(c)) { + const char *tty_path, *term = NULL; + + tty_path = exec_context_tty_path(c); + + /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try + * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the + * container manager passes to PID 1 ends up all the way in the console login shown. 
*/ + + if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1) + term = getenv("TERM"); + + if (!term) + term = default_term_for_tty(tty_path); + + x = strjoin("TERM=", term); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + + if (journal_stream_dev != 0 && journal_stream_ino != 0) { + if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (c->log_namespace) { + x = strjoin("LOG_NAMESPACE=", c->log_namespace); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + _cleanup_free_ char *pre = NULL, *joined = NULL; + const char *n; + + if (!p->prefix[t]) + continue; + + if (strv_isempty(c->directories[t].paths)) + continue; + + n = exec_directory_env_name_to_string(t); + if (!n) + continue; + + pre = strjoin(p->prefix[t], "/"); + if (!pre) + return -ENOMEM; + + joined = strv_join_full(c->directories[t].paths, ":", pre, true); + if (!joined) + return -ENOMEM; + + x = strjoin(n, "=", joined); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) { + x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + + our_env[n_env++] = NULL; + assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); +#undef N_ENV_VARS + + *ret = TAKE_PTR(our_env); + + return 0; +} + +static int build_pass_environment(const ExecContext *c, char ***ret) { + _cleanup_strv_free_ char **pass_env = NULL; + size_t n_env = 0, n_bufsize = 0; + char **i; + + STRV_FOREACH(i, c->pass_environment) { + _cleanup_free_ char *x = NULL; + char *v; + + v = getenv(*i); + if (!v) + continue; + x = strjoin(*i, "=", v); + if (!x) + return -ENOMEM; + + if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2)) + return -ENOMEM; + + pass_env[n_env++] = 
TAKE_PTR(x); + pass_env[n_env] = NULL; + } + + *ret = TAKE_PTR(pass_env); + + return 0; +} + +static bool exec_needs_mount_namespace( + const ExecContext *context, + const ExecParameters *params, + const ExecRuntime *runtime) { + + assert(context); + assert(params); + + if (context->root_image) + return true; + + if (!strv_isempty(context->read_write_paths) || + !strv_isempty(context->read_only_paths) || + !strv_isempty(context->inaccessible_paths)) + return true; + + if (context->n_bind_mounts > 0) + return true; + + if (context->n_temporary_filesystems > 0) + return true; + + if (context->n_mount_images > 0) + return true; + + if (!IN_SET(context->mount_flags, 0, MS_SHARED)) + return true; + + if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir)) + return true; + + if (context->private_devices || + context->private_mounts || + context->protect_system != PROTECT_SYSTEM_NO || + context->protect_home != PROTECT_HOME_NO || + context->protect_kernel_tunables || + context->protect_kernel_modules || + context->protect_kernel_logs || + context->protect_control_groups || + context->protect_proc != PROTECT_PROC_DEFAULT || + context->proc_subset != PROC_SUBSET_ALL) + return true; + + if (context->root_directory) { + if (exec_context_get_effective_mount_apivfs(context)) + return true; + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + if (!strv_isempty(context->directories[t].paths)) + return true; + } + } + + if (context->dynamic_user && + (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))) + return true; + + if (context->log_namespace) + return true; + + return false; +} + +static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { + _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; + _cleanup_close_pair_ int 
errno_pipe[2] = { -1, -1 }; + _cleanup_close_ int unshare_ready_fd = -1; + _cleanup_(sigkill_waitp) pid_t pid = 0; + uint64_t c = 1; + ssize_t n; + int r; + + /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e. + * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to + * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which + * we however lack after opening the user namespace. To work around this we fork() a temporary child process, + * which waits for the parent to create the new user namespace while staying in the original namespace. The + * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and + * continues execution normally. + * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it + * does not need CAP_SETUID to write the single line mapping to itself. */ + + /* Can only set up multiple mappings with CAP_SETUID. */ + if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid)) + r = asprintf(&uid_map, + UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */ + UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ + ouid, ouid, uid, uid); + else + r = asprintf(&uid_map, + UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */ + ouid, ouid); + + if (r < 0) + return -ENOMEM; + + /* Can only set up multiple mappings with CAP_SETGID. 
*/ + if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid)) + r = asprintf(&gid_map, + GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */ + GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */ + ogid, ogid, gid, gid); + else + r = asprintf(&gid_map, + GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */ + ogid, ogid); + + if (r < 0) + return -ENOMEM; + + /* Create a communication channel so that the parent can tell the child when it finished creating the user + * namespace. */ + unshare_ready_fd = eventfd(0, EFD_CLOEXEC); + if (unshare_ready_fd < 0) + return -errno; + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return -errno; + + r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid); + if (r < 0) + return r; + if (r == 0) { + _cleanup_close_ int fd = -1; + const char *a; + pid_t ppid; + + /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from + * here, after the parent opened its own user namespace. */ + + ppid = getppid(); + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Wait until the parent unshared the user namespace */ + if (read(unshare_ready_fd, &c, sizeof(c)) < 0) { + r = -errno; + goto child_fail; + } + + /* Disable the setgroups() system call in the child user namespace, for good. */ + a = procfs_file_alloca(ppid, "setgroups"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) { + r = -errno; + goto child_fail; + } + + /* If the file is missing the kernel is too old, let's continue anyway. 
*/ + } else { + if (write(fd, "deny\n", 5) < 0) { + r = -errno; + goto child_fail; + } + + fd = safe_close(fd); + } + + /* First write the GID map */ + a = procfs_file_alloca(ppid, "gid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, gid_map, strlen(gid_map)) < 0) { + r = -errno; + goto child_fail; + } + fd = safe_close(fd); + + /* The write the UID map */ + a = procfs_file_alloca(ppid, "uid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, uid_map, strlen(uid_map)) < 0) { + r = -errno; + goto child_fail; + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + if (unshare(CLONE_NEWUSER) < 0) + return -errno; + + /* Let the child know that the namespace is ready now */ + if (write(unshare_ready_fd, &c, sizeof(c)) < 0) + return -errno; + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return -errno; + if (n == sizeof(r)) { /* an error code was sent to us */ + if (r < 0) + return r; + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate_and_check("(sd-userns)", pid, 0); + pid = 0; + if (r < 0) + return r; + if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */ + return -EIO; + + return 0; +} + +static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) { + if (!context->dynamic_user) + return false; + + if (type == EXEC_DIRECTORY_CONFIGURATION) + return false; + + if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO) + return false; + + return true; +} + +static int setup_exec_directory( + const ExecContext *context, + const ExecParameters *params, + uid_t uid, + gid_t gid, + 
ExecDirectoryType type, + int *exit_status) { + + static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY, + [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY, + [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY, + [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY, + [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY, + }; + char **rt; + int r; + + assert(context); + assert(params); + assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX); + assert(exit_status); + + if (!params->prefix[type]) + return 0; + + if (params->flags & EXEC_CHOWN_DIRECTORIES) { + if (!uid_is_valid(uid)) + uid = 0; + if (!gid_is_valid(gid)) + gid = 0; + } + + STRV_FOREACH(rt, context->directories[type].paths) { + _cleanup_free_ char *p = NULL, *pp = NULL; + + p = path_join(params->prefix[type], *rt); + if (!p) { + r = -ENOMEM; + goto fail; + } + + r = mkdir_parents_label(p, 0755); + if (r < 0) + goto fail; + + if (exec_directory_is_private(context, type)) { + _cleanup_free_ char *private_root = NULL; + + /* So, here's one extra complication when dealing with DynamicUser=1 units. In that + * case we want to avoid leaving a directory around fully accessible that is owned by + * a dynamic user whose UID is later on reused. To lock this down we use the same + * trick used by container managers to prohibit host users to get access to files of + * the same UID in containers: we place everything inside a directory that has an + * access mode of 0700 and is owned root:root, so that it acts as security boundary + * for unprivileged host code. We then use fs namespacing to make this directory + * permeable for the service itself. + * + * Specifically: for a service which wants a special directory "foo/" we first create + * a directory "private/" with access mode 0700 owned by root:root. Then we place + * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to + * "private/foo". 
This way, privileged host users can access "foo/" as usual, but + * unprivileged host users can't look into it. Inside of the namespace of the unit + * "private/" is replaced by a more liberally accessible tmpfs, into which the host's + * "private/foo/" is mounted under the same name, thus disabling the access boundary + * for the service and making sure it only gets access to the dirs it needs but no + * others. Tricky? Yes, absolutely, but it works! + * + * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not + * to be owned by the service itself. + * + * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used + * for sharing files or sockets with other services. */ + + private_root = path_join(params->prefix[type], "private"); + if (!private_root) { + r = -ENOMEM; + goto fail; + } + + /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */ + r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + goto fail; + + pp = path_join(private_root, *rt); + if (!pp) { + r = -ENOMEM; + goto fail; + } + + /* Create all directories between the configured directory and this private root, and mark them 0755 */ + r = mkdir_parents_label(pp, 0755); + if (r < 0) + goto fail; + + if (is_dir(p, false) > 0 && + (laccess(pp, F_OK) < 0 && errno == ENOENT)) { + + /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move + * it over. Most likely the service has been upgraded from one that didn't use + * DynamicUser=1, to one that does. 
*/ + + log_info("Found pre-existing public %s= directory %s, migrating to %s.\n" + "Apparently, service previously had DynamicUser= turned off, and has now turned it on.", + exec_directory_type_to_string(type), p, pp); + + if (rename(p, pp) < 0) { + r = -errno; + goto fail; + } + } else { + /* Otherwise, create the actual directory for the service */ + + r = mkdir_label(pp, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + } + + /* And link it up from the original place */ + r = symlink_idempotent(pp, p, true); + if (r < 0) + goto fail; + + } else { + _cleanup_free_ char *target = NULL; + + if (type != EXEC_DIRECTORY_CONFIGURATION && + readlink_and_make_absolute(p, &target) >= 0) { + _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL; + + /* This already exists and is a symlink? Interesting. Maybe it's one created + * by DynamicUser=1 (see above)? + * + * We do this for all directory types except for ConfigurationDirectory=, + * since they all support the private/ symlink logic at least in some + * configurations, see above. */ + + r = chase_symlinks(target, NULL, 0, &target_resolved, NULL); + if (r < 0) + goto fail; + + q = path_join(params->prefix[type], "private", *rt); + if (!q) { + r = -ENOMEM; + goto fail; + } + + /* /var/lib or friends may be symlinks. So, let's chase them also. */ + r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL); + if (r < 0) + goto fail; + + if (path_equal(q_resolved, target_resolved)) { + + /* Hmm, apparently DynamicUser= was once turned on for this service, + * but is no longer. Let's move the directory back up. 
*/ + + log_info("Found pre-existing private %s= directory %s, migrating to %s.\n" + "Apparently, service previously had DynamicUser= turned on, and has now turned it off.", + exec_directory_type_to_string(type), q, p); + + if (unlink(p) < 0) { + r = -errno; + goto fail; + } + + if (rename(q, p) < 0) { + r = -errno; + goto fail; + } + } + } + + r = mkdir_label(p, context->directories[type].mode); + if (r < 0) { + if (r != -EEXIST) + goto fail; + + if (type == EXEC_DIRECTORY_CONFIGURATION) { + struct stat st; + + /* Don't change the owner/access mode of the configuration directory, + * as in the common case it is not written to by a service, and shall + * not be writable. */ + + if (stat(p, &st) < 0) { + r = -errno; + goto fail; + } + + /* Still complain if the access mode doesn't match */ + if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0) + log_warning("%s \'%s\' already exists but the mode is different. " + "(File system: %o %sMode: %o)", + exec_directory_type_to_string(type), *rt, + st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777); + + continue; + } + } + } + + /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't + * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the + * current UID/GID ownership.) */ + r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID); + if (r < 0) + goto fail; + + /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we + * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID + * assignments to exist.*/ + r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 
01777 : 07777); + if (r < 0) + goto fail; + } + + return 0; + +fail: + *exit_status = exit_status_table[type]; + return r; +} + +static int write_credential( + int dfd, + const char *id, + const void *data, + size_t size, + uid_t uid, + bool ownership_ok) { + + _cleanup_(unlink_and_freep) char *tmp = NULL; + _cleanup_close_ int fd = -1; + int r; + + r = tempfn_random_child("", "cred", &tmp); + if (r < 0) + return r; + + fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600); + if (fd < 0) { + tmp = mfree(tmp); + return -errno; + } + + r = loop_write(fd, data, size, /* do_pool = */ false); + if (r < 0) + return r; + + if (fchmod(fd, 0400) < 0) /* Take away "w" bit */ + return -errno; + + if (uid_is_valid(uid) && uid != getuid()) { + r = fd_add_uid_acl_permission(fd, uid, ACL_READ); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + + if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want + * to express: that the user gets read access and nothing + * else. But if the backing fs can't support that (e.g. ramfs) + * then we can use file ownership instead. But that's only safe if + * we can then re-mount the whole thing read-only, so that the + * user can no longer chmod() the file to gain write access. 
*/ + return r; + + if (fchown(fd, uid, (gid_t) -1) < 0) + return -errno; + } + } + + if (renameat(dfd, tmp, dfd, id) < 0) + return -errno; + + tmp = mfree(tmp); + return 0; +} + +#define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */ + +static int acquire_credentials( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + const char *p, + uid_t uid, + bool ownership_ok) { + + uint64_t left = CREDENTIALS_BYTES_MAX; + _cleanup_close_ int dfd = -1; + ExecSetCredential *sc; + char **id, **fn; + int r; + + assert(context); + assert(p); + + dfd = open(p, O_DIRECTORY|O_CLOEXEC); + if (dfd < 0) + return -errno; + + /* First we use the literally specified credentials. Note that they might be overridden again below, + * and thus act as a "default" if the same credential is specified multiple times */ + HASHMAP_FOREACH(sc, context->set_credentials) { + size_t add; + + add = strlen(sc->id) + sc->size; + if (add > left) + return -E2BIG; + + r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok); + if (r < 0) + return r; + + left -= add; + } + + /* Then, load credential off disk (or acquire via AF_UNIX socket) */ + STRV_FOREACH_PAIR(id, fn, context->load_credentials) { + ReadFullFileFlags flags = READ_FULL_FILE_SECURE; + _cleanup_(erase_and_freep) char *data = NULL; + _cleanup_free_ char *j = NULL, *bindname = NULL; + const char *source; + size_t size, add; + + if (path_is_absolute(*fn)) { + /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */ + source = *fn; + flags |= READ_FULL_FILE_CONNECT_SOCKET; + + /* Pass some minimal info about the unit and the credential name we are looking to acquire + * via the source socket address in case we read off an AF_UNIX socket. 
*/ + if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0) + return -ENOMEM; + + } else if (params->received_credentials) { + /* If this is a relative path, take it relative to the credentials we received + * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating + * on a credential store, i.e. this is guaranteed to be regular files. */ + j = path_join(params->received_credentials, *fn); + if (!j) + return -ENOMEM; + + source = j; + } else + source = NULL; + + + if (source) + r = read_full_file_full(AT_FDCWD, source, flags, bindname, &data, &size); + else + r = -ENOENT; + if (r == -ENOENT && + faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) /* If the source file doesn't exist, but we already acquired the key otherwise, then don't fail */ + continue; + if (r < 0) + return r; + + add = strlen(*id) + size; + if (add > left) + return -E2BIG; + + r = write_credential(dfd, *id, data, size, uid, ownership_ok); + if (r < 0) + return r; + + left -= add; + } + + if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */ + return -errno; + + /* After we created all keys with the right perms, also make sure the credential store as a whole is + * accessible */ + + if (uid_is_valid(uid) && uid != getuid()) { + r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + + if (!ownership_ok) + return r; + + if (fchown(dfd, uid, (gid_t) -1) < 0) + return -errno; + } + } + + return 0; +} + +static int setup_credentials_internal( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + const char *final, /* This is where the credential store shall eventually end up at */ + const char *workspace, /* This is where we can prepare it before moving it to the final place */ + bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */ + bool must_mount, /* Whether to require 
that we mount something, it's not OK to use the plain directory fall back */ + uid_t uid) { + + int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true + * if we mounted something; false if we definitely can't mount anything */ + bool final_mounted; + const char *where; + + assert(context); + assert(final); + assert(workspace); + + if (reuse_workspace) { + r = path_is_mount_point(workspace, NULL, 0); + if (r < 0) + return r; + if (r > 0) + workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */ + else + workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */ + } else + workspace_mounted = -1; /* ditto */ + + r = path_is_mount_point(final, NULL, 0); + if (r < 0) + return r; + if (r > 0) { + /* If the final place already has something mounted, we use that. If the workspace also has + * something mounted we assume it's actually the same mount (but with MS_RDONLY + * different). 
*/ + final_mounted = true; + + if (workspace_mounted < 0) { + /* If the final place is mounted, but the workspace we isn't, then let's bind mount + * the final version to the workspace, and make it writable, so that we can make + * changes */ + + r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL); + if (r < 0) + return r; + + workspace_mounted = true; + } + } else + final_mounted = false; + + if (workspace_mounted < 0) { + /* Nothing is mounted on the workspace yet, let's try to mount something now */ + for (int try = 0;; try++) { + + if (try == 0) { + /* Try "ramfs" first, since it's not swap backed */ + r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700"); + if (r >= 0) { + workspace_mounted = true; + break; + } + + } else if (try == 1) { + _cleanup_free_ char *opts = NULL; + + if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0) + return -ENOMEM; + + /* Fall back to "tmpfs" otherwise */ + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts); + if (r >= 0) { + workspace_mounted = true; + break; + } + + } else { + /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */ + r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */ + return r; + + if (must_mount) /* If we it's not OK to use the plain directory + * fallback, propagate all errors too */ + return r; + + /* If we lack privileges to bind mount stuff, then let's gracefully + * proceed for compat with container envs, and just use the final dir + * as is. 
*/ + + workspace_mounted = false; + break; + } + + /* Make the new bind mount writable (i.e. drop MS_RDONLY) */ + r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL); + if (r < 0) + return r; + + workspace_mounted = true; + break; + } + } + } + + assert(!must_mount || workspace_mounted > 0); + where = workspace_mounted ? workspace : final; + + r = acquire_credentials(context, params, unit, where, uid, workspace_mounted); + if (r < 0) + return r; + + if (workspace_mounted) { + /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */ + r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL); + if (r < 0) + return r; + + /* And mount it to the final place, read-only */ + if (final_mounted) + r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW); + else + r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL); + if (r < 0) + return r; + } else { + _cleanup_free_ char *parent = NULL; + + /* If we do not have our own mount put used the plain directory fallback, then we need to + * open access to the top-level credential directory and the per-service directory now */ + + parent = dirname_malloc(final); + if (!parent) + return -ENOMEM; + if (chmod(parent, 0755) < 0) + return -errno; + } + + return 0; +} + +static int setup_credentials( + const ExecContext *context, + const ExecParameters *params, + const char *unit, + uid_t uid) { + + _cleanup_free_ char *p = NULL, *q = NULL; + const char *i; + int r; + + assert(context); + assert(params); + + if (!exec_context_has_credentials(context)) + return 0; + + if (!params->prefix[EXEC_DIRECTORY_RUNTIME]) + return -EINVAL; + + /* This where we'll place stuff when we are done; this main credentials directory is world-readable, + * and the subdir we mount over with a read-only file system readable by the service's user */ 
+ q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials"); + if (!q) + return -ENOMEM; + + r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */ + if (r < 0 && r != -EEXIST) + return r; + + p = path_join(q, unit); + if (!p) + return -ENOMEM; + + r = mkdir_label(p, 0700); /* per-unit dir: private to user */ + if (r < 0 && r != -EEXIST) + return r; + + r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL); + if (r < 0) { + _cleanup_free_ char *t = NULL, *u = NULL; + + /* If this is not a privilege or support issue then propagate the error */ + if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) + return r; + + /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving + * it into place, so that users can't access half-initialized credential stores. */ + t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials"); + if (!t) + return -ENOMEM; + + /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit + * directory outside of /run/credentials/ first, and then move it over to /run/credentials/ + * after it is fully set up */ + u = path_join(t, unit); + if (!u) + return -ENOMEM; + + FOREACH_STRING(i, t, u) { + r = mkdir_label(i, 0700); + if (r < 0 && r != -EEXIST) + return r; + } + + r = setup_credentials_internal( + context, + params, + unit, + p, /* final mount point */ + u, /* temporary workspace to overmount */ + true, /* reuse the workspace if it is already a mount */ + false, /* it's OK to fall back to a plain directory if we can't mount anything */ + uid); + + (void) rmdir(u); /* remove the workspace again if we can. */ + + if (r < 0) + return r; + + } else if (r == 0) { + + /* We managed to set up a mount namespace, and are now in a child. That's great. In this case + * we can use the same directory for all cases, after turning off propagation. 
Question + * though is: where do we turn off propagation exactly, and where do we place the workspace + * directory? We need some place that is guaranteed to be a mount point in the host, and + * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this, + * since we ultimately want to move the resulting file system there, i.e. we need propagation + * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that + * would be visible in the host mount table all the time, which we want to avoid. Hence, what + * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that + * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off + * propagation on the former, and then overmount the latter. + * + * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist + * for this purpose, but there are few other candidates that work equally well for us, and + * given that the we do this in a privately namespaced short-lived single-threaded process + * that no one else sees this should be OK to do.*/ + + r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */ + if (r < 0) + goto child_fail; + + r = setup_credentials_internal( + context, + params, + unit, + p, /* final mount point */ + "/dev/shm", /* temporary workspace to overmount */ + false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */ + true, /* insist that something is mounted, do not allow fallback to plain directory */ + uid); + if (r < 0) + goto child_fail; + + _exit(EXIT_SUCCESS); + + child_fail: + _exit(EXIT_FAILURE); + } + + return 0; +} + +#if ENABLE_SMACK +static int setup_smack( + const ExecContext *context, + const char *executable) { + int r; + + assert(context); + assert(executable); + + if (context->smack_process_label) { + r = mac_smack_apply_pid(0, 
context->smack_process_label); + if (r < 0) + return r; + } +#ifdef SMACK_DEFAULT_PROCESS_LABEL + else { + _cleanup_free_ char *exec_label = NULL; + + r = mac_smack_read(executable, SMACK_ATTR_EXEC, &exec_label); + if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP)) + return r; + + r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL); + if (r < 0) + return r; + } +#endif + + return 0; +} +#endif + +static int compile_bind_mounts( + const ExecContext *context, + const ExecParameters *params, + BindMount **ret_bind_mounts, + size_t *ret_n_bind_mounts, + char ***ret_empty_directories) { + + _cleanup_strv_free_ char **empty_directories = NULL; + BindMount *bind_mounts; + size_t n, h = 0; + int r; + + assert(context); + assert(params); + assert(ret_bind_mounts); + assert(ret_n_bind_mounts); + assert(ret_empty_directories); + + n = context->n_bind_mounts; + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + n += strv_length(context->directories[t].paths); + } + + if (n <= 0) { + *ret_bind_mounts = NULL; + *ret_n_bind_mounts = 0; + *ret_empty_directories = NULL; + return 0; + } + + bind_mounts = new(BindMount, n); + if (!bind_mounts) + return -ENOMEM; + + for (size_t i = 0; i < context->n_bind_mounts; i++) { + BindMount *item = context->bind_mounts + i; + char *s, *d; + + s = strdup(item->source); + if (!s) { + r = -ENOMEM; + goto finish; + } + + d = strdup(item->destination); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = item->read_only, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + } + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **suffix; + + if (!params->prefix[t]) + continue; + + if (strv_isempty(context->directories[t].paths)) + continue; + + if (exec_directory_is_private(context, t) && + !exec_context_with_rootfs(context)) { + char 
*private_root; + + /* So this is for a dynamic user, and we need to make sure the process can access its own + * directory. For that we overmount the usually inaccessible "private" subdirectory with a + * tmpfs that makes it accessible and is empty except for the submounts we do this for. */ + + private_root = path_join(params->prefix[t], "private"); + if (!private_root) { + r = -ENOMEM; + goto finish; + } + + r = strv_consume(&empty_directories, private_root); + if (r < 0) + goto finish; + } + + STRV_FOREACH(suffix, context->directories[t].paths) { + char *s, *d; + + if (exec_directory_is_private(context, t)) + s = path_join(params->prefix[t], "private", *suffix); + else + s = path_join(params->prefix[t], *suffix); + if (!s) { + r = -ENOMEM; + goto finish; + } + + if (exec_directory_is_private(context, t) && + exec_context_with_rootfs(context)) + /* When RootDirectory= or RootImage= are set, then the symbolic link to the private + * directory is not created on the root directory. So, let's bind-mount the directory + * on the 'non-private' place. */ + d = path_join(params->prefix[t], *suffix); + else + d = strdup(s); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = false, + .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */ + .recursive = true, + .ignore_enoent = false, + }; + } + } + + assert(h == n); + + *ret_bind_mounts = bind_mounts; + *ret_n_bind_mounts = n; + *ret_empty_directories = TAKE_PTR(empty_directories); + + return (int) n; + +finish: + bind_mount_free_many(bind_mounts, h); + return r; +} + +static bool insist_on_sandboxing( + const ExecContext *context, + const char *root_dir, + const char *root_image, + const BindMount *bind_mounts, + size_t n_bind_mounts) { + + assert(context); + assert(n_bind_mounts == 0 || bind_mounts); + + /* Checks whether we need to insist on fs namespacing. i.e. 
whether we have settings configured that + * would alter the view on the file system beyond making things read-only or invisible, i.e. would + * rearrange stuff in a way we cannot ignore gracefully. */ + + if (context->n_temporary_filesystems > 0) + return true; + + if (root_dir || root_image) + return true; + + if (context->n_mount_images > 0) + return true; + + if (context->dynamic_user) + return true; + + /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes + * essential. */ + for (size_t i = 0; i < n_bind_mounts; i++) + if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination)) + return true; + + if (context->log_namespace) + return true; + + return false; +} + +static int apply_mount_namespace( + const Unit *u, + ExecCommandFlags command_flags, + const ExecContext *context, + const ExecParameters *params, + const ExecRuntime *runtime, + char **error_path) { + + _cleanup_strv_free_ char **empty_directories = NULL; + const char *tmp_dir = NULL, *var_tmp_dir = NULL; + const char *root_dir = NULL, *root_image = NULL; + _cleanup_free_ char *creds_path = NULL; + NamespaceInfo ns_info; + bool needs_sandboxing; + BindMount *bind_mounts = NULL; + size_t n_bind_mounts = 0; + int r; + + assert(context); + + if (params->flags & EXEC_APPLY_CHROOT) { + root_image = context->root_image; + + if (!root_image) + root_dir = context->root_directory; + } + + r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); + if (r < 0) + return r; + + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED); + if (needs_sandboxing) { + /* The runtime struct only contains the parent of the private /tmp, + * which is non-accessible to world users. Inside of it there's a /tmp + * that is sticky, and that's the one we want to use here. + * This does not apply when we are using /run/systemd/empty as fallback. 
*/ + + if (context->private_tmp && runtime) { + if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY)) + tmp_dir = runtime->tmp_dir; + else if (runtime->tmp_dir) + tmp_dir = strjoina(runtime->tmp_dir, "/tmp"); + + if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY)) + var_tmp_dir = runtime->var_tmp_dir; + else if (runtime->var_tmp_dir) + var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp"); + } + + ns_info = (NamespaceInfo) { + .ignore_protect_paths = false, + .private_dev = context->private_devices, + .protect_control_groups = context->protect_control_groups, + .protect_kernel_tunables = context->protect_kernel_tunables, + .protect_kernel_modules = context->protect_kernel_modules, + .protect_kernel_logs = context->protect_kernel_logs, + .protect_hostname = context->protect_hostname, + .mount_apivfs = exec_context_get_effective_mount_apivfs(context), + .private_mounts = context->private_mounts, + .protect_home = context->protect_home, + .protect_system = context->protect_system, + .protect_proc = context->protect_proc, + .proc_subset = context->proc_subset, + }; + } else if (!context->dynamic_user && root_dir) + /* + * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed + * sandbox info, otherwise enforce it, don't ignore protected paths and + * fail if we are unable to apply the sandbox inside the mount namespace. + */ + ns_info = (NamespaceInfo) { + .ignore_protect_paths = true, + }; + else + ns_info = (NamespaceInfo) {}; + + if (context->mount_flags == MS_SHARED) + log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring"); + + if (exec_context_has_credentials(context) && params->prefix[EXEC_DIRECTORY_RUNTIME]) { + creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id); + if (!creds_path) { + r = -ENOMEM; + goto finalize; + } + } + + r = setup_namespace(root_dir, root_image, context->root_image_options, + &ns_info, context->read_write_paths, + needs_sandboxing ? 
context->read_only_paths : NULL, + needs_sandboxing ? context->inaccessible_paths : NULL, + empty_directories, + bind_mounts, + n_bind_mounts, + context->temporary_filesystems, + context->n_temporary_filesystems, + context->mount_images, + context->n_mount_images, + tmp_dir, + var_tmp_dir, + creds_path, + context->log_namespace, + context->mount_flags, + context->root_hash, context->root_hash_size, context->root_hash_path, + context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path, + context->root_verity, + DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK, + error_path); + + /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports + * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively + * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a + * completely different execution environment. 
*/ + if (r == -ENOANO) { + if (insist_on_sandboxing( + context, + root_dir, root_image, + bind_mounts, + n_bind_mounts)) { + log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n" + "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s", + n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user)); + + r = -EOPNOTSUPP; + } else { + log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring."); + r = 0; + } + } + +finalize: + bind_mount_free_many(bind_mounts, n_bind_mounts); + return r; +} + +static int apply_working_directory( + const ExecContext *context, + const ExecParameters *params, + const char *home, + int *exit_status) { + + const char *d, *wd; + + assert(context); + assert(exit_status); + + if (context->working_directory_home) { + + if (!home) { + *exit_status = EXIT_CHDIR; + return -ENXIO; + } + + wd = home; + + } else + wd = empty_to_root(context->working_directory); + + if (params->flags & EXEC_APPLY_CHROOT) + d = wd; + else + d = prefix_roota(context->root_directory, wd); + + if (chdir(d) < 0 && !context->working_directory_missing_ok) { + *exit_status = EXIT_CHDIR; + return -errno; + } + + return 0; +} + +static int apply_root_directory( + const ExecContext *context, + const ExecParameters *params, + const bool needs_mount_ns, + int *exit_status) { + + assert(context); + assert(exit_status); + + if (params->flags & EXEC_APPLY_CHROOT) + if (!needs_mount_ns && context->root_directory) + if (chroot(context->root_directory) < 0) { + *exit_status = EXIT_CHROOT; + return -errno; + } + + return 0; +} + +static int setup_keyring( + const Unit *u, + const ExecContext *context, + const ExecParameters *p, + uid_t uid, gid_t gid) { + + key_serial_t keyring; + int r = 0; + uid_t saved_uid; + gid_t saved_gid; + + assert(u); + 
assert(context); + assert(p); + + /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that + * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond + * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be + * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in + * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where + * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */ + + if (context->keyring_mode == EXEC_KEYRING_INHERIT) + return 0; + + /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up + * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel + * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user + * & group is just as nasty as acquiring a reference to the user keyring. 
*/ + + saved_uid = getuid(); + saved_gid = getgid(); + + if (gid_is_valid(gid) && gid != saved_gid) { + if (setregid(gid, -1) < 0) + return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m"); + } + + if (uid_is_valid(uid) && uid != saved_uid) { + if (setreuid(uid, -1) < 0) { + r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m"); + goto out; + } + } + + keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0); + if (keyring == -1) { + if (errno == ENOSYS) + log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring."); + else if (ERRNO_IS_PRIVILEGE(errno)) + log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring."); + else if (errno == EDQUOT) + log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring."); + else + r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m"); + + goto out; + } + + /* When requested link the user keyring into the session keyring. */ + if (context->keyring_mode == EXEC_KEYRING_SHARED) { + + if (keyctl(KEYCTL_LINK, + KEY_SPEC_USER_KEYRING, + KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) { + r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m"); + goto out; + } + } + + /* Restore uid/gid back */ + if (uid_is_valid(uid) && uid != saved_uid) { + if (setreuid(saved_uid, -1) < 0) { + r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m"); + goto out; + } + } + + if (gid_is_valid(gid) && gid != saved_gid) { + if (setregid(saved_gid, -1) < 0) + return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m"); + } + + /* Populate the keyring with the invocation ID by default, as original saved_uid. 
*/ + if (!sd_id128_is_null(u->invocation_id)) { + key_serial_t key; + + key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING); + if (key == -1) + log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m"); + else { + if (keyctl(KEYCTL_SETPERM, key, + KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH| + KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0) + r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m"); + } + } + +out: + /* Revert back uid & gid for the last time, and exit */ + /* no extra logging, as only the first already reported error matters */ + if (getuid() != saved_uid) + (void) setreuid(saved_uid, -1); + + if (getgid() != saved_gid) + (void) setregid(saved_gid, -1); + + return r; +} + +static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) { + assert(array); + assert(n); + assert(pair); + + if (pair[0] >= 0) + array[(*n)++] = pair[0]; + if (pair[1] >= 0) + array[(*n)++] = pair[1]; +} + +static int close_remaining_fds( + const ExecParameters *params, + const ExecRuntime *runtime, + const DynamicCreds *dcreds, + int user_lookup_fd, + int socket_fd, + int exec_fd, + const int *fds, size_t n_fds) { + + size_t n_dont_close = 0; + int dont_close[n_fds + 12]; + + assert(params); + + if (params->stdin_fd >= 0) + dont_close[n_dont_close++] = params->stdin_fd; + if (params->stdout_fd >= 0) + dont_close[n_dont_close++] = params->stdout_fd; + if (params->stderr_fd >= 0) + dont_close[n_dont_close++] = params->stderr_fd; + + if (socket_fd >= 0) + dont_close[n_dont_close++] = socket_fd; + if (exec_fd >= 0) + dont_close[n_dont_close++] = exec_fd; + if (n_fds > 0) { + memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); + n_dont_close += n_fds; + } + + if (runtime) + append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket); + + if (dcreds) { + if (dcreds->user) + append_socket_pair(dont_close, &n_dont_close, 
dcreds->user->storage_socket); + if (dcreds->group) + append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket); + } + + if (user_lookup_fd >= 0) + dont_close[n_dont_close++] = user_lookup_fd; + + return close_all_fds(dont_close, n_dont_close); +} + +static int send_user_lookup( + Unit *unit, + int user_lookup_fd, + uid_t uid, + gid_t gid) { + + assert(unit); + + /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID + * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was + * specified. */ + + if (user_lookup_fd < 0) + return 0; + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + if (writev(user_lookup_fd, + (struct iovec[]) { + IOVEC_INIT(&uid, sizeof(uid)), + IOVEC_INIT(&gid, sizeof(gid)), + IOVEC_INIT_STRING(unit->id) }, 3) < 0) + return -errno; + + return 0; +} + +static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) { + int r; + + assert(c); + assert(home); + assert(buf); + + /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */ + + if (*home) + return 0; + + if (!c->working_directory_home) + return 0; + + r = get_home_dir(buf); + if (r < 0) + return r; + + *home = *buf; + return 1; +} + +static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) { + _cleanup_strv_free_ char ** list = NULL; + int r; + + assert(c); + assert(p); + assert(ret); + + assert(c->dynamic_user); + + /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for + * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special + * directories. 
*/ + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **i; + + if (t == EXEC_DIRECTORY_CONFIGURATION) + continue; + + if (!p->prefix[t]) + continue; + + STRV_FOREACH(i, c->directories[t].paths) { + char *e; + + if (exec_directory_is_private(c, t)) + e = path_join(p->prefix[t], "private", *i); + else + e = path_join(p->prefix[t], *i); + if (!e) + return -ENOMEM; + + r = strv_consume(&list, e); + if (r < 0) + return r; + } + } + + *ret = TAKE_PTR(list); + + return 0; +} + +static char *exec_command_line(char **argv); + +static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) { + bool using_subcgroup; + char *p; + + assert(params); + assert(ret); + + if (!params->cgroup_path) + return -EINVAL; + + /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated + * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control + * processes started after the main unit's process in the unit's main cgroup because it is now an inner one, + * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process, + * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=, + * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre= + * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP + * flag, which is only passed for the former statements, not for the latter. 
*/ + + using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL); + if (using_subcgroup) + p = path_join(params->cgroup_path, ".control"); + else + p = strdup(params->cgroup_path); + if (!p) + return -ENOMEM; + + *ret = p; + return using_subcgroup; +} + +static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) { + _cleanup_(cpu_set_reset) CPUSet s = {}; + int r; + + assert(c); + assert(ret); + + if (!c->numa_policy.nodes.set) { + log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring"); + return 0; + } + + r = numa_to_cpu_set(&c->numa_policy, &s); + if (r < 0) + return r; + + cpu_set_reset(ret); + + return cpu_set_add_all(ret, &s); +} + +bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) { + assert(c); + + return c->cpu_affinity_from_numa; +} + +static int exec_child( + Unit *unit, + const ExecCommand *command, + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + DynamicCreds *dcreds, + int socket_fd, + const int named_iofds[static 3], + int *fds, + size_t n_socket_fds, + size_t n_storage_fds, + char **files_env, + int user_lookup_fd, + int *exit_status) { + + _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL; + int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; + _cleanup_free_ gid_t *supplementary_gids = NULL; + const char *username = NULL, *groupname = NULL; + _cleanup_free_ char *home_buffer = NULL; + const char *home = NULL, *shell = NULL; + char **final_argv = NULL; + dev_t journal_stream_dev = 0; + ino_t journal_stream_ino = 0; + bool userns_set_up = false; + bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */ + needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? 
*/ + needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ + needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */ +#if HAVE_SELINUX + _cleanup_free_ char *mac_selinux_context_net = NULL; + bool use_selinux = false; +#endif +#if ENABLE_SMACK + bool use_smack = false; +#endif +#if HAVE_APPARMOR + bool use_apparmor = false; +#endif + uid_t saved_uid = getuid(); + gid_t saved_gid = getgid(); + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + size_t n_fds; + int secure_bits; + _cleanup_free_ gid_t *gids_after_pam = NULL; + int ngids_after_pam = 0; + + assert(unit); + assert(command); + assert(context); + assert(params); + assert(exit_status); + + rename_process_from_path(command->path); + + /* We reset exactly these signals, since they are the + * only ones we set to SIG_IGN in the main daemon. All + * others we leave untouched because we set them to + * SIG_DFL or a valid handler initially, both of which + * will be demoted to SIG_DFL. */ + (void) default_signals(SIGNALS_CRASH_HANDLER, + SIGNALS_IGNORE, -1); + + if (context->ignore_sigpipe) + (void) ignore_signals(SIGPIPE, -1); + + r = reset_signal_mask(); + if (r < 0) { + *exit_status = EXIT_SIGNAL_MASK; + return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m"); + } + + if (params->idle_pipe) + do_idle_pipe_dance(params->idle_pipe); + + /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its + * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have + * any fds open we don't really want open during the transition. In order to make logging work, we switch the + * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. 
*/ + + log_forget_fds(); + log_set_open_when_needed(true); + + /* In case anything used libc syslog(), close this here, too */ + closelog(); + + n_fds = n_socket_fds + n_storage_fds; + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); + } + + if (!context->same_pgrp && + setsid() < 0) { + *exit_status = EXIT_SETSID; + return log_unit_error_errno(unit, errno, "Failed to create new process session: %m"); + } + + exec_context_tty_reset(context, params); + + if (unit_shall_confirm_spawn(unit)) { + const char *vc = params->confirm_spawn; + _cleanup_free_ char *cmdline = NULL; + + cmdline = exec_command_line(command->argv); + if (!cmdline) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = ask_for_confirmation(vc, unit, cmdline); + if (r != CONFIRM_EXECUTE) { + if (r == CONFIRM_PRETEND_SUCCESS) { + *exit_status = EXIT_SUCCESS; + return 0; + } + *exit_status = EXIT_CONFIRM; + return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED), + "Execution cancelled by the user"); + } + } + + /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is + * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note + * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS + * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they + * might internally call into other NSS modules that are involved in hostname resolution, we never know. */ + if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 || + setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? 
"system" : "user", true) != 0) { + *exit_status = EXIT_MEMORY; + return log_unit_error_errno(unit, errno, "Failed to update environment: %m"); + } + + if (context->dynamic_user && dcreds) { + _cleanup_strv_free_ char **suggested_paths = NULL; + + /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS + * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/ + if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, errno, "Failed to update environment: %m"); + } + + r = compile_suggested_paths(context, params, &suggested_paths); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid); + if (r < 0) { + *exit_status = EXIT_USER; + if (r == -EILSEQ) { + log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists."); + return -EOPNOTSUPP; + } + return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m"); + } + + if (!uid_is_valid(uid)) { + *exit_status = EXIT_USER; + log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid); + return -ESRCH; + } + + if (!gid_is_valid(gid)) { + *exit_status = EXIT_USER; + log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid); + return -ESRCH; + } + + if (dcreds->user) + username = dcreds->user->name; + + } else { + r = get_fixed_user(context, &username, &uid, &gid, &home, &shell); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m"); + } + + r = get_fixed_group(context, &groupname, &gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m"); + } + } + + /* Initialize user supplementary groups and get SupplementaryGroups= ones */ + r = 
get_supplementary_groups(context, username, groupname, gid, + &supplementary_gids, &ngids); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m"); + } + + r = send_user_lookup(unit, user_lookup_fd, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m"); + } + + user_lookup_fd = safe_close(user_lookup_fd); + + r = acquire_home(context, uid, &home, &home_buffer); + if (r < 0) { + *exit_status = EXIT_CHDIR; + return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m"); + } + + /* If a socket is connected to STDIN/STDOUT/STDERR, we + * must be sure to drop O_NONBLOCK */ + if (socket_fd >= 0) + (void) fd_nonblock(socket_fd, false); + + /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields. + * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */ + if (params->cgroup_path) { + _cleanup_free_ char *p = NULL; + + r = exec_parameters_get_cgroup_path(params, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); + } + + r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p); + } + } + + if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) { + r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path); + if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path); + } + } + + r = setup_input(context, params, socket_fd, named_iofds); + if (r < 0) { + *exit_status = EXIT_STDIN; + return log_unit_error_errno(unit, r, "Failed to set up standard 
input: %m"); + } + + r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + if (r < 0) { + *exit_status = EXIT_STDOUT; + return log_unit_error_errno(unit, r, "Failed to set up standard output: %m"); + } + + r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + if (r < 0) { + *exit_status = EXIT_STDERR; + return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m"); + } + + if (context->oom_score_adjust_set) { + /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces + * prohibit write access to this file, and we shouldn't trip up over that. */ + r = set_oom_score_adjust(context->oom_score_adjust); + if (ERRNO_IS_PRIVILEGE(r)) + log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m"); + else if (r < 0) { + *exit_status = EXIT_OOM_ADJUST; + return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m"); + } + } + + if (context->coredump_filter_set) { + r = set_coredump_filter(context->coredump_filter); + if (ERRNO_IS_PRIVILEGE(r)) + log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m"); + else if (r < 0) + return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m"); + } + + if (context->nice_set) { + r = setpriority_closest(context->nice); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m"); + } + + if (context->cpu_sched_set) { + struct sched_param param = { + .sched_priority = context->cpu_sched_priority, + }; + + r = sched_setscheduler(0, + context->cpu_sched_policy | + (context->cpu_sched_reset_on_fork ? 
+ SCHED_RESET_ON_FORK : 0), + &param); + if (r < 0) { + *exit_status = EXIT_SETSCHEDULER; + return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m"); + } + } + + if (context->cpu_affinity_from_numa || context->cpu_set.set) { + _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {}; + const CPUSet *cpu_set; + + if (context->cpu_affinity_from_numa) { + r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set); + if (r < 0) { + *exit_status = EXIT_CPUAFFINITY; + return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m"); + } + + cpu_set = &converted_cpu_set; + } else + cpu_set = &context->cpu_set; + + if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) { + *exit_status = EXIT_CPUAFFINITY; + return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m"); + } + } + + if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) { + r = apply_numa_policy(&context->numa_policy); + if (r == -EOPNOTSUPP) + log_unit_debug_errno(unit, r, "NUMA support not available, ignoring."); + else if (r < 0) { + *exit_status = EXIT_NUMA_POLICY; + return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m"); + } + } + + if (context->ioprio_set) + if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) { + *exit_status = EXIT_IOPRIO; + return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m"); + } + + if (context->timer_slack_nsec != NSEC_INFINITY) + if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) { + *exit_status = EXIT_TIMERSLACK; + return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m"); + } + + if (context->personality != PERSONALITY_INVALID) { + r = safe_personality(context->personality); + if (r < 0) { + *exit_status = EXIT_PERSONALITY; + return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m"); + } + } + + if (context->utmp_id) + utmp_put_init_process(context->utmp_id, 
getpid_cached(), getsid(0), + context->tty_path, + context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS : + context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS : + USER_PROCESS, + username); + + if (uid_is_valid(uid)) { + r = chown_terminal(STDIN_FILENO, uid); + if (r < 0) { + *exit_status = EXIT_STDIN; + return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m"); + } + } + + /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 + * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not + * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only + * touch a single hierarchy too. */ + if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) { + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m"); + } + } + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + r = setup_exec_directory(context, params, uid, gid, dt, exit_status); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]); + } + + if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) { + r = setup_credentials(context, params, unit->id, uid); + if (r < 0) { + *exit_status = EXIT_CREDENTIALS; + return log_unit_error_errno(unit, r, "Failed to set up credentials: %m"); + } + } + + r = build_environment( + unit, + context, + params, + n_fds, + home, + username, + shell, + journal_stream_dev, + journal_stream_ino, + &our_env); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = build_pass_environment(context, &pass_env); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + accum_env = strv_env_merge(5, + 
params->environment, + our_env, + pass_env, + context->environment, + files_env); + if (!accum_env) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + accum_env = strv_env_clean(accum_env); + + (void) umask(context->umask); + + r = setup_keyring(unit, context, params, uid, gid); + if (r < 0) { + *exit_status = EXIT_KEYRING; + return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m"); + } + + /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */ + needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); + + /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */ + needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported(); + + /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */ + if (needs_ambient_hack) + needs_setuid = false; + else + needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID)); + + if (needs_sandboxing) { + /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being + * present. The actual MAC context application will happen later, as late as possible, to avoid + * impacting our own code paths. */ + +#if HAVE_SELINUX + use_selinux = mac_selinux_use(); +#endif +#if ENABLE_SMACK + use_smack = mac_smack_use(); +#endif +#if HAVE_APPARMOR + use_apparmor = mac_apparmor_use(); +#endif + } + + if (needs_sandboxing) { + int which_failed; + + /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what + * is set here. (See below.) 
*/ + + r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed); + if (r < 0) { + *exit_status = EXIT_LIMITS; + return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + } + } + + if (needs_setuid && context->pam_name && username) { + /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits + * wins here. (See above.) */ + + r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); + if (r < 0) { + *exit_status = EXIT_PAM; + return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m"); + } + + ngids_after_pam = getgroups_alloc(&gids_after_pam); + if (ngids_after_pam < 0) { + *exit_status = EXIT_MEMORY; + return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m"); + } + } + + if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) { + /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. + * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to + * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. 
*/ + + userns_set_up = true; + r = setup_private_users(saved_uid, saved_gid, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m"); + } + } + + if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) { + + if (ns_type_supported(NAMESPACE_NET)) { + r = setup_netns(runtime->netns_storage_socket); + if (r == -EPERM) + log_unit_warning_errno(unit, r, + "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m"); + else if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m"); + } + } else if (context->network_namespace_path) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), + "NetworkNamespacePath= is not supported, refusing."); + } else + log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); + } + + needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime); + if (needs_mount_namespace) { + _cleanup_free_ char *error_path = NULL; + + r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m", + error_path ? ": " : "", strempty(error_path)); + } + } + + if (needs_sandboxing) { + r = apply_protect_hostname(unit, context, exit_status); + if (r < 0) + return r; + } + + /* Drop groups as early as possible. + * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root. + * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. 
*/ + if (needs_setuid) { + _cleanup_free_ gid_t *gids_to_enforce = NULL; + int ngids_to_enforce = 0; + + ngids_to_enforce = merge_gid_lists(supplementary_gids, + ngids, + gids_after_pam, + ngids_after_pam, + &gids_to_enforce); + if (ngids_to_enforce < 0) { + *exit_status = EXIT_MEMORY; + return log_unit_error_errno(unit, + ngids_to_enforce, + "Failed to merge group lists. Group membership might be incorrect: %m"); + } + + r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce); + if (r < 0) { + *exit_status = EXIT_GROUP; + return log_unit_error_errno(unit, r, "Changing group credentials failed: %m"); + } + } + + /* If the user namespace was not set up above, try to do it now. + * It's preferred to set up the user namespace later (after all other namespaces) so as not to be + * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the + * case of mount namespaces being less privileged when the mount point list is copied from a + * different user namespace). */ + + if (needs_sandboxing && context->private_users && !userns_set_up) { + r = setup_private_users(saved_uid, saved_gid, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m"); + } + } + + /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we + * shall execute. 
*/ + + _cleanup_free_ char *executable = NULL; + r = find_executable_full(command->path, false, &executable); + if (r < 0) { + if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { + log_struct_errno(LOG_INFO, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), + LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m", + command->path), + "EXECUTABLE=%s", command->path); + return 0; + } + + *exit_status = EXIT_EXEC; + return log_struct_errno(LOG_INFO, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), + LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m", + command->path), + "EXECUTABLE=%s", command->path); + } + +#if HAVE_SELINUX + if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) { + r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m"); + } + } +#endif + + /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are + * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd + * however if we have it as we want to keep it open until the final execve(). */ + + if (params->exec_fd >= 0) { + exec_fd = params->exec_fd; + + if (exec_fd < 3 + (int) n_fds) { + int moved_fd; + + /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the + * process we are about to execute. */ + + moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); + if (moved_fd < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); + } + + CLOSE_AND_REPLACE(exec_fd, moved_fd); + } else { + /* This fd should be FD_CLOEXEC already, but let's make sure. 
*/ + r = fd_cloexec(exec_fd, true); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); + } + } + + fds_with_exec_fd = newa(int, n_fds + 1); + memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int)); + fds_with_exec_fd[n_fds] = exec_fd; + n_fds_with_exec_fd = n_fds + 1; + } else { + fds_with_exec_fd = fds; + n_fds_with_exec_fd = n_fds; + } + + r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); + if (r >= 0) + r = shift_fds(fds, n_fds); + if (r >= 0) + r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m"); + } + + /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off + * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined, + * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we + * came this far. */ + + secure_bits = context->secure_bits; + + if (needs_sandboxing) { + uint64_t bset; + + /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly + * requested. (Note this is placed after the general resource limit initialization, see + * above, in order to take precedence.) */ + if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) { + if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) { + *exit_status = EXIT_LIMITS; + return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m"); + } + } + +#if ENABLE_SMACK + /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the + * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. 
*/ + if (use_smack) { + r = setup_smack(context, executable); + if (r < 0) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m"); + } + } +#endif + + bset = context->capability_bounding_set; + /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for + * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own, + * instead of us doing that */ + if (needs_ambient_hack) + bset |= (UINT64_C(1) << CAP_SETPCAP) | + (UINT64_C(1) << CAP_SETUID) | + (UINT64_C(1) << CAP_SETGID); + + if (!cap_test_all(bset)) { + r = capability_bounding_set_drop(bset, false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m"); + } + } + + /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with + * keep-caps set. + * To be able to raise the ambient capabilities after setresuid() they have to be + * added to the inherited set and keep caps has to be set (done in enforce_user()). + * After setresuid() the ambient capabilities can be raised as they are present in + * the permitted and inheritable set. However it is possible that someone wants to + * set ambient capabilities without changing the user, so we also set the ambient + * capabilities here. + * The requested ambient capabilities are raised in the inheritable set if the + * second argument is true. 
*/ + if (!needs_ambient_hack) { + r = capability_ambient_set_apply(context->capability_ambient_set, true); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m"); + } + } + } + + /* chroot to root directory first, before we lose the ability to chroot */ + r = apply_root_directory(context, params, needs_mount_namespace, exit_status); + if (r < 0) + return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m"); + + if (needs_setuid) { + if (uid_is_valid(uid)) { + r = enforce_user(context, uid); + if (r < 0) { + *exit_status = EXIT_USER; + return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid); + } + + if (!needs_ambient_hack && + context->capability_ambient_set != 0) { + + /* Raise the ambient capabilities after user change. */ + r = capability_ambient_set_apply(context->capability_ambient_set, false); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m"); + } + } + } + } + + /* Apply working directory here, because the working directory might be on NFS and only the user running + * this service might have the correct privilege to change to the working directory */ + r = apply_working_directory(context, params, home, exit_status); + if (r < 0) + return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m"); + + if (needs_sandboxing) { + /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to + * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires + * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls + * are restricted. 
*/ + +#if HAVE_SELINUX + if (use_selinux) { + char *exec_context = mac_selinux_context_net ?: context->selinux_context; + + if (exec_context) { + r = setexeccon(exec_context); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context); + } + } + } +#endif + +#if HAVE_APPARMOR + if (use_apparmor && context->apparmor_profile) { + r = aa_change_onexec(context->apparmor_profile); + if (r < 0 && !context->apparmor_profile_ignore) { + *exit_status = EXIT_APPARMOR_PROFILE; + return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile); + } + } +#endif + + /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs + * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires + * CAP_SETPCAP. */ + if (prctl(PR_GET_SECUREBITS) != secure_bits) { + /* CAP_SETPCAP is required to set securebits. This capability is raised into the + * effective set here. 
+ * The effective set is overwritten during execve with the following values: + * - ambient set (for non-root processes) + * - (inheritable | bounding) set for root processes) + * + * Hence there is no security impact to raise it in the effective set before execve + */ + r = capability_gain_cap_setpcap(NULL); + if (r < 0) { + *exit_status = EXIT_CAPABILITIES; + return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits"); + } + if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) { + *exit_status = EXIT_SECUREBITS; + return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m"); + } + } + + if (context_has_no_new_privileges(context)) + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + *exit_status = EXIT_NO_NEW_PRIVILEGES; + return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m"); + } + +#if HAVE_SECCOMP + r = apply_address_families(unit, context); + if (r < 0) { + *exit_status = EXIT_ADDRESS_FAMILIES; + return log_unit_error_errno(unit, r, "Failed to restrict address families: %m"); + } + + r = apply_memory_deny_write_execute(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m"); + } + + r = apply_restrict_realtime(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m"); + } + + r = apply_restrict_suid_sgid(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m"); + } + + r = apply_restrict_namespaces(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m"); + } + + r = apply_protect_sysctl(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: 
%m"); + } + + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m"); + } + + r = apply_protect_kernel_logs(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m"); + } + + r = apply_protect_clock(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m"); + } + + r = apply_private_devices(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to set up private devices: %m"); + } + + r = apply_syscall_archs(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m"); + } + + r = apply_lock_personality(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to lock personalities: %m"); + } + + r = apply_syscall_log(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m"); + } + + /* This really should remain the last step before the execve(), to make sure our own code is unaffected + * by the filter as little as possible. 
*/ + r = apply_syscall_filter(unit, context, needs_ambient_hack); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m"); + } +#endif + } + + if (!strv_isempty(context->unset_environment)) { + char **ee = NULL; + + ee = strv_env_delete(accum_env, 1, context->unset_environment); + if (!ee) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + strv_free_and_replace(accum_env, ee); + } + + if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) { + replaced_argv = replace_env_argv(command->argv, accum_env); + if (!replaced_argv) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + final_argv = replaced_argv; + } else + final_argv = command->argv; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *line; + + line = exec_command_line(final_argv); + if (line) + log_struct(LOG_DEBUG, + "EXECUTABLE=%s", executable, + LOG_UNIT_MESSAGE(unit, "Executing: %s", line), + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit)); + } + + if (exec_fd >= 0) { + uint8_t hot = 1; + + /* We have finished with all our initializations. Let's now let the manager know that. From this point + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m"); + } + } + + execve(executable, final_argv, accum_env); + r = -errno; + + if (exec_fd >= 0) { + uint8_t hot = 0; + + /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager + * that POLLHUP on it no longer means execve() succeeded. 
*/ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m"); + } + } + + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable); +} + +static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); +static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); + +int exec_spawn(Unit *unit, + ExecCommand *command, + const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime, + DynamicCreds *dcreds, + pid_t *ret) { + + int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL; + _cleanup_free_ char *subcgroup_path = NULL; + _cleanup_strv_free_ char **files_env = NULL; + size_t n_storage_fds = 0, n_socket_fds = 0; + _cleanup_free_ char *line = NULL; + pid_t pid; + + assert(unit); + assert(command); + assert(context); + assert(ret); + assert(params); + assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0)); + + if (context->std_input == EXEC_INPUT_SOCKET || + context->std_output == EXEC_OUTPUT_SOCKET || + context->std_error == EXEC_OUTPUT_SOCKET) { + + if (params->n_socket_fds > 1) { + log_unit_error(unit, "Got more than one socket."); + return -EINVAL; + } + + if (params->n_socket_fds == 0) { + log_unit_error(unit, "Got no socket."); + return -EINVAL; + } + + socket_fd = params->fds[0]; + } else { + socket_fd = -1; + fds = params->fds; + n_socket_fds = params->n_socket_fds; + n_storage_fds = params->n_storage_fds; + } + + r = exec_context_named_iofds(context, params, named_iofds); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m"); + + r = exec_context_load_environment(unit, context, &files_env); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to load environment files: %m"); + + line = exec_command_line(command->argv); + if (!line) + return 
log_oom(); + + /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db + and, until the next SELinux policy changes, we save further reloads in future children. */ + mac_selinux_maybe_reload(); + + log_struct(LOG_DEBUG, + LOG_UNIT_MESSAGE(unit, "About to execute %s", line), + "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create + the mount namespace in the child, but we want to log + from the parent, so we need to use the (possibly + inaccurate) path here. */ + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit)); + + if (params->cgroup_path) { + r = exec_parameters_get_cgroup_path(params, &subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); + if (r > 0) { /* We are using a child cgroup */ + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); + } + } + + pid = fork(); + if (pid < 0) + return log_unit_error_errno(unit, errno, "Failed to fork: %m"); + + if (pid == 0) { + int exit_status = EXIT_SUCCESS; + + r = exec_child(unit, + command, + context, + params, + runtime, + dcreds, + socket_fd, + named_iofds, + fds, + n_socket_fds, + n_storage_fds, + files_env, + unit->manager->user_lookup_fds[1], + &exit_status); + + if (r < 0) { + const char *status = + exit_status_to_string(exit_status, + EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD); + + log_struct_errno(LOG_ERR, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), + LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m", + status, command->path), + "EXECUTABLE=%s", command->path); + } + + _exit(exit_status); + } + + log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid); + + /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever + * executed outside of the cgroup) and 
in the parent (so that we can be sure that when we kill the cgroup the + * process will be killed too). */ + if (subcgroup_path) + (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid); + + exec_status_start(&command->exec_status, pid); + + *ret = pid; + return 0; +} + +void exec_context_init(ExecContext *c) { + assert(c); + + c->umask = 0022; + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0); + c->cpu_sched_policy = SCHED_OTHER; + c->syslog_priority = LOG_DAEMON|LOG_INFO; + c->syslog_level_prefix = true; + c->ignore_sigpipe = true; + c->timer_slack_nsec = NSEC_INFINITY; + c->personality = PERSONALITY_INVALID; + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) + c->directories[t].mode = 0755; + c->timeout_clean_usec = USEC_INFINITY; + c->capability_bounding_set = CAP_ALL; + assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL); + c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL; + c->log_level_max = -1; +#if HAVE_SECCOMP + c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL; +#endif + numa_policy_reset(&c->numa_policy); +} + +void exec_context_done(ExecContext *c) { + assert(c); + + c->environment = strv_free(c->environment); + c->environment_files = strv_free(c->environment_files); + c->pass_environment = strv_free(c->pass_environment); + c->unset_environment = strv_free(c->unset_environment); + + rlimit_free_all(c->rlimit); + + for (size_t l = 0; l < 3; l++) { + c->stdio_fdname[l] = mfree(c->stdio_fdname[l]); + c->stdio_file[l] = mfree(c->stdio_file[l]); + } + + c->working_directory = mfree(c->working_directory); + c->root_directory = mfree(c->root_directory); + c->root_image = mfree(c->root_image); + c->root_image_options = mount_options_free_all(c->root_image_options); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + c->root_hash_path = mfree(c->root_hash_path); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + c->root_verity = 
mfree(c->root_verity); + c->tty_path = mfree(c->tty_path); + c->syslog_identifier = mfree(c->syslog_identifier); + c->user = mfree(c->user); + c->group = mfree(c->group); + + c->supplementary_groups = strv_free(c->supplementary_groups); + + c->pam_name = mfree(c->pam_name); + + c->read_only_paths = strv_free(c->read_only_paths); + c->read_write_paths = strv_free(c->read_write_paths); + c->inaccessible_paths = strv_free(c->inaccessible_paths); + + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images); + + cpu_set_reset(&c->cpu_set); + numa_policy_reset(&c->numa_policy); + + c->utmp_id = mfree(c->utmp_id); + c->selinux_context = mfree(c->selinux_context); + c->apparmor_profile = mfree(c->apparmor_profile); + c->smack_process_label = mfree(c->smack_process_label); + + c->syscall_filter = hashmap_free(c->syscall_filter); + c->syscall_archs = set_free(c->syscall_archs); + c->address_families = set_free(c->address_families); + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) + c->directories[t].paths = strv_free(c->directories[t].paths); + + c->log_level_max = -1; + + exec_context_free_log_extra_fields(c); + + c->log_ratelimit_interval_usec = 0; + c->log_ratelimit_burst = 0; + + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + + c->network_namespace_path = mfree(c->network_namespace_path); + + c->log_namespace = mfree(c->log_namespace); + + c->load_credentials = strv_free(c->load_credentials); + c->set_credentials = hashmap_free(c->set_credentials); +} + +int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) { + char **i; + + assert(c); + + if (!runtime_prefix) + return 0; + + STRV_FOREACH(i, 
c->directories[EXEC_DIRECTORY_RUNTIME].paths) { + _cleanup_free_ char *p; + + if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME)) + p = path_join(runtime_prefix, "private", *i); + else + p = path_join(runtime_prefix, *i); + if (!p) + return -ENOMEM; + + /* We execute this synchronously, since we need to be sure this is gone when we start the + * service next. */ + (void) rm_rf(p, REMOVE_ROOT); + } + + return 0; +} + +int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) { + _cleanup_free_ char *p = NULL; + + assert(c); + + if (!runtime_prefix || !unit) + return 0; + + p = path_join(runtime_prefix, "credentials", unit); + if (!p) + return -ENOMEM; + + /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to + * unmount it, and afterwards remove the mount point */ + (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW); + (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD); + + return 0; +} + +static void exec_command_done(ExecCommand *c) { + assert(c); + + c->path = mfree(c->path); + c->argv = strv_free(c->argv); +} + +void exec_command_done_array(ExecCommand *c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) + exec_command_done(c+i); +} + +ExecCommand* exec_command_free_list(ExecCommand *c) { + ExecCommand *i; + + while ((i = c)) { + LIST_REMOVE(command, c, i); + exec_command_done(i); + free(i); + } + + return NULL; +} + +void exec_command_free_array(ExecCommand **c, size_t n) { + for (size_t i = 0; i < n; i++) + c[i] = exec_command_free_list(c[i]); +} + +void exec_command_reset_status_array(ExecCommand *c, size_t n) { + for (size_t i = 0; i < n; i++) + exec_status_reset(&c[i].exec_status); +} + +void exec_command_reset_status_list_array(ExecCommand **c, size_t n) { + for (size_t i = 0; i < n; i++) { + ExecCommand *z; + + LIST_FOREACH(command, z, c[i]) + exec_status_reset(&z->exec_status); + } +} + +typedef struct InvalidEnvInfo { + const Unit *unit; + const char *path; +} 
InvalidEnvInfo; + +static void invalid_env(const char *p, void *userdata) { + InvalidEnvInfo *info = userdata; + + log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path); +} + +const char* exec_context_fdname(const ExecContext *c, int fd_index) { + assert(c); + + switch (fd_index) { + + case STDIN_FILENO: + if (c->std_input != EXEC_INPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDIN_FILENO] ?: "stdin"; + + case STDOUT_FILENO: + if (c->std_output != EXEC_OUTPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDOUT_FILENO] ?: "stdout"; + + case STDERR_FILENO: + if (c->std_error != EXEC_OUTPUT_NAMED_FD) + return NULL; + + return c->stdio_fdname[STDERR_FILENO] ?: "stderr"; + + default: + return NULL; + } +} + +static int exec_context_named_iofds( + const ExecContext *c, + const ExecParameters *p, + int named_iofds[static 3]) { + + size_t targets; + const char* stdio_fdname[3]; + size_t n_fds; + + assert(c); + assert(p); + assert(named_iofds); + + targets = (c->std_input == EXEC_INPUT_NAMED_FD) + + (c->std_output == EXEC_OUTPUT_NAMED_FD) + + (c->std_error == EXEC_OUTPUT_NAMED_FD); + + for (size_t i = 0; i < 3; i++) + stdio_fdname[i] = exec_context_fdname(c, i); + + n_fds = p->n_storage_fds + p->n_socket_fds; + + for (size_t i = 0; i < n_fds && targets > 0; i++) + if (named_iofds[STDIN_FILENO] < 0 && + c->std_input == EXEC_INPUT_NAMED_FD && + stdio_fdname[STDIN_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) { + + named_iofds[STDIN_FILENO] = p->fds[i]; + targets--; + + } else if (named_iofds[STDOUT_FILENO] < 0 && + c->std_output == EXEC_OUTPUT_NAMED_FD && + stdio_fdname[STDOUT_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) { + + named_iofds[STDOUT_FILENO] = p->fds[i]; + targets--; + + } else if (named_iofds[STDERR_FILENO] < 0 && + c->std_error == EXEC_OUTPUT_NAMED_FD && + stdio_fdname[STDERR_FILENO] && + streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) { + + 
named_iofds[STDERR_FILENO] = p->fds[i]; + targets--; + } + + return targets == 0 ? 0 : -ENOENT; +} + +static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) { + char **i, **r = NULL; + + assert(c); + assert(l); + + STRV_FOREACH(i, c->environment_files) { + char *fn; + int k; + bool ignore = false; + char **p; + _cleanup_globfree_ glob_t pglob = {}; + + fn = *i; + + if (fn[0] == '-') { + ignore = true; + fn++; + } + + if (!path_is_absolute(fn)) { + if (ignore) + continue; + + strv_free(r); + return -EINVAL; + } + + /* Filename supports globbing, take all matching files */ + k = safe_glob(fn, 0, &pglob); + if (k < 0) { + if (ignore) + continue; + + strv_free(r); + return k; + } + + /* When we don't match anything, -ENOENT should be returned */ + assert(pglob.gl_pathc > 0); + + for (unsigned n = 0; n < pglob.gl_pathc; n++) { + k = load_env_file(NULL, pglob.gl_pathv[n], &p); + if (k < 0) { + if (ignore) + continue; + + strv_free(r); + return k; + } + /* Log invalid environment variables with filename */ + if (p) { + InvalidEnvInfo info = { + .unit = unit, + .path = pglob.gl_pathv[n] + }; + + p = strv_env_clean_with_callback(p, invalid_env, &info); + } + + if (!r) + r = p; + else { + char **m; + + m = strv_env_merge(2, r, p); + strv_free(r); + strv_free(p); + if (!m) + return -ENOMEM; + + r = m; + } + } + } + + *l = r; + + return 0; +} + +static bool tty_may_match_dev_console(const char *tty) { + _cleanup_free_ char *resolved = NULL; + + if (!tty) + return true; + + tty = skip_dev_prefix(tty); + + /* trivial identity? 
*/ + if (streq(tty, "console")) + return true; + + if (resolve_dev_console(&resolved) < 0) + return true; /* if we could not resolve, assume it may */ + + /* "tty0" means the active VC, so it may be the same sometimes */ + return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty)); +} + +static bool exec_context_may_touch_tty(const ExecContext *ec) { + assert(ec); + + return ec->tty_reset || + ec->tty_vhangup || + ec->tty_vt_disallocate || + is_terminal_input(ec->std_input) || + is_terminal_output(ec->std_output) || + is_terminal_output(ec->std_error); +} + +bool exec_context_may_touch_console(const ExecContext *ec) { + + return exec_context_may_touch_tty(ec) && + tty_may_match_dev_console(exec_context_tty_path(ec)); +} + +static void strv_fprintf(FILE *f, char **l) { + char **g; + + assert(f); + + STRV_FOREACH(g, l) + fprintf(f, " %s", *g); +} + +void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { + char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX]; + int r; + + assert(c); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%sUMask: %04o\n" + "%sWorkingDirectory: %s\n" + "%sRootDirectory: %s\n" + "%sNonBlocking: %s\n" + "%sPrivateTmp: %s\n" + "%sPrivateDevices: %s\n" + "%sProtectKernelTunables: %s\n" + "%sProtectKernelModules: %s\n" + "%sProtectKernelLogs: %s\n" + "%sProtectClock: %s\n" + "%sProtectControlGroups: %s\n" + "%sPrivateNetwork: %s\n" + "%sPrivateUsers: %s\n" + "%sProtectHome: %s\n" + "%sProtectSystem: %s\n" + "%sMountAPIVFS: %s\n" + "%sIgnoreSIGPIPE: %s\n" + "%sMemoryDenyWriteExecute: %s\n" + "%sRestrictRealtime: %s\n" + "%sRestrictSUIDSGID: %s\n" + "%sKeyringMode: %s\n" + "%sProtectHostname: %s\n" + "%sProtectProc: %s\n" + "%sProcSubset: %s\n", + prefix, c->umask, + prefix, empty_to_root(c->working_directory), + prefix, empty_to_root(c->root_directory), + prefix, yes_no(c->non_blocking), + prefix, yes_no(c->private_tmp), + prefix, yes_no(c->private_devices), + prefix, 
yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_kernel_modules), + prefix, yes_no(c->protect_kernel_logs), + prefix, yes_no(c->protect_clock), + prefix, yes_no(c->protect_control_groups), + prefix, yes_no(c->private_network), + prefix, yes_no(c->private_users), + prefix, protect_home_to_string(c->protect_home), + prefix, protect_system_to_string(c->protect_system), + prefix, yes_no(exec_context_get_effective_mount_apivfs(c)), + prefix, yes_no(c->ignore_sigpipe), + prefix, yes_no(c->memory_deny_write_execute), + prefix, yes_no(c->restrict_realtime), + prefix, yes_no(c->restrict_suid_sgid), + prefix, exec_keyring_mode_to_string(c->keyring_mode), + prefix, yes_no(c->protect_hostname), + prefix, protect_proc_to_string(c->protect_proc), + prefix, proc_subset_to_string(c->proc_subset)); + + if (c->root_image) + fprintf(f, "%sRootImage: %s\n", prefix, c->root_image); + + if (c->root_image_options) { + MountOptions *o; + + /* One " designator:options" pair per list entry; entries with empty options are skipped */ + fprintf(f, "%sRootImageOptions:", prefix); + LIST_FOREACH(mount_options, o, c->root_image_options) + if (!isempty(o->options)) + fprintf(f, " %s:%s", + partition_designator_to_string(o->partition_designator), + o->options); + fprintf(f, "\n"); + } + + /* An inline root hash is dumped hex-encoded; nothing is printed if the encoding fails */ + if (c->root_hash) { + _cleanup_free_ char *encoded = NULL; + encoded = hexmem(c->root_hash, c->root_hash_size); + if (encoded) + fprintf(f, "%sRootHash: %s\n", prefix, encoded); + } + + if (c->root_hash_path) + fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path); + + if (c->root_hash_sig) { + _cleanup_free_ char *encoded = NULL; + ssize_t len; + len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded); + /* base64mem() returns a negative errno on failure and then leaves 'encoded' NULL, so only + * print when we actually got an encoded string. */ + if (len > 0) + fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded); + } + + if (c->root_hash_sig_path) + fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path); + + if (c->root_verity) + fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity); + + STRV_FOREACH(e, c->environment) + fprintf(f, "%sEnvironment: %s\n", prefix, *e); + + STRV_FOREACH(e, 
c->environment_files) + fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e); + + STRV_FOREACH(e, c->pass_environment) + fprintf(f, "%sPassEnvironment: %s\n", prefix, *e); + + STRV_FOREACH(e, c->unset_environment) + fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e); + + fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode)); + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode); + + STRV_FOREACH(d, c->directories[dt].paths) + fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d); + } + + fprintf(f, + "%sTimeoutCleanSec: %s\n", + prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC)); + + if (c->nice_set) + fprintf(f, + "%sNice: %i\n", + prefix, c->nice); + + if (c->oom_score_adjust_set) + fprintf(f, + "%sOOMScoreAdjust: %i\n", + prefix, c->oom_score_adjust); + + if (c->coredump_filter_set) + fprintf(f, + "%sCoredumpFilter: 0x%"PRIx64"\n", + prefix, c->coredump_filter); + + for (unsigned i = 0; i < RLIM_NLIMITS; i++) + if (c->rlimit[i]) { + fprintf(f, "%sLimit%s: " RLIM_FMT "\n", + prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max); + fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n", + prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur); + } + + if (c->ioprio_set) { + _cleanup_free_ char *class_str = NULL; + + r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str); + if (r >= 0) + fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str); + + fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio)); + } + + if (c->cpu_sched_set) { + _cleanup_free_ char *policy_str = NULL; + + r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str); + if (r >= 0) + fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str); + + fprintf(f, + "%sCPUSchedulingPriority: %i\n" + "%sCPUSchedulingResetOnFork: 
%s\n", + prefix, c->cpu_sched_priority, + prefix, yes_no(c->cpu_sched_reset_on_fork)); + } + + if (c->cpu_set.set) { + _cleanup_free_ char *affinity = NULL; + + affinity = cpu_set_to_range_string(&c->cpu_set); + /* The range string can come back NULL (the NUMAMask branch below guards against the same + * thing), so never hand a NULL pointer to fprintf()'s %s. */ + fprintf(f, "%sCPUAffinity: %s\n", prefix, strnull(affinity)); + } + + if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) { + _cleanup_free_ char *nodes = NULL; + + nodes = cpu_set_to_range_string(&c->numa_policy.nodes); + fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy))); + fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes)); + } + + if (c->timer_slack_nsec != NSEC_INFINITY) + fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec); + + fprintf(f, + "%sStandardInput: %s\n" + "%sStandardOutput: %s\n" + "%sStandardError: %s\n", + prefix, exec_input_to_string(c->std_input), + prefix, exec_output_to_string(c->std_output), + prefix, exec_output_to_string(c->std_error)); + + /* c->stdio_fdname[] is NULL when the default name is in effect. Go through exec_context_fdname() + * so that the effective name ("stdin", "stdout", "stderr") is shown instead of passing NULL + * to fprintf(). */ + if (c->std_input == EXEC_INPUT_NAMED_FD) + fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, exec_context_fdname(c, STDIN_FILENO)); + if (c->std_output == EXEC_OUTPUT_NAMED_FD) + fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, exec_context_fdname(c, STDOUT_FILENO)); + if (c->std_error == EXEC_OUTPUT_NAMED_FD) + fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, exec_context_fdname(c, STDERR_FILENO)); + + if (c->std_input == EXEC_INPUT_FILE) + fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE) + fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE) + fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, 
c->stdio_file[STDERR_FILENO]); + + if (c->tty_path) + fprintf(f, + "%sTTYPath: %s\n" + "%sTTYReset: %s\n" + "%sTTYVHangup: %s\n" + "%sTTYVTDisallocate: %s\n", + prefix, c->tty_path, + prefix, yes_no(c->tty_reset), + prefix, yes_no(c->tty_vhangup), + prefix, yes_no(c->tty_vt_disallocate)); + + if (IN_SET(c->std_output, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE) || + IN_SET(c->std_error, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) { + + _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL; + + r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str); + if (r >= 0) + fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str); + + r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str); + if (r >= 0) + fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str); + } + + if (c->log_level_max >= 0) { + _cleanup_free_ char *t = NULL; + + (void) log_level_to_string_alloc(c->log_level_max, &t); + + fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t)); + } + + if (c->log_ratelimit_interval_usec > 0) { + char buf_timespan[FORMAT_TIMESPAN_MAX]; + + fprintf(f, + "%sLogRateLimitIntervalSec: %s\n", + prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC)); + } + + if (c->log_ratelimit_burst > 0) + fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst); + + for (size_t j = 0; j < c->n_log_extra_fields; j++) { + fprintf(f, "%sLogExtraFields: ", prefix); + fwrite(c->log_extra_fields[j].iov_base, + 1, c->log_extra_fields[j].iov_len, + f); + fputc('\n', f); + } + + if (c->log_namespace) + fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace); + + if (c->secure_bits) { + _cleanup_free_ char *str = NULL; + + r = secure_bits_to_string_alloc(c->secure_bits, &str); + if (r >= 0) + fprintf(f, "%sSecure Bits: %s\n", prefix, str); + } + + if (c->capability_bounding_set 
!= CAP_ALL) { + _cleanup_free_ char *str = NULL; + + r = capability_set_to_string_alloc(c->capability_bounding_set, &str); + if (r >= 0) + fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str); + } + + if (c->capability_ambient_set != 0) { + _cleanup_free_ char *str = NULL; + + r = capability_set_to_string_alloc(c->capability_ambient_set, &str); + if (r >= 0) + fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str); + } + + if (c->user) + fprintf(f, "%sUser: %s\n", prefix, c->user); + if (c->group) + fprintf(f, "%sGroup: %s\n", prefix, c->group); + + fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user)); + + if (!strv_isempty(c->supplementary_groups)) { + fprintf(f, "%sSupplementaryGroups:", prefix); + strv_fprintf(f, c->supplementary_groups); + fputs("\n", f); + } + + if (c->pam_name) + fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name); + + if (!strv_isempty(c->read_write_paths)) { + fprintf(f, "%sReadWritePaths:", prefix); + strv_fprintf(f, c->read_write_paths); + fputs("\n", f); + } + + if (!strv_isempty(c->read_only_paths)) { + fprintf(f, "%sReadOnlyPaths:", prefix); + strv_fprintf(f, c->read_only_paths); + fputs("\n", f); + } + + if (!strv_isempty(c->inaccessible_paths)) { + fprintf(f, "%sInaccessiblePaths:", prefix); + strv_fprintf(f, c->inaccessible_paths); + fputs("\n", f); + } + + for (size_t i = 0; i < c->n_bind_mounts; i++) + fprintf(f, "%s%s: %s%s:%s:%s\n", prefix, + c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths", + c->bind_mounts[i].ignore_enoent ? "-": "", + c->bind_mounts[i].source, + c->bind_mounts[i].destination, + c->bind_mounts[i].recursive ? "rbind" : "norbind"); + + for (size_t i = 0; i < c->n_temporary_filesystems; i++) { + const TemporaryFileSystem *t = c->temporary_filesystems + i; + + fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix, + t->path, + isempty(t->options) ? 
"" : ":", + strempty(t->options)); + } + + if (c->utmp_id) + fprintf(f, + "%sUtmpIdentifier: %s\n", + prefix, c->utmp_id); + + if (c->selinux_context) + fprintf(f, + "%sSELinuxContext: %s%s\n", + prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context); + + if (c->apparmor_profile) + fprintf(f, + "%sAppArmorProfile: %s%s\n", + prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile); + + if (c->smack_process_label) + fprintf(f, + "%sSmackProcessLabel: %s%s\n", + prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label); + + if (c->personality != PERSONALITY_INVALID) + fprintf(f, + "%sPersonality: %s\n", + prefix, strna(personality_to_string(c->personality))); + + fprintf(f, + "%sLockPersonality: %s\n", + prefix, yes_no(c->lock_personality)); + + if (c->syscall_filter) { +#if HAVE_SECCOMP + void *id, *val; + bool first = true; +#endif + + fprintf(f, + "%sSystemCallFilter: ", + prefix); + + if (!c->syscall_allow_list) + fputc('~', f); + +#if HAVE_SECCOMP + HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) { + _cleanup_free_ char *name = NULL; + const char *errno_name = NULL; + int num = PTR_TO_INT(val); + + if (first) + first = false; + else + fputc(' ', f); + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + fputs(strna(name), f); + + if (num >= 0) { + errno_name = seccomp_errno_or_action_to_string(num); + if (errno_name) + fprintf(f, ":%s", errno_name); + else + fprintf(f, ":%d", num); + } + } +#endif + + fputc('\n', f); + } + + if (c->syscall_archs) { +#if HAVE_SECCOMP + void *id; +#endif + + fprintf(f, + "%sSystemCallArchitectures:", + prefix); + +#if HAVE_SECCOMP + SET_FOREACH(id, c->syscall_archs) + fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1))); +#endif + fputc('\n', f); + } + + if (exec_context_restrict_namespaces_set(c)) { + _cleanup_free_ char *s = NULL; + + r = namespace_flags_to_string(c->restrict_namespaces, &s); + if (r >= 0) + fprintf(f, "%sRestrictNamespaces: 
%s\n", + prefix, strna(s)); + } + + if (c->network_namespace_path) + fprintf(f, + "%sNetworkNamespacePath: %s\n", + prefix, c->network_namespace_path); + + if (c->syscall_errno > 0) { +#if HAVE_SECCOMP + const char *errno_name; +#endif + + fprintf(f, "%sSystemCallErrorNumber: ", prefix); + +#if HAVE_SECCOMP + errno_name = seccomp_errno_or_action_to_string(c->syscall_errno); + if (errno_name) + fputs(errno_name, f); + else + fprintf(f, "%d", c->syscall_errno); +#endif + fputc('\n', f); + } + + for (size_t i = 0; i < c->n_mount_images; i++) { + MountOptions *o; + + fprintf(f, "%sMountImages: %s%s:%s%s", prefix, + c->mount_images[i].ignore_enoent ? "-": "", + c->mount_images[i].source, + c->mount_images[i].destination, + LIST_IS_EMPTY(c->mount_images[i].mount_options) ? "": ":"); + LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options) + fprintf(f, "%s:%s", + partition_designator_to_string(o->partition_designator), + o->options); + fprintf(f, "\n"); + } +} + +bool exec_context_maintains_privileges(const ExecContext *c) { + assert(c); + + /* Returns true if the process forked off would run under + * an unchanged UID or as root. 
*/ + + if (!c->user) + return true; + + if (streq(c->user, "root") || streq(c->user, "0")) + return true; + + return false; +} + +/* Returns the I/O priority that applies to this context: the configured c->ioprio if c->ioprio_set, otherwise the result of ioprio_get(IOPRIO_WHO_PROCESS, 0), falling back to best-effort class with data value 4 if that call fails. */ int exec_context_get_effective_ioprio(const ExecContext *c) { + int p; + + assert(c); + + if (c->ioprio_set) + return c->ioprio; + + p = ioprio_get(IOPRIO_WHO_PROCESS, 0); + if (p < 0) + return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4); + + return p; +} + +/* Returns the effective MountAPIVFS value: the explicit setting if one was made, otherwise true exactly when exec_context_with_rootfs() reports a root directory or image. */ bool exec_context_get_effective_mount_apivfs(const ExecContext *c) { + assert(c); + + /* Explicit setting wins */ + if (c->mount_apivfs_set) + return c->mount_apivfs; + + /* Default to "yes" if root directory or image are specified */ + if (exec_context_with_rootfs(c)) + return true; + + return false; +} + +/* Frees the iov_base buffer of every entry in c->log_extra_fields, then the array itself, and resets the entry count to 0. */ void exec_context_free_log_extra_fields(ExecContext *c) { + assert(c); + + for (size_t l = 0; l < c->n_log_extra_fields; l++) + free(c->log_extra_fields[l].iov_base); + c->log_extra_fields = mfree(c->log_extra_fields); + c->n_log_extra_fields = 0; +} + +/* Resets the context's TTY and then restores its ownership and access mode. */ void exec_context_revert_tty(ExecContext *c) { + int r; + + assert(c); + + /* First, reset the TTY (possibly kicking everybody else from the TTY) */ + exec_context_tty_reset(c, NULL); + + /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path + * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed + * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. 
*/ + + if (exec_context_may_touch_tty(c)) { + const char *path; + + path = exec_context_tty_path(c); + if (path) { + r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID); + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path); + } + } +} + +int exec_context_get_clean_directories( + ExecContext *c, + char **prefix, + ExecCleanMask mask, + char ***ret) { + + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(c); + assert(prefix); + assert(ret); + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **i; + + if (!FLAGS_SET(mask, 1U << t)) + continue; + + if (!prefix[t]) + continue; + + STRV_FOREACH(i, c->directories[t].paths) { + char *j; + + j = path_join(prefix[t], *i); + if (!j) + return -ENOMEM; + + r = strv_consume(&l, j); + if (r < 0) + return r; + + /* Also remove private directories unconditionally. */ + if (t != EXEC_DIRECTORY_CONFIGURATION) { + j = path_join(prefix[t], "private", *i); + if (!j) + return -ENOMEM; + + r = strv_consume(&l, j); + if (r < 0) + return r; + } + } + } + + *ret = TAKE_PTR(l); + return 0; +} + +int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) { + ExecCleanMask mask = 0; + + assert(c); + assert(ret); + + for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) + if (!strv_isempty(c->directories[t].paths)) + mask |= 1U << t; + + *ret = mask; + return 0; +} + +void exec_status_start(ExecStatus *s, pid_t pid) { + assert(s); + + *s = (ExecStatus) { + .pid = pid, + }; + + dual_timestamp_get(&s->start_timestamp); +} + +void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) { + assert(s); + + if (s->pid != pid) + *s = (ExecStatus) { + .pid = pid, + }; + + dual_timestamp_get(&s->exit_timestamp); + + s->code = code; + s->status = status; + + if (context && context->utmp_id) + (void) utmp_put_dead_process(context->utmp_id, pid, code, status); +} + +void exec_status_reset(ExecStatus *s) 
{ + assert(s); + + *s = (ExecStatus) {}; +} + +void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESTAMP_MAX]; + + assert(s); + assert(f); + + if (s->pid <= 0) + return; + + prefix = strempty(prefix); + + fprintf(f, + "%sPID: "PID_FMT"\n", + prefix, s->pid); + + if (dual_timestamp_is_set(&s->start_timestamp)) + fprintf(f, + "%sStart Timestamp: %s\n", + prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime)); + + if (dual_timestamp_is_set(&s->exit_timestamp)) + fprintf(f, + "%sExit Timestamp: %s\n" + "%sExit Code: %s\n" + "%sExit Status: %i\n", + prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime), + prefix, sigchld_code_to_string(s->code), + prefix, s->status); +} + +static char *exec_command_line(char **argv) { + size_t k; + char *n, *p, **a; + bool first = true; + + assert(argv); + + k = 1; + STRV_FOREACH(a, argv) + k += strlen(*a)+3; + + n = new(char, k); + if (!n) + return NULL; + + p = n; + STRV_FOREACH(a, argv) { + + if (!first) + *(p++) = ' '; + else + first = false; + + if (strpbrk(*a, WHITESPACE)) { + *(p++) = '\''; + p = stpcpy(p, *a); + *(p++) = '\''; + } else + p = stpcpy(p, *a); + + } + + *p = 0; + + /* FIXME: this doesn't really handle arguments that have + * spaces and ticks in them */ + + return n; +} + +static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) { + _cleanup_free_ char *cmd = NULL; + const char *prefix2; + + assert(c); + assert(f); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + cmd = exec_command_line(c->argv); + fprintf(f, + "%sCommand Line: %s\n", + prefix, cmd ? 
cmd : strerror_safe(ENOMEM)); + + exec_status_dump(&c->exec_status, f, prefix2); +} + +void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) { + assert(f); + + prefix = strempty(prefix); + + LIST_FOREACH(command, c, c) + exec_command_dump(c, f, prefix); +} + +void exec_command_append_list(ExecCommand **l, ExecCommand *e) { + ExecCommand *end; + + assert(l); + assert(e); + + if (*l) { + /* It's kind of important, that we keep the order here */ + LIST_FIND_TAIL(command, *l, end); + LIST_INSERT_AFTER(command, *l, end, e); + } else + *l = e; +} + +int exec_command_set(ExecCommand *c, const char *path, ...) { + va_list ap; + char **l, *p; + + assert(c); + assert(path); + + va_start(ap, path); + l = strv_new_ap(path, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + + p = strdup(path); + if (!p) { + strv_free(l); + return -ENOMEM; + } + + free_and_replace(c->path, p); + + return strv_free_and_replace(c->argv, l); +} + +int exec_command_append(ExecCommand *c, const char *path, ...) { + _cleanup_strv_free_ char **l = NULL; + va_list ap; + int r; + + assert(c); + assert(path); + + va_start(ap, path); + l = strv_new_ap(path, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + + r = strv_extend_strv(&c->argv, l, false); + if (r < 0) + return r; + + return 0; +} + +static void *remove_tmpdir_thread(void *p) { + _cleanup_free_ char *path = p; + + (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL); + return NULL; +} + +static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) { + int r; + + if (!rt) + return NULL; + + if (rt->manager) + (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id); + + /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. 
*/ + + if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) { + log_debug("Spawning thread to nuke %s", rt->tmp_dir); + + r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir); + if (r < 0) + log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir); + else + rt->tmp_dir = NULL; /* remove_tmpdir_thread() frees the path itself, so drop our pointer */ + } + + if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) { + log_debug("Spawning thread to nuke %s", rt->var_tmp_dir); + + r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir); + if (r < 0) + log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir); + else + rt->var_tmp_dir = NULL; /* same ownership hand-over as for tmp_dir above */ + } + + /* Whatever was not handed to a removal thread is released here */ rt->id = mfree(rt->id); + rt->tmp_dir = mfree(rt->tmp_dir); + rt->var_tmp_dir = mfree(rt->var_tmp_dir); + safe_close_pair(rt->netns_storage_socket); + return mfree(rt); +} + +/* Cleanup helper for _cleanup_(): frees *rt without removing its temporary directories (destroy is false). */ static void exec_runtime_freep(ExecRuntime **rt) { + (void) exec_runtime_free(*rt, false); +} + +/* Allocates a new ExecRuntime holding a private copy of id, with both netns_storage_socket fds set to -1 and every other field zeroed by the compound literal. Returns 0 and stores the object in *ret on success, -ENOMEM if an allocation fails. NOTE(review): id goes straight into strdup() without an assert, so callers must not pass NULL. */ static int exec_runtime_allocate(ExecRuntime **ret, const char *id) { + _cleanup_free_ char *id_copy = NULL; + ExecRuntime *n; + + assert(ret); + + id_copy = strdup(id); + if (!id_copy) + return -ENOMEM; + + n = new(ExecRuntime, 1); + if (!n) + return -ENOMEM; + + *n = (ExecRuntime) { + .id = TAKE_PTR(id_copy), + .netns_storage_socket = { -1, -1 }, + }; + + *ret = n; + return 0; +} + +/* Creates an ExecRuntime for id, registers it in m->exec_runtime_by_id and moves the passed directory paths and, if netns_storage_socket is non-NULL, the socket pair into it. ret is optional. Returns 0 on success or a negative errno, in which case the passed resources stay with the caller. */ static int exec_runtime_add( + Manager *m, + const char *id, + char **tmp_dir, + char **var_tmp_dir, + int netns_storage_socket[2], + ExecRuntime **ret) { + + _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL; + int r; + + assert(m); + assert(id); + + /* tmp_dir, var_tmp_dir, netns_storage_socket fds are donated on success */ + + r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops); + if (r < 0) + return r; + + r = exec_runtime_allocate(&rt, id); + if (r < 0) + return r; + + r = hashmap_put(m->exec_runtime_by_id, rt->id, rt); + if (r < 0) + return r; + + assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together. NOTE(review): rt was allocated just above, so both fields are still NULL here and this check cannot fail; it does not look at the incoming *tmp_dir and *var_tmp_dir. */ + rt->tmp_dir = TAKE_PTR(*tmp_dir); + 
rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir); + + if (netns_storage_socket) { + rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]); + rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]); + } + + rt->manager = m; + + if (ret) + *ret = rt; + /* do not remove created ExecRuntime object when the operation succeeds. */ + TAKE_PTR(rt); + return 0; +} + +static int exec_runtime_make( + Manager *m, + const ExecContext *c, + const char *id, + ExecRuntime **ret) { + + _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL; + _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }; + int r; + + assert(m); + assert(c); + assert(id); + + /* It is not necessary to create ExecRuntime object. */ + if (!c->private_network && !c->private_tmp && !c->network_namespace_path) { + *ret = NULL; + return 0; + } + + if (c->private_tmp && + !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") && + (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") || + prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) { + r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir); + if (r < 0) + return r; + } + + if (c->private_network || c->network_namespace_path) { + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0) + return -errno; + } + + r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ret); + if (r < 0) + return r; + + return 1; +} + +int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) { + ExecRuntime *rt; + int r; + + assert(m); + assert(id); + assert(ret); + + rt = hashmap_get(m->exec_runtime_by_id, id); + if (rt) + /* We already have a ExecRuntime object, let's increase the ref count and reuse it */ + goto ref; + + if (!create) { + *ret = NULL; + return 0; + } + + /* If not found, then create a new object. 
*/ + r = exec_runtime_make(m, c, id, &rt); + if (r < 0) + return r; + if (r == 0) { + /* When r == 0, it is not necessary to create ExecRuntime object. */ + *ret = NULL; + return 0; + } + +ref: + /* increment reference counter. */ + rt->n_ref++; + *ret = rt; + return 1; +} + +ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) { + if (!rt) + return NULL; + + assert(rt->n_ref > 0); + + rt->n_ref--; + if (rt->n_ref > 0) + return NULL; + + return exec_runtime_free(rt, destroy); +} + +int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) { + ExecRuntime *rt; + + assert(m); + assert(f); + assert(fds); + + HASHMAP_FOREACH(rt, m->exec_runtime_by_id) { + fprintf(f, "exec-runtime=%s", rt->id); + + if (rt->tmp_dir) + fprintf(f, " tmp-dir=%s", rt->tmp_dir); + + if (rt->var_tmp_dir) + fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir); + + if (rt->netns_storage_socket[0] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->netns_storage_socket[0]); + if (copy < 0) + return copy; + + fprintf(f, " netns-socket-0=%i", copy); + } + + if (rt->netns_storage_socket[1] >= 0) { + int copy; + + copy = fdset_put_dup(fds, rt->netns_storage_socket[1]); + if (copy < 0) + return copy; + + fprintf(f, " netns-socket-1=%i", copy); + } + + fputc('\n', f); + } + + return 0; +} + +int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) { + _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL; + ExecRuntime *rt; + int r; + + /* This is for the migration from old (v237 or earlier) deserialization text. + * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=. + * Even if the ExecRuntime object originally created by the other unit, we cannot judge + * so or not from the serialized text, then we always creates a new object owned by this. */ + + assert(u); + assert(key); + assert(value); + + /* Manager manages ExecRuntime objects by the unit id. 
+ * So, we omit the serialized text when the unit does not have id (yet?)... */ + if (isempty(u->id)) { + log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter."); + return 0; + } + + r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m"); + return 0; + } + + rt = hashmap_get(u->manager->exec_runtime_by_id, u->id); + if (!rt) { + r = exec_runtime_allocate(&rt_create, u->id); + if (r < 0) + return log_oom(); + + rt = rt_create; + } + + if (streq(key, "tmp-dir")) { + char *copy; + + copy = strdup(value); + if (!copy) + return log_oom(); + + free_and_replace(rt->tmp_dir, copy); + + } else if (streq(key, "var-tmp-dir")) { + char *copy; + + copy = strdup(value); + if (!copy) + return log_oom(); + + free_and_replace(rt->var_tmp_dir, copy); + + } else if (streq(key, "netns-socket-0")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Failed to parse netns socket value: %s", value); + return 0; + } + + safe_close(rt->netns_storage_socket[0]); + rt->netns_storage_socket[0] = fdset_remove(fds, fd); + + } else if (streq(key, "netns-socket-1")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Failed to parse netns socket value: %s", value); + return 0; + } + + safe_close(rt->netns_storage_socket[1]); + rt->netns_storage_socket[1] = fdset_remove(fds, fd); + } else + return 0; + + /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. 
*/ + if (rt_create) { + r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m"); + return 0; + } + + rt_create->manager = u->manager; + + /* Avoid cleanup */ + TAKE_PTR(rt_create); + } + + return 1; +} + +/* Parses one "exec-runtime=" value in the format written by exec_runtime_serialize(): the id, optionally followed, + * in this order and separated by single spaces, by tmp-dir=, var-tmp-dir=, netns-socket-0= and netns-socket-1=. + * The referenced fds are taken out of fds and the resulting object is registered with the manager. + * Returns 0 on success and a negative errno on failure; on failure no fd taken out of fds is left open. */ +int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) { + _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL; + char *id = NULL; + int r, fdpair[] = {-1, -1}; + const char *p, *v = value; + size_t n; + + assert(m); + assert(value); + assert(fds); + + n = strcspn(v, " "); + id = strndupa(v, n); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + + v = startswith(p, "tmp-dir="); + if (v) { + n = strcspn(v, " "); + tmp_dir = strndup(v, n); + if (!tmp_dir) + return log_oom(); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "var-tmp-dir="); + if (v) { + n = strcspn(v, " "); + var_tmp_dir = strndup(v, n); + if (!var_tmp_dir) + return log_oom(); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "netns-socket-0="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa(v, n); + + /* Until fdset_remove() below succeeds, fdpair[0] is only a parsed number, not an fd we own, + * hence the early returns here must not close it. */ + r = safe_atoi(buf, &fdpair[0]); + if (r < 0) + return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf); + if (!fdset_contains(fds, fdpair[0])) + return log_debug_errno(SYNTHETIC_ERRNO(EBADF), + "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", fdpair[0]); + fdpair[0] = fdset_remove(fds, fdpair[0]); + if (v[n] != ' ') + goto finalize; + p = v + n + 1; + } + + v = startswith(p, "netns-socket-1="); + if (v) { + char *buf; + + n = strcspn(v, " "); + buf = strndupa(v, n); + r = safe_atoi(buf, &fdpair[1]); + if (r < 0) { + /* fdpair[0] may already have been taken out of the fdset above: do not leak it */ + safe_close(fdpair[0]); + return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf); + } + if (!fdset_contains(fds, fdpair[1])) { + /* Only fdpair[0] is ours at this point, fdpair[1] is still just a parsed number */ + safe_close(fdpair[0]); + return log_debug_errno(SYNTHETIC_ERRNO(EBADF), + 
"exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", fdpair[1]); + } + fdpair[1] = fdset_remove(fds, fdpair[1]); + } + +finalize: + r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, fdpair, NULL); + if (r < 0) { + /* The fds are donated to the new object on success only, and they are no longer part of + * the fdset, so close them here. */ + safe_close_pair(fdpair); + return log_debug_errno(r, "Failed to add exec-runtime: %m"); + } + return 0; +} + +void exec_runtime_vacuum(Manager *m) { + ExecRuntime *rt; + + assert(m); + + /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */ + + HASHMAP_FOREACH(rt, m->exec_runtime_by_id) { + if (rt->n_ref > 0) + continue; + + (void) exec_runtime_free(rt, false); + } +} + +/* Releases the environment, fd name list, fd array and exec_fd held by p. A NULL p is a no-op. */ +void exec_params_clear(ExecParameters *p) { + if (!p) + return; + + p->environment = strv_free(p->environment); + p->fd_names = strv_free(p->fd_names); + p->fds = mfree(p->fds); + p->exec_fd = safe_close(p->exec_fd); +} + +/* Frees sc together with its id and data buffers. Accepts NULL and returns whatever mfree() returns. */ +ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) { + if (!sc) + return NULL; + + free(sc->id); + free(sc->data); + return mfree(sc); +} + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free); + +static const char* const exec_input_table[_EXEC_INPUT_MAX] = { + [EXEC_INPUT_NULL] = "null", + [EXEC_INPUT_TTY] = "tty", + [EXEC_INPUT_TTY_FORCE] = "tty-force", + [EXEC_INPUT_TTY_FAIL] = "tty-fail", + [EXEC_INPUT_SOCKET] = "socket", + [EXEC_INPUT_NAMED_FD] = "fd", + [EXEC_INPUT_DATA] = "data", + [EXEC_INPUT_FILE] = "file", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput); + +static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = { + [EXEC_OUTPUT_INHERIT] = "inherit", + [EXEC_OUTPUT_NULL] = "null", + [EXEC_OUTPUT_TTY] = "tty", + [EXEC_OUTPUT_KMSG] = "kmsg", + [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console", + [EXEC_OUTPUT_JOURNAL] = "journal", + [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console", + [EXEC_OUTPUT_SOCKET] = "socket", + [EXEC_OUTPUT_NAMED_FD] = "fd", + [EXEC_OUTPUT_FILE] = "file", + 
[EXEC_OUTPUT_FILE_APPEND] = "append", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput); + +static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = { + [EXEC_UTMP_INIT] = "init", + [EXEC_UTMP_LOGIN] = "login", + [EXEC_UTMP_USER] = "user", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode); + +static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = { + [EXEC_PRESERVE_NO] = "no", + [EXEC_PRESERVE_YES] = "yes", + [EXEC_PRESERVE_RESTART] = "restart", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES); + +/* This table maps ExecDirectoryType to the setting it is configured with in the unit */ +static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory", + [EXEC_DIRECTORY_STATE] = "StateDirectory", + [EXEC_DIRECTORY_CACHE] = "CacheDirectory", + [EXEC_DIRECTORY_LOGS] = "LogsDirectory", + [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType); + +/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This + * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit + * directories, specifically .timer units with their timestamp touch file. */ +static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "runtime", + [EXEC_DIRECTORY_STATE] = "state", + [EXEC_DIRECTORY_CACHE] = "cache", + [EXEC_DIRECTORY_LOGS] = "logs", + [EXEC_DIRECTORY_CONFIGURATION] = "configuration", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType); + +/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to + * the service payload in. 
*/ +static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY", + [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY", + [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY", + [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY", + [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType); + +static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = { + [EXEC_KEYRING_INHERIT] = "inherit", + [EXEC_KEYRING_PRIVATE] = "private", + [EXEC_KEYRING_SHARED] = "shared", +}; + +DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode); diff --git a/src/core/execute.h b/src/core/execute.h new file mode 100644 index 0000000..33d7e16 --- /dev/null +++ b/src/core/execute.h @@ -0,0 +1,472 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct ExecStatus ExecStatus; +typedef struct ExecCommand ExecCommand; +typedef struct ExecContext ExecContext; +typedef struct ExecRuntime ExecRuntime; +typedef struct ExecParameters ExecParameters; +typedef struct Manager Manager; + +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <sys/capability.h> + +#include "cgroup-util.h" +#include "coredump-util.h" +#include "cpu-set-util.h" +#include "exec-util.h" +#include "fdset.h" +#include "list.h" +#include "missing_resource.h" +#include "namespace.h" +#include "nsflags.h" +#include "numa-util.h" +#include "path-util.h" +#include "time-util.h" + +#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U) + +typedef enum ExecUtmpMode { + EXEC_UTMP_INIT, + EXEC_UTMP_LOGIN, + EXEC_UTMP_USER, + _EXEC_UTMP_MODE_MAX, + _EXEC_UTMP_MODE_INVALID = -1 +} ExecUtmpMode; + +typedef enum ExecInput { + EXEC_INPUT_NULL, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL, + EXEC_INPUT_SOCKET, + EXEC_INPUT_NAMED_FD, + EXEC_INPUT_DATA, + EXEC_INPUT_FILE, + _EXEC_INPUT_MAX, + _EXEC_INPUT_INVALID = -1 +} 
ExecInput; + +typedef enum ExecOutput { + EXEC_OUTPUT_INHERIT, + EXEC_OUTPUT_NULL, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_KMSG, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_SOCKET, + EXEC_OUTPUT_NAMED_FD, + EXEC_OUTPUT_FILE, + EXEC_OUTPUT_FILE_APPEND, + _EXEC_OUTPUT_MAX, + _EXEC_OUTPUT_INVALID = -1 +} ExecOutput; + +typedef enum ExecPreserveMode { + EXEC_PRESERVE_NO, + EXEC_PRESERVE_YES, + EXEC_PRESERVE_RESTART, + _EXEC_PRESERVE_MODE_MAX, + _EXEC_PRESERVE_MODE_INVALID = -1 +} ExecPreserveMode; + +typedef enum ExecKeyringMode { + EXEC_KEYRING_INHERIT, + EXEC_KEYRING_PRIVATE, + EXEC_KEYRING_SHARED, + _EXEC_KEYRING_MODE_MAX, + _EXEC_KEYRING_MODE_INVALID = -1, +} ExecKeyringMode; + +/* Contains start and exit information about an executed command. */ +struct ExecStatus { + dual_timestamp start_timestamp; + dual_timestamp exit_timestamp; + pid_t pid; + int code; /* as in siginfo_t::si_code */ + int status; /* as in siginfo_t::si_status */ +}; + +/* Stores information about commands we execute. Covers both configuration settings as well as runtime data. */ +struct ExecCommand { + char *path; + char **argv; + ExecStatus exec_status; + ExecCommandFlags flags; + LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */ +}; + +/* Encapsulates certain aspects of the runtime environment that is to be shared between multiple otherwise separate + * invocations of commands. Specifically, this allows sharing of /tmp and /var/tmp data as well as network namespaces + * between invocations of commands. This is a reference counted object, with one reference taken by each currently + * active command invocation that wants to share this runtime. */ +struct ExecRuntime { + unsigned n_ref; + + Manager *manager; + + char *id; /* Unit id of the owner */ + + char *tmp_dir; + char *var_tmp_dir; + + /* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network + * namespace. 
*/ + int netns_storage_socket[2]; +}; + +typedef enum ExecDirectoryType { + EXEC_DIRECTORY_RUNTIME = 0, + EXEC_DIRECTORY_STATE, + EXEC_DIRECTORY_CACHE, + EXEC_DIRECTORY_LOGS, + EXEC_DIRECTORY_CONFIGURATION, + _EXEC_DIRECTORY_TYPE_MAX, + _EXEC_DIRECTORY_TYPE_INVALID = -1, +} ExecDirectoryType; + +typedef struct ExecDirectory { + char **paths; + mode_t mode; +} ExecDirectory; + +typedef enum ExecCleanMask { + /* In case you wonder why the bitmask below doesn't use "directory" in its name: we want to keep this + * generic so that .timer timestamp files can nicely be covered by this too, and similar. */ + EXEC_CLEAN_RUNTIME = 1U << EXEC_DIRECTORY_RUNTIME, + EXEC_CLEAN_STATE = 1U << EXEC_DIRECTORY_STATE, + EXEC_CLEAN_CACHE = 1U << EXEC_DIRECTORY_CACHE, + EXEC_CLEAN_LOGS = 1U << EXEC_DIRECTORY_LOGS, + EXEC_CLEAN_CONFIGURATION = 1U << EXEC_DIRECTORY_CONFIGURATION, + EXEC_CLEAN_NONE = 0, + EXEC_CLEAN_ALL = (1U << _EXEC_DIRECTORY_TYPE_MAX) - 1, + _EXEC_CLEAN_MASK_INVALID = -1, +} ExecCleanMask; + +/* A credential configured with SetCredential= */ +typedef struct ExecSetCredential { + char *id; + void *data; + size_t size; +} ExecSetCredential; + +/* Encodes configuration parameters applied to invoked commands. Does not carry runtime data, but only configuration + * changes sourced from unit files and suchlike. ExecContext objects are usually embedded into Unit objects, and do not + * change after being loaded. 
*/ +struct ExecContext { + char **environment; + char **environment_files; + char **pass_environment; + char **unset_environment; + + struct rlimit *rlimit[_RLIMIT_MAX]; + char *working_directory, *root_directory, *root_image, *root_verity, *root_hash_path, *root_hash_sig_path; + void *root_hash, *root_hash_sig; + size_t root_hash_size, root_hash_sig_size; + LIST_HEAD(MountOptions, root_image_options); + bool working_directory_missing_ok:1; + bool working_directory_home:1; + + bool oom_score_adjust_set:1; + bool coredump_filter_set:1; + bool nice_set:1; + bool ioprio_set:1; + bool cpu_sched_set:1; + bool mount_apivfs_set:1; + + /* This is not exposed to the user but available internally. We need it to make sure that whenever we + * spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects + * that it belongs to us and we don't enter a trigger loop. */ + bool same_pgrp; + + bool cpu_sched_reset_on_fork; + bool non_blocking; + + mode_t umask; + int oom_score_adjust; + int nice; + int ioprio; + int cpu_sched_policy; + int cpu_sched_priority; + uint64_t coredump_filter; + + CPUSet cpu_set; + NUMAPolicy numa_policy; + bool cpu_affinity_from_numa; + + ExecInput std_input; + ExecOutput std_output; + ExecOutput std_error; + bool stdio_as_fds; + char *stdio_fdname[3]; + char *stdio_file[3]; + + void *stdin_data; + size_t stdin_data_size; + + nsec_t timer_slack_nsec; + + char *tty_path; + + bool tty_reset; + bool tty_vhangup; + bool tty_vt_disallocate; + + bool ignore_sigpipe; + + ExecKeyringMode keyring_mode; + + /* Since resolving these names might involve socket + * connections and we don't want to deadlock ourselves these + * names are resolved on execution only and in the child + * process. 
*/ + char *user; + char *group; + char **supplementary_groups; + + char *pam_name; + + char *utmp_id; + ExecUtmpMode utmp_mode; + + bool no_new_privileges; + + bool selinux_context_ignore; + bool apparmor_profile_ignore; + bool smack_process_label_ignore; + + char *selinux_context; + char *apparmor_profile; + char *smack_process_label; + + char **read_write_paths, **read_only_paths, **inaccessible_paths; + unsigned long mount_flags; + BindMount *bind_mounts; + size_t n_bind_mounts; + TemporaryFileSystem *temporary_filesystems; + size_t n_temporary_filesystems; + MountImage *mount_images; + size_t n_mount_images; + + uint64_t capability_bounding_set; + uint64_t capability_ambient_set; + int secure_bits; + + int syslog_priority; + bool syslog_level_prefix; + char *syslog_identifier; + + struct iovec* log_extra_fields; + size_t n_log_extra_fields; + + usec_t log_ratelimit_interval_usec; + unsigned log_ratelimit_burst; + + int log_level_max; + + char *log_namespace; + + ProtectProc protect_proc; /* hidepid= */ + ProcSubset proc_subset; /* subset= */ + + bool private_tmp; + bool private_network; + bool private_devices; + bool private_users; + bool private_mounts; + bool protect_kernel_tunables; + bool protect_kernel_modules; + bool protect_kernel_logs; + bool protect_clock; + bool protect_control_groups; + ProtectSystem protect_system; + ProtectHome protect_home; + bool protect_hostname; + bool mount_apivfs; + + bool dynamic_user; + bool remove_ipc; + + bool memory_deny_write_execute; + bool restrict_realtime; + bool restrict_suid_sgid; + + bool lock_personality; + unsigned long personality; + + unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */ + + Hashmap *syscall_filter; + Set *syscall_archs; + int syscall_errno; + bool syscall_allow_list:1; + + Hashmap *syscall_log; + bool syscall_log_allow_list:1; /* Log listed system calls */ + + bool address_families_allow_list:1; + Set *address_families; + + char 
*network_namespace_path; + + ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX]; + ExecPreserveMode runtime_directory_preserve_mode; + usec_t timeout_clean_usec; + + Hashmap *set_credentials; /* output id → ExecSetCredential */ + char **load_credentials; /* pairs of output id, path/input id */ +}; + +static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { + assert(c); + + return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL; +} + +static inline bool exec_context_with_rootfs(const ExecContext *c) { + assert(c); + + /* Checks if RootDirectory= or RootImage= are used */ + + return !empty_or_root(c->root_directory) || c->root_image; +} + +typedef enum ExecFlags { + EXEC_APPLY_SANDBOXING = 1 << 0, + EXEC_APPLY_CHROOT = 1 << 1, + EXEC_APPLY_TTY_STDIN = 1 << 2, + EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */ + EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */ + EXEC_NSS_BYPASS_BUS = 1 << 5, /* Set the SYSTEMD_NSS_BYPASS_BUS environment variable, to disable nss-systemd for dbus */ + EXEC_CGROUP_DELEGATE = 1 << 6, + EXEC_IS_CONTROL = 1 << 7, + EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */ + EXEC_WRITE_CREDENTIALS = 1 << 9, /* Set up the credential store logic */ + + /* The following are not used by execute.c, but by consumers internally */ + EXEC_PASS_FDS = 1 << 10, + EXEC_SETENV_RESULT = 1 << 11, + EXEC_SET_WATCHDOG = 1 << 12, +} ExecFlags; + +/* Parameters for a specific invocation of a command. This structure is put together right before a command is + * executed. 
*/ +struct ExecParameters { + char **environment; + + int *fds; + char **fd_names; + size_t n_socket_fds; + size_t n_storage_fds; + + ExecFlags flags; + bool selinux_context_net:1; + + CGroupMask cgroup_supported; + const char *cgroup_path; + + char **prefix; + const char *received_credentials; + + const char *confirm_spawn; + + usec_t watchdog_usec; + + int *idle_pipe; + + int stdin_fd; + int stdout_fd; + int stderr_fd; + + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + int exec_fd; +}; + +#include "unit.h" +#include "dynamic-user.h" + +int exec_spawn(Unit *unit, + ExecCommand *command, + const ExecContext *context, + const ExecParameters *exec_params, + ExecRuntime *runtime, + DynamicCreds *dynamic_creds, + pid_t *ret); + +void exec_command_done_array(ExecCommand *c, size_t n); +ExecCommand* exec_command_free_list(ExecCommand *c); +void exec_command_free_array(ExecCommand **c, size_t n); +void exec_command_reset_status_array(ExecCommand *c, size_t n); +void exec_command_reset_status_list_array(ExecCommand **c, size_t n); +void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix); +void exec_command_append_list(ExecCommand **l, ExecCommand *e); +int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_; +int exec_command_append(ExecCommand *c, const char *path, ...) 
_sentinel_; + +void exec_context_init(ExecContext *c); +void exec_context_done(ExecContext *c); +void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix); + +int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root); +int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_root, const char *unit); + +const char* exec_context_fdname(const ExecContext *c, int fd_index); + +bool exec_context_may_touch_console(const ExecContext *c); +bool exec_context_maintains_privileges(const ExecContext *c); + +int exec_context_get_effective_ioprio(const ExecContext *c); +bool exec_context_get_effective_mount_apivfs(const ExecContext *c); + +void exec_context_free_log_extra_fields(ExecContext *c); + +void exec_context_revert_tty(ExecContext *c); + +int exec_context_get_clean_directories(ExecContext *c, char **prefix, ExecCleanMask mask, char ***ret); +int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret); + +void exec_status_start(ExecStatus *s, pid_t pid); +void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status); +void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix); +void exec_status_reset(ExecStatus *s); + +int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecRuntime **ret); +ExecRuntime *exec_runtime_unref(ExecRuntime *r, bool destroy); + +int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds); +int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds); +int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds); +void exec_runtime_vacuum(Manager *m); + +void exec_params_clear(ExecParameters *p); + +bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c); + +ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSetCredential*, exec_set_credential_free); + +extern const 
struct hash_ops exec_set_credential_hash_ops; + +const char* exec_output_to_string(ExecOutput i) _const_; +ExecOutput exec_output_from_string(const char *s) _pure_; + +const char* exec_input_to_string(ExecInput i) _const_; +ExecInput exec_input_from_string(const char *s) _pure_; + +const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_; +ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_; + +const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_; +ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_; + +const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_; +ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_; + +const char* exec_directory_type_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_; + +const char* exec_resource_type_to_string(ExecDirectoryType i) _const_; +ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_; diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c new file mode 100644 index 0000000..9173951 --- /dev/null +++ b/src/core/generator-setup.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "generator-setup.h" +#include "macro.h" +#include "mkdir.h" +#include "rm-rf.h" + +int lookup_paths_mkdir_generator(LookupPaths *p) { + int r, q; + + assert(p); + + if (!p->generator || !p->generator_early || !p->generator_late) + return -EINVAL; + + r = mkdir_p_label(p->generator, 0755); + + q = mkdir_p_label(p->generator_early, 0755); + if (q < 0 && r >= 0) + r = q; + + q = mkdir_p_label(p->generator_late, 0755); + if (q < 0 && r >= 0) + r = q; + + return r; +} + +void lookup_paths_trim_generator(LookupPaths *p) { + assert(p); + + /* Trim empty dirs */ + + if (p->generator) + (void) rmdir(p->generator); + if (p->generator_early) + (void) rmdir(p->generator_early); + if (p->generator_late) + (void) rmdir(p->generator_late); +} + +void 
lookup_paths_flush_generator(LookupPaths *p) { + assert(p); + + /* Flush the generated unit files in full */ + + if (p->generator) + (void) rm_rf(p->generator, REMOVE_ROOT|REMOVE_PHYSICAL); + if (p->generator_early) + (void) rm_rf(p->generator_early, REMOVE_ROOT|REMOVE_PHYSICAL); + if (p->generator_late) + (void) rm_rf(p->generator_late, REMOVE_ROOT|REMOVE_PHYSICAL); + + if (p->temporary_dir) + (void) rm_rf(p->temporary_dir, REMOVE_ROOT|REMOVE_PHYSICAL); +} diff --git a/src/core/generator-setup.h b/src/core/generator-setup.h new file mode 100644 index 0000000..1cc816b --- /dev/null +++ b/src/core/generator-setup.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "path-lookup.h" + +int lookup_paths_mkdir_generator(LookupPaths *p); +void lookup_paths_trim_generator(LookupPaths *p); +void lookup_paths_flush_generator(LookupPaths *p); diff --git a/src/core/hostname-setup.c b/src/core/hostname-setup.c new file mode 100644 index 0000000..867ea19 --- /dev/null +++ b/src/core/hostname-setup.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> + +#include "alloc-util.h" +#include "fileio.h" +#include "hostname-setup.h" +#include "hostname-util.h" +#include "log.h" +#include "macro.h" +#include "proc-cmdline.h" +#include "string-util.h" +#include "util.h" + +int hostname_setup(void) { + _cleanup_free_ char *b = NULL; + const char *hn = NULL; + bool enoent = false; + int r; + + r = proc_cmdline_get_key("systemd.hostname", 0, &b); + if (r < 0) + log_warning_errno(r, "Failed to retrieve system hostname from kernel command line, ignoring: %m"); + else if (r > 0) { + if (hostname_is_valid(b, true)) + hn = b; + else { + log_warning("Hostname specified on kernel command line is invalid, ignoring: %s", b); + b = mfree(b); + } + } + + if (!hn) { + r = read_etc_hostname(NULL, &b); + if (r < 0) { + if (r == -ENOENT) + enoent = true; + else + log_warning_errno(r, 
"Failed to read configured hostname: %m"); + } else + hn = b; + } + + if (isempty(hn)) { + /* Don't override the hostname if it is already set and not explicitly configured */ + if (hostname_is_set()) + return 0; + + if (enoent) + log_info("No hostname configured."); + + hn = FALLBACK_HOSTNAME; + } + + r = sethostname_idempotent(hn); + if (r < 0) + return log_warning_errno(r, "Failed to set hostname to <%s>: %m", hn); + + log_info("Set hostname to <%s>.", hn); + return 0; +} diff --git a/src/core/hostname-setup.h b/src/core/hostname-setup.h new file mode 100644 index 0000000..7fd0a02 --- /dev/null +++ b/src/core/hostname-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int hostname_setup(void); diff --git a/src/core/ima-setup.c b/src/core/ima-setup.c new file mode 100644 index 0000000..7f517a0 --- /dev/null +++ b/src/core/ima-setup.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy + TORSEC group — http://security.polito.it +***/ + +#include <errno.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "ima-setup.h" +#include "log.h" + +#define IMA_SECFS_DIR "/sys/kernel/security/ima" +#define IMA_SECFS_POLICY IMA_SECFS_DIR "/policy" +#define IMA_POLICY_PATH "/etc/ima/ima-policy" + +int ima_setup(void) { +#if ENABLE_IMA + _cleanup_fclose_ FILE *input = NULL; + _cleanup_close_ int imafd = -1; + unsigned lineno = 0; + int r; + + if (access(IMA_SECFS_DIR, F_OK) < 0) { + log_debug_errno(errno, "IMA support is disabled in the kernel, ignoring: %m"); + return 0; + } + + if (access(IMA_SECFS_POLICY, W_OK) < 0) { + log_warning_errno(errno, "Another IMA custom policy has already been loaded, ignoring: %m"); + return 0; + } + + if (access(IMA_POLICY_PATH, F_OK) < 0) { + log_debug_errno(errno, "No IMA custom policy file 
"IMA_POLICY_PATH", ignoring: %m"); + return 0; + } + + imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC); + if (imafd < 0) { + log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m"); + return 0; + } + + /* attempt to write the name of the policy file into sysfs file */ + if (write(imafd, IMA_POLICY_PATH, STRLEN(IMA_POLICY_PATH)) > 0) + goto done; + + /* fall back to copying the policy line-by-line */ + input = fopen(IMA_POLICY_PATH, "re"); + if (!input) { + log_warning_errno(errno, "Failed to open the IMA custom policy file "IMA_POLICY_PATH", ignoring: %m"); + return 0; + } + + safe_close(imafd); + + imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC); + if (imafd < 0) { + log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m"); + return 0; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + size_t len; + + r = read_line(input, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read the IMA custom policy file "IMA_POLICY_PATH": %m"); + if (r == 0) + break; + + len = strlen(line); + lineno++; + + if (len > 0 && write(imafd, line, len) < 0) + return log_error_errno(errno, "Failed to load the IMA custom policy file "IMA_POLICY_PATH"%u: %m", + lineno); + } + +done: + log_info("Successfully loaded the IMA custom policy "IMA_POLICY_PATH"."); +#endif /* ENABLE_IMA */ + return 0; +} diff --git a/src/core/ima-setup.h b/src/core/ima-setup.h new file mode 100644 index 0000000..f964c7b --- /dev/null +++ b/src/core/ima-setup.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy + TORSEC group — http://security.polito.it +***/ + +int ima_setup(void); diff --git a/src/core/ip-address-access.c b/src/core/ip-address-access.c new file mode 100644 index 0000000..a11251e --- /dev/null +++ b/src/core/ip-address-access.c @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include <stdio.h> +#include <stdlib.h> + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "extract-word.h" +#include "hostname-util.h" +#include "ip-address-access.h" +#include "parse-util.h" +#include "string-util.h" + +int config_parse_ip_address_access( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + IPAddressAccessItem **list = data; + const char *p; + int r; + + assert(list); + + if (isempty(rvalue)) { + *list = ip_address_access_free_all(*list); + return 0; + } + + p = rvalue; + + for (;;) { + _cleanup_free_ IPAddressAccessItem *a = NULL; + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + if (streq(word, "any")) { + /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */ + + a->family = AF_INET; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + + } else if (is_localhost(word)) { + /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32(0x7f000000); + a->prefixlen = 8; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) IN6ADDR_LOOPBACK_INIT; + a->prefixlen = 128; + + } else if (streq(word, "link-local")) { + + /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32((UINT32_C(169) << 24 | UINT32_C(254) << 16)); + a->prefixlen = 16; + LIST_APPEND(items, *list, a); + + a = 
new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) { + .s6_addr32[0] = htobe32(0xfe800000) + }; + a->prefixlen = 64; + + } else if (streq(word, "multicast")) { + + /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32((UINT32_C(224) << 24)); + a->prefixlen = 4; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) { + .s6_addr32[0] = htobe32(0xff000000) + }; + a->prefixlen = 8; + + } else { + r = in_addr_prefix_from_string_auto(word, &a->family, &a->address, &a->prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Address prefix is invalid, ignoring assignment: %s", word); + return 0; + } + } + + LIST_APPEND(items, *list, a); + a = NULL; + } + + *list = ip_address_access_reduce(*list); + + return 0; +} + +IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first) { + IPAddressAccessItem *next, *p = first; + + while (p) { + next = p->items_next; + free(p); + + p = next; + } + + return NULL; +} + +IPAddressAccessItem* ip_address_access_reduce(IPAddressAccessItem *first) { + IPAddressAccessItem *a, *b, *tmp; + int r; + + /* Drops all entries from the list that are covered by another entry in full, thus removing all redundant + * entries. 
*/ + + LIST_FOREACH_SAFE(items, a, tmp, first) { + + /* Drop irrelevant bits */ + (void) in_addr_mask(a->family, &a->address, a->prefixlen); + + LIST_FOREACH(items, b, first) { + + if (a == b) + continue; + + if (a->family != b->family) + continue; + + if (b->prefixlen > a->prefixlen) + continue; + + r = in_addr_prefix_covers(b->family, + &b->address, + b->prefixlen, + &a->address); + if (r > 0) { + /* b covers a fully, then let's drop a */ + LIST_REMOVE(items, first, a); + free(a); + break; + } + } + } + + return first; +} + +bool ip_address_access_item_is_any(IPAddressAccessItem *first) { + /* Check for exactly two entries */ + if (!first || !first->items_next || first->items_next->items_next) + return false; + + /* Check both entries cover the full range */ + if (first->prefixlen != 0 || first->items_next->prefixlen != 0) + return false; + + /* Check that one of them is the IPv4 and the other IPv6 */ + if (!((first->family == AF_INET && first->items_next->family == AF_INET6) || + (first->family == AF_INET6 && first->items_next->family == AF_INET))) + return false; + + /* No need to check the actual addresses, they don't matter if the prefix is zero */ + return true; +} diff --git a/src/core/ip-address-access.h b/src/core/ip-address-access.h new file mode 100644 index 0000000..71b5459 --- /dev/null +++ b/src/core/ip-address-access.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "list.h" + +typedef struct IPAddressAccessItem IPAddressAccessItem; + +struct IPAddressAccessItem { + int family; + unsigned char prefixlen; + union in_addr_union address; + LIST_FIELDS(IPAddressAccessItem, items); +}; + +CONFIG_PARSER_PROTOTYPE(config_parse_ip_address_access); + +IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first); + +IPAddressAccessItem* ip_address_access_reduce(IPAddressAccessItem *first); + +/* Returns true if a list consists of only the two items 
necessary for "any" + * (0.0.0.0/0 and ::/0). */ +bool ip_address_access_item_is_any(IPAddressAccessItem *first); diff --git a/src/core/job.c b/src/core/job.c new file mode 100644 index 0000000..f3c1a02 --- /dev/null +++ b/src/core/job.c @@ -0,0 +1,1698 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> + +#include "sd-id128.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "cgroup.h" +#include "dbus-job.h" +#include "dbus.h" +#include "escape.h" +#include "fileio.h" +#include "job.h" +#include "log.h" +#include "macro.h" +#include "parse-util.h" +#include "serialize.h" +#include "set.h" +#include "sort-util.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "unit.h" +#include "virt.h" + +Job* job_new_raw(Unit *unit) { + Job *j; + + /* used for deserialization */ + + assert(unit); + + j = new(Job, 1); + if (!j) + return NULL; + + *j = (Job) { + .manager = unit->manager, + .unit = unit, + .type = _JOB_TYPE_INVALID, + }; + + return j; +} + +Job* job_new(Unit *unit, JobType type) { + Job *j; + + assert(type < _JOB_TYPE_MAX); + + j = job_new_raw(unit); + if (!j) + return NULL; + + j->id = j->manager->current_job_id++; + j->type = type; + + /* We don't link it here, that's what job_dependency() is for */ + + return j; +} + +void job_unlink(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + if (j->in_run_queue) { + prioq_remove(j->manager->run_queue, j, &j->run_queue_idx); + j->in_run_queue = false; + } + + if (j->in_dbus_queue) { + LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + } + + if (j->in_gc_queue) { + LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j); + j->in_gc_queue = false; + } + + j->timer_event_source = 
sd_event_source_unref(j->timer_event_source); +} + +Job* job_free(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + job_unlink(j); + + sd_bus_track_unref(j->bus_track); + strv_free(j->deserialized_clients); + + return mfree(j); +} + +static void job_set_state(Job *j, JobState state) { + assert(j); + assert(state >= 0); + assert(state < _JOB_STATE_MAX); + + if (j->state == state) + return; + + j->state = state; + + if (!j->installed) + return; + + if (j->state == JOB_RUNNING) + j->unit->manager->n_running_jobs++; + else { + assert(j->state == JOB_WAITING); + assert(j->unit->manager->n_running_jobs > 0); + + j->unit->manager->n_running_jobs--; + + if (j->unit->manager->n_running_jobs <= 0) + j->unit->manager->jobs_in_progress_event_source = sd_event_source_unref(j->unit->manager->jobs_in_progress_event_source); + } +} + +void job_uninstall(Job *j) { + Job **pj; + + assert(j->installed); + + job_set_state(j, JOB_WAITING); + + pj = (j->type == JOB_NOP) ? &j->unit->nop_job : &j->unit->job; + assert(*pj == j); + + /* Detach from next 'bigger' objects */ + + /* daemon-reload should be transparent to job observers */ + if (!MANAGER_IS_RELOADING(j->manager)) + bus_job_send_removed_signal(j); + + *pj = NULL; + + unit_add_to_gc_queue(j->unit); + + unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */ + + hashmap_remove_value(j->manager->jobs, UINT32_TO_PTR(j->id), j); + j->installed = false; +} + +static bool job_type_allows_late_merge(JobType t) { + /* Tells whether it is OK to merge a job of type 't' with an already + * running job. + * Reloads cannot be merged this way. Think of the sequence: + * 1. Reload of a daemon is in progress; the daemon has already loaded + * its config file, but hasn't completed the reload operation yet. + * 2. Edit foo's config file. + * 3. 
Trigger another reload to have the daemon use the new config. + * Should the second reload job be merged into the first one, the daemon + * would not know about the new config. + * JOB_RESTART jobs on the other hand can be merged, because they get + * patched into JOB_START after stopping the unit. So if we see a + * JOB_RESTART running, it means the unit hasn't stopped yet and at + * this time the merge is still allowed. */ + return t != JOB_RELOAD; +} + +static void job_merge_into_installed(Job *j, Job *other) { + assert(j->installed); + assert(j->unit == other->unit); + + if (j->type != JOB_NOP) + assert_se(job_type_merge_and_collapse(&j->type, other->type, j->unit) == 0); + else + assert(other->type == JOB_NOP); + + j->irreversible = j->irreversible || other->irreversible; + j->ignore_order = j->ignore_order || other->ignore_order; +} + +Job* job_install(Job *j) { + Job **pj; + Job *uj; + + assert(!j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(j->state == JOB_WAITING); + + pj = (j->type == JOB_NOP) ? &j->unit->nop_job : &j->unit->job; + uj = *pj; + + if (uj) { + if (job_type_is_conflicting(uj->type, j->type)) + job_finish_and_invalidate(uj, JOB_CANCELED, false, false); + else { + /* not conflicting, i.e. mergeable */ + + if (uj->state == JOB_WAITING || + (job_type_allows_late_merge(j->type) && job_type_is_superset(uj->type, j->type))) { + job_merge_into_installed(uj, j); + log_unit_debug(uj->unit, + "Merged %s/%s into installed job %s/%s as %"PRIu32, + j->unit->id, job_type_to_string(j->type), uj->unit->id, + job_type_to_string(uj->type), uj->id); + return uj; + } else { + /* already running and not safe to merge into */ + /* Patch uj to become a merged job and re-run it. */ + /* XXX It should be safer to queue j to run after uj finishes, but it is + * not currently possible to have more than one installed job per unit. 
*/ + job_merge_into_installed(uj, j); + log_unit_debug(uj->unit, + "Merged into running job, re-running: %s/%s as %"PRIu32, + uj->unit->id, job_type_to_string(uj->type), uj->id); + + job_set_state(uj, JOB_WAITING); + return uj; + } + } + } + + /* Install the job */ + *pj = j; + j->installed = true; + + j->manager->n_installed_jobs++; + log_unit_debug(j->unit, + "Installed new job %s/%s as %u", + j->unit->id, job_type_to_string(j->type), (unsigned) j->id); + + job_add_to_gc_queue(j); + + job_add_to_dbus_queue(j); /* announce this job to clients */ + unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */ + + return j; +} + +int job_install_deserialized(Job *j) { + Job **pj; + int r; + + assert(!j->installed); + + if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION) + return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EINVAL), + "Invalid job type %s in deserialization.", + strna(job_type_to_string(j->type))); + + pj = (j->type == JOB_NOP) ? &j->unit->nop_job : &j->unit->job; + if (*pj) + return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EEXIST), + "Unit already has a job installed. 
Not installing deserialized job."); + + r = hashmap_ensure_allocated(&j->manager->jobs, NULL); + if (r < 0) + return r; + + r = hashmap_put(j->manager->jobs, UINT32_TO_PTR(j->id), j); + if (r == -EEXIST) + return log_unit_debug_errno(j->unit, r, "Job ID %" PRIu32 " already used, cannot deserialize job.", j->id); + if (r < 0) + return log_unit_debug_errno(j->unit, r, "Failed to insert job into jobs hash table: %m"); + + *pj = j; + j->installed = true; + + if (j->state == JOB_RUNNING) + j->unit->manager->n_running_jobs++; + + log_unit_debug(j->unit, + "Reinstalled deserialized job %s/%s as %u", + j->unit->id, job_type_to_string(j->type), (unsigned) j->id); + return 0; +} + +JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts) { + JobDependency *l; + + assert(object); + + /* Adds a new job link, which encodes that the 'subject' job + * needs the 'object' job in some way. If 'subject' is NULL + * this means the 'anchor' job (i.e. the one the user + * explicitly asked for) is the requester. 
*/ + + l = new0(JobDependency, 1); + if (!l) + return NULL; + + l->subject = subject; + l->object = object; + l->matters = matters; + l->conflicts = conflicts; + + if (subject) + LIST_PREPEND(subject, subject->subject_list, l); + + LIST_PREPEND(object, object->object_list, l); + + return l; +} + +void job_dependency_free(JobDependency *l) { + assert(l); + + if (l->subject) + LIST_REMOVE(subject, l->subject->subject_list, l); + + LIST_REMOVE(object, l->object->object_list, l); + + free(l); +} + +void job_dump(Job *j, FILE *f, const char *prefix) { + assert(j); + assert(f); + + prefix = strempty(prefix); + + fprintf(f, + "%s-> Job %u:\n" + "%s\tAction: %s -> %s\n" + "%s\tState: %s\n" + "%s\tIrreversible: %s\n" + "%s\tMay GC: %s\n", + prefix, j->id, + prefix, j->unit->id, job_type_to_string(j->type), + prefix, job_state_to_string(j->state), + prefix, yes_no(j->irreversible), + prefix, yes_no(job_may_gc(j))); +} + +/* + * Merging is commutative, so imagine the matrix as symmetric. We store only + * its lower triangle to avoid duplication. We don't store the main diagonal, + * because A merged with A is simply A. + * + * If the resulting type is collapsed immediately afterwards (to get rid of + * the JOB_RELOAD_OR_START, which lies outside the lookup function's domain), + * the following properties hold: + * + * Merging is associative! A merged with B, and then merged with C is the same + * as A merged with the result of B merged with C. + * + * Mergeability is transitive! If A can be merged with B and B with C then + * A also with C. + * + * Also, if A merged with B cannot be merged with C, then either A or B cannot + * be merged with C either. 
+ */ +static const JobType job_merging_table[] = { +/* What \ With * JOB_START JOB_VERIFY_ACTIVE JOB_STOP JOB_RELOAD */ +/*********************************************************************************/ +/*JOB_START */ +/*JOB_VERIFY_ACTIVE */ JOB_START, +/*JOB_STOP */ -1, -1, +/*JOB_RELOAD */ JOB_RELOAD_OR_START, JOB_RELOAD, -1, +/*JOB_RESTART */ JOB_RESTART, JOB_RESTART, -1, JOB_RESTART, +}; + +JobType job_type_lookup_merge(JobType a, JobType b) { + assert_cc(ELEMENTSOF(job_merging_table) == _JOB_TYPE_MAX_MERGING * (_JOB_TYPE_MAX_MERGING - 1) / 2); + assert(a >= 0 && a < _JOB_TYPE_MAX_MERGING); + assert(b >= 0 && b < _JOB_TYPE_MAX_MERGING); + + if (a == b) + return a; + + if (a < b) { + JobType tmp = a; + a = b; + b = tmp; + } + + return job_merging_table[(a - 1) * a / 2 + b]; +} + +bool job_type_is_redundant(JobType a, UnitActiveState b) { + switch (a) { + + case JOB_START: + return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING); + + case JOB_STOP: + return IN_SET(b, UNIT_INACTIVE, UNIT_FAILED); + + case JOB_VERIFY_ACTIVE: + return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING); + + case JOB_RELOAD: + return + b == UNIT_RELOADING; + + case JOB_RESTART: + return + b == UNIT_ACTIVATING; + + case JOB_NOP: + return true; + + default: + assert_not_reached("Invalid job type"); + } +} + +JobType job_type_collapse(JobType t, Unit *u) { + UnitActiveState s; + + switch (t) { + + case JOB_TRY_RESTART: + s = unit_active_state(u); + if (!UNIT_IS_ACTIVE_OR_RELOADING(s)) + return JOB_NOP; + + return JOB_RESTART; + + case JOB_TRY_RELOAD: + s = unit_active_state(u); + if (!UNIT_IS_ACTIVE_OR_RELOADING(s)) + return JOB_NOP; + + return JOB_RELOAD; + + case JOB_RELOAD_OR_START: + s = unit_active_state(u); + if (!UNIT_IS_ACTIVE_OR_RELOADING(s)) + return JOB_START; + + return JOB_RELOAD; + + default: + return t; + } +} + +int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u) { + JobType t; + + t = job_type_lookup_merge(*a, b); + if (t < 0) + return -EEXIST; + + *a = job_type_collapse(t, 
u); + return 0; +} + +static bool job_is_runnable(Job *j) { + Unit *other; + void *v; + + assert(j); + assert(j->installed); + + /* Checks whether there is any job running for the units this + * job needs to be running after (in the case of a 'positive' + * job type) or before (in the case of a 'negative' job + * type. */ + + /* Note that unit types have a say in what is runnable, + * too. For example, if they return -EAGAIN from + * unit_start() they can indicate they are not + * runnable yet. */ + + /* First check if there is an override */ + if (j->ignore_order) + return true; + + if (j->type == JOB_NOP) + return true; + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER]) + if (other->job && job_compare(j, other->job, UNIT_AFTER) > 0) { + log_unit_debug(j->unit, + "starting held back, waiting for: %s", + other->id); + return false; + } + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE]) + if (other->job && job_compare(j, other->job, UNIT_BEFORE) > 0) { + log_unit_debug(j->unit, + "stopping held back, waiting for: %s", + other->id); + return false; + } + + return true; +} + +static void job_change_type(Job *j, JobType newtype) { + assert(j); + + log_unit_debug(j->unit, + "Converting job %s/%s -> %s/%s", + j->unit->id, job_type_to_string(j->type), + j->unit->id, job_type_to_string(newtype)); + + j->type = newtype; +} + +_pure_ static const char* job_get_begin_status_message_format(Unit *u, JobType t) { + const char *format; + + assert(u); + + if (t == JOB_RELOAD) + return "Reloading %s."; + + assert(IN_SET(t, JOB_START, JOB_STOP)); + + format = UNIT_VTABLE(u)->status_message_formats.starting_stopping[t == JOB_STOP]; + if (format) + return format; + + /* Return generic strings */ + if (t == JOB_START) + return "Starting %s."; + else { + assert(t == JOB_STOP); + return "Stopping %s."; + } +} + +static void job_print_begin_status_message(Unit *u, JobType t) { + const char *format; + + assert(u); + + /* Reload status messages have 
traditionally not been printed to console. */ + if (!IN_SET(t, JOB_START, JOB_STOP)) + return; + + format = job_get_begin_status_message_format(u, t); + + DISABLE_WARNING_FORMAT_NONLITERAL; + unit_status_printf(u, STATUS_TYPE_NORMAL, "", format); + REENABLE_WARNING; +} + +static void job_log_begin_status_message(Unit *u, uint32_t job_id, JobType t) { + const char *format, *mid; + char buf[LINE_MAX]; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (!IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD)) + return; + + if (log_on_console()) /* Skip this if it would only go on the console anyway */ + return; + + /* We log status messages for all units and all operations. */ + + format = job_get_begin_status_message_format(u, t); + + DISABLE_WARNING_FORMAT_NONLITERAL; + (void) snprintf(buf, sizeof buf, format, unit_status_string(u)); + REENABLE_WARNING; + + mid = t == JOB_START ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTING_STR : + t == JOB_STOP ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPING_STR : + "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADING_STR; + + /* Note that we deliberately use LOG_MESSAGE() instead of + * LOG_UNIT_MESSAGE() here, since this is supposed to mimic + * closely what is written to screen using the status output, + * which is supposed the highest level, friendliest output + * possible, which means we should avoid the low-level unit + * name. 
*/ + log_struct(LOG_INFO, + LOG_MESSAGE("%s", buf), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + mid); +} + +static void job_emit_begin_status_message(Unit *u, uint32_t job_id, JobType t) { + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + job_log_begin_status_message(u, job_id, t); + job_print_begin_status_message(u, t); +} + +static int job_perform_on_unit(Job **j) { + uint32_t id; + Manager *m; + JobType t; + Unit *u; + int r; + + /* While we execute this operation the job might go away (for + * example: because it finishes immediately or is replaced by + * a new, conflicting job.) To make sure we don't access a + * freed job later on we store the id here, so that we can + * verify the job is still valid. */ + + assert(j); + assert(*j); + + m = (*j)->manager; + u = (*j)->unit; + t = (*j)->type; + id = (*j)->id; + + switch (t) { + case JOB_START: + r = unit_start(u); + break; + + case JOB_RESTART: + t = JOB_STOP; + _fallthrough_; + case JOB_STOP: + r = unit_stop(u); + break; + + case JOB_RELOAD: + r = unit_reload(u); + break; + + default: + assert_not_reached("Invalid job type"); + } + + /* Log if the job still exists and the start/stop/reload function actually did something. Note that this means + * for units for which there's no 'activating' phase (i.e. because we transition directly from 'inactive' to + * 'active') we'll possibly skip the "Starting..." message. 
*/ + *j = manager_get_job(m, id); + if (*j && r > 0) + job_emit_begin_status_message(u, id, t); + + return r; +} + +int job_run_and_invalidate(Job *j) { + int r; + + assert(j); + assert(j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(j->in_run_queue); + + prioq_remove(j->manager->run_queue, j, &j->run_queue_idx); + j->in_run_queue = false; + + if (j->state != JOB_WAITING) + return 0; + + if (!job_is_runnable(j)) + return -EAGAIN; + + job_start_timer(j, true); + job_set_state(j, JOB_RUNNING); + job_add_to_dbus_queue(j); + + switch (j->type) { + + case JOB_VERIFY_ACTIVE: { + UnitActiveState t; + + t = unit_active_state(j->unit); + if (UNIT_IS_ACTIVE_OR_RELOADING(t)) + r = -EALREADY; + else if (t == UNIT_ACTIVATING) + r = -EAGAIN; + else + r = -EBADR; + break; + } + + case JOB_START: + case JOB_STOP: + case JOB_RESTART: + r = job_perform_on_unit(&j); + + /* If the unit type does not support starting/stopping, then simply wait. */ + if (r == -EBADR) + r = 0; + break; + + case JOB_RELOAD: + r = job_perform_on_unit(&j); + break; + + case JOB_NOP: + r = -EALREADY; + break; + + default: + assert_not_reached("Unknown job type"); + } + + if (j) { + if (r == -EAGAIN) + job_set_state(j, JOB_WAITING); /* Hmm, not ready after all, let's return to JOB_WAITING state */ + else if (r == -EALREADY) /* already being executed */ + r = job_finish_and_invalidate(j, JOB_DONE, true, true); + else if (r == -ECOMM) /* condition failed, but all is good */ + r = job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (r == -EBADR) + r = job_finish_and_invalidate(j, JOB_SKIPPED, true, false); + else if (r == -ENOEXEC) + r = job_finish_and_invalidate(j, JOB_INVALID, true, false); + else if (r == -EPROTO) + r = job_finish_and_invalidate(j, JOB_ASSERT, true, false); + else if (r == -EOPNOTSUPP) + r = job_finish_and_invalidate(j, JOB_UNSUPPORTED, true, false); + else if (r == -ENOLINK) + r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); + else if (r == 
-ESTALE) + r = job_finish_and_invalidate(j, JOB_ONCE, true, false); + else if (r < 0) + r = job_finish_and_invalidate(j, JOB_FAILED, true, false); + } + + return r; +} + +_pure_ static const char *job_get_done_status_message_format(Unit *u, JobType t, JobResult result) { + + static const char *const generic_finished_start_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Started %s.", + [JOB_TIMEOUT] = "Timed out starting %s.", + [JOB_FAILED] = "Failed to start %s.", + [JOB_DEPENDENCY] = "Dependency failed for %s.", + [JOB_ASSERT] = "Assertion failed for %s.", + [JOB_UNSUPPORTED] = "Starting of %s not supported.", + [JOB_COLLECTED] = "Unnecessary job for %s was removed.", + [JOB_ONCE] = "Unit %s has been started before and cannot be started again." + }; + static const char *const generic_finished_stop_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Stopped %s.", + [JOB_FAILED] = "Stopped (with error) %s.", + [JOB_TIMEOUT] = "Timed out stopping %s.", + }; + static const char *const generic_finished_reload_job[_JOB_RESULT_MAX] = { + [JOB_DONE] = "Reloaded %s.", + [JOB_FAILED] = "Reload failed for %s.", + [JOB_TIMEOUT] = "Timed out reloading %s.", + }; + /* When verify-active detects the unit is inactive, report it. + * Most likely a DEPEND warning from a requisiting unit will + * occur next and it's nice to see what was requisited. */ + static const char *const generic_finished_verify_active_job[_JOB_RESULT_MAX] = { + [JOB_SKIPPED] = "%s is not active.", + }; + + const char *format; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (IN_SET(t, JOB_START, JOB_STOP, JOB_RESTART)) { + const UnitStatusMessageFormats *formats = &UNIT_VTABLE(u)->status_message_formats; + if (formats->finished_job) { + format = formats->finished_job(u, t, result); + if (format) + return format; + } + format = t == JOB_START ? 
+ formats->finished_start_job[result] : + formats->finished_stop_job[result]; + if (format) + return format; + } + + /* Return generic strings */ + if (t == JOB_START) + return generic_finished_start_job[result]; + else if (IN_SET(t, JOB_STOP, JOB_RESTART)) + return generic_finished_stop_job[result]; + else if (t == JOB_RELOAD) + return generic_finished_reload_job[result]; + else if (t == JOB_VERIFY_ACTIVE) + return generic_finished_verify_active_job[result]; + + return NULL; +} + +static const struct { + const char *color, *word; +} job_print_done_status_messages[_JOB_RESULT_MAX] = { + [JOB_DONE] = { ANSI_OK_COLOR, " OK " }, + [JOB_TIMEOUT] = { ANSI_HIGHLIGHT_RED, " TIME " }, + [JOB_FAILED] = { ANSI_HIGHLIGHT_RED, "FAILED" }, + [JOB_DEPENDENCY] = { ANSI_HIGHLIGHT_YELLOW, "DEPEND" }, + [JOB_SKIPPED] = { ANSI_HIGHLIGHT, " INFO " }, + [JOB_ASSERT] = { ANSI_HIGHLIGHT_YELLOW, "ASSERT" }, + [JOB_UNSUPPORTED] = { ANSI_HIGHLIGHT_YELLOW, "UNSUPP" }, + /* JOB_COLLECTED */ + [JOB_ONCE] = { ANSI_HIGHLIGHT_RED, " ONCE " }, +}; + +static void job_print_done_status_message(Unit *u, JobType t, JobResult result) { + const char *format; + const char *status; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + /* Reload status messages have traditionally not been printed to console. */ + if (t == JOB_RELOAD) + return; + + /* No message if the job did not actually do anything due to failed condition. */ + if (t == JOB_START && result == JOB_DONE && !u->condition_result) + return; + + if (!job_print_done_status_messages[result].word) + return; + + format = job_get_done_status_message_format(u, t, result); + if (!format) + return; + + if (log_get_show_color()) + status = strjoina(job_print_done_status_messages[result].color, + job_print_done_status_messages[result].word, + ANSI_NORMAL); + else + status = job_print_done_status_messages[result].word; + + DISABLE_WARNING_FORMAT_NONLITERAL; + unit_status_printf(u, + result == JOB_DONE ? 
STATUS_TYPE_NORMAL : STATUS_TYPE_NOTICE, + status, format); + REENABLE_WARNING; + + if (t == JOB_START && result == JOB_FAILED) { + _cleanup_free_ char *quoted; + + quoted = shell_maybe_quote(u->id, ESCAPE_BACKSLASH); + manager_status_printf(u->manager, STATUS_TYPE_NORMAL, NULL, "See 'systemctl status %s' for details.", strna(quoted)); + } +} + +static void job_log_done_status_message(Unit *u, uint32_t job_id, JobType t, JobResult result) { + const char *format, *mid; + char buf[LINE_MAX]; + static const int job_result_log_level[_JOB_RESULT_MAX] = { + [JOB_DONE] = LOG_INFO, + [JOB_CANCELED] = LOG_INFO, + [JOB_TIMEOUT] = LOG_ERR, + [JOB_FAILED] = LOG_ERR, + [JOB_DEPENDENCY] = LOG_WARNING, + [JOB_SKIPPED] = LOG_NOTICE, + [JOB_INVALID] = LOG_INFO, + [JOB_ASSERT] = LOG_WARNING, + [JOB_UNSUPPORTED] = LOG_WARNING, + [JOB_COLLECTED] = LOG_INFO, + [JOB_ONCE] = LOG_ERR, + }; + + assert(u); + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + /* Skip printing if output goes to the console, and job_print_status_message() + will actually print something to the console. */ + if (log_on_console() && job_print_done_status_messages[result].word) + return; + + /* Show condition check message if the job did not actually do anything due to failed condition. */ + if ((t == JOB_START && result == JOB_DONE && !u->condition_result) || + (t == JOB_START && result == JOB_SKIPPED)) { + log_struct(LOG_INFO, + "MESSAGE=Condition check resulted in %s being skipped.", unit_status_string(u), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR); + + return; + } + + format = job_get_done_status_message_format(u, t, result); + if (!format) + return; + + /* The description might be longer than the buffer, but that's OK, + * we'll just truncate it here. 
Note that we use snprintf() rather than + * xsprintf() on purpose here: we are fine with truncation and don't + * consider that an error. */ + DISABLE_WARNING_FORMAT_NONLITERAL; + (void) snprintf(buf, sizeof(buf), format, unit_status_string(u)); + REENABLE_WARNING; + + switch (t) { + + case JOB_START: + if (result == JOB_DONE) + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR; + else + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILED_STR; + break; + + case JOB_RELOAD: + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADED_STR; + break; + + case JOB_STOP: + case JOB_RESTART: + mid = "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPED_STR; + break; + + default: + log_struct(job_result_log_level[result], + LOG_MESSAGE("%s", buf), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u)); + return; + } + + log_struct(job_result_log_level[result], + LOG_MESSAGE("%s", buf), + "JOB_ID=%" PRIu32, job_id, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + mid); +} + +static void job_emit_done_status_message(Unit *u, uint32_t job_id, JobType t, JobResult result) { + assert(u); + + job_log_done_status_message(u, job_id, t, result); + job_print_done_status_message(u, t, result); +} + +static void job_fail_dependencies(Unit *u, UnitDependency d) { + Unit *other; + void *v; + + assert(u); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[d]) { + Job *j = other->job; + + if (!j) + continue; + if (!IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE)) + continue; + + job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); + } +} + +int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) { + Unit *u; + Unit *other; + JobType t; + void *v; + + assert(j); + assert(j->installed); + assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION); + + u = j->unit; + t = j->type; + + j->result = result; + + 
log_unit_debug(u, "Job %" PRIu32 " %s/%s finished, result=%s", + j->id, u->id, job_type_to_string(t), job_result_to_string(result)); + + /* If this job did nothing to the respective unit we don't log the status message */ + if (!already) + job_emit_done_status_message(u, j->id, t, result); + + /* Patch restart jobs so that they become normal start jobs */ + if (result == JOB_DONE && t == JOB_RESTART) { + + job_change_type(j, JOB_START); + job_set_state(j, JOB_WAITING); + + job_add_to_dbus_queue(j); + job_add_to_run_queue(j); + job_add_to_gc_queue(j); + + goto finish; + } + + if (IN_SET(result, JOB_FAILED, JOB_INVALID)) + j->manager->n_failed_jobs++; + + job_uninstall(j); + job_free(j); + + /* Fail depending jobs on failure */ + if (result != JOB_DONE && recursive) { + if (IN_SET(t, JOB_START, JOB_VERIFY_ACTIVE)) { + job_fail_dependencies(u, UNIT_REQUIRED_BY); + job_fail_dependencies(u, UNIT_REQUISITE_OF); + job_fail_dependencies(u, UNIT_BOUND_BY); + } else if (t == JOB_STOP) + job_fail_dependencies(u, UNIT_CONFLICTED_BY); + } + + /* A special check to make sure we take down anything RequisiteOf if we + * aren't active. This is when the verify-active job merges with a + * satisfying job type, and then loses it's invalidation effect, as the + * result there is JOB_DONE for the start job we merged into, while we + * should be failing the depending job if the said unit isn't in fact + * active. Oneshots are an example of this, where going directly from + * activating to inactive is success. + * + * This happens when you use ConditionXYZ= in a unit too, since in that + * case the job completes with the JOB_DONE result, but the unit never + * really becomes active. Note that such a case still involves merging: + * + * A start job waits for something else, and a verify-active comes in + * and merges in the installed job. 
Then, later, when it becomes + * runnable, it finishes with JOB_DONE result as execution on conditions + * not being met is skipped, breaking our dependency semantics. + * + * Also, depending on if start job waits or not, the merging may or may + * not happen (the verify-active job may trigger after it finishes), so + * you get undeterministic results without this check. + */ + if (result == JOB_DONE && recursive && !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) { + if (IN_SET(t, JOB_START, JOB_RELOAD)) + job_fail_dependencies(u, UNIT_REQUISITE_OF); + } + /* Trigger OnFailure dependencies that are not generated by + * the unit itself. We don't treat JOB_CANCELED as failure in + * this context. And JOB_FAILURE is already handled by the + * unit itself. */ + if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) { + log_struct(LOG_NOTICE, + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), + LOG_UNIT_ID(u), + LOG_UNIT_MESSAGE(u, "Job %s/%s failed with result '%s'.", + u->id, + job_type_to_string(t), + job_result_to_string(result))); + + unit_start_on_failure(u); + } + + unit_trigger_notify(u); + +finish: + /* Try to start the next jobs that can be started */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_AFTER]) + if (other->job) { + job_add_to_run_queue(other->job); + job_add_to_gc_queue(other->job); + } + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BEFORE]) + if (other->job) { + job_add_to_run_queue(other->job); + job_add_to_gc_queue(other->job); + } + + manager_check_finished(u->manager); + + return 0; +} + +static int job_dispatch_timer(sd_event_source *s, uint64_t monotonic, void *userdata) { + Job *j = userdata; + Unit *u; + + assert(j); + assert(s == j->timer_event_source); + + log_unit_warning(j->unit, "Job %s/%s timed out.", j->unit->id, job_type_to_string(j->type)); + + u = j->unit; + job_finish_and_invalidate(j, JOB_TIMEOUT, true, false); + + emergency_action(u->manager, u->job_timeout_action, + 
EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN, + u->job_timeout_reboot_arg, -1, "job timed out"); + + return 0; +} + +int job_start_timer(Job *j, bool job_running) { + int r; + usec_t timeout_time, old_timeout_time; + + if (job_running) { + j->begin_running_usec = now(CLOCK_MONOTONIC); + + if (j->unit->job_running_timeout == USEC_INFINITY) + return 0; + + timeout_time = usec_add(j->begin_running_usec, j->unit->job_running_timeout); + + if (j->timer_event_source) { + /* Update only if JobRunningTimeoutSec= results in earlier timeout */ + r = sd_event_source_get_time(j->timer_event_source, &old_timeout_time); + if (r < 0) + return r; + + if (old_timeout_time <= timeout_time) + return 0; + + return sd_event_source_set_time(j->timer_event_source, timeout_time); + } + } else { + if (j->timer_event_source) + return 0; + + j->begin_usec = now(CLOCK_MONOTONIC); + + if (j->unit->job_timeout == USEC_INFINITY) + return 0; + + timeout_time = usec_add(j->begin_usec, j->unit->job_timeout); + } + + r = sd_event_add_time( + j->manager->event, + &j->timer_event_source, + CLOCK_MONOTONIC, + timeout_time, 0, + job_dispatch_timer, j); + if (r < 0) + return r; + + (void) sd_event_source_set_description(j->timer_event_source, "job-start"); + + return 0; +} + +void job_add_to_run_queue(Job *j) { + int r; + + assert(j); + assert(j->installed); + + if (j->in_run_queue) + return; + + if (prioq_isempty(j->manager->run_queue)) { + r = sd_event_source_set_enabled(j->manager->run_queue_event_source, SD_EVENT_ONESHOT); + if (r < 0) + log_warning_errno(r, "Failed to enable job run queue event source, ignoring: %m"); + } + + r = prioq_put(j->manager->run_queue, j, &j->run_queue_idx); + if (r < 0) + log_warning_errno(r, "Failed put job in run queue, ignoring: %m"); + else + j->in_run_queue = true; +} + +void job_add_to_dbus_queue(Job *j) { + assert(j); + assert(j->installed); + + if (j->in_dbus_queue) + return; + + /* We don't check if anybody is subscribed here, since this + * job might just 
have been created and not yet assigned to a + * connection/client. */ + + LIST_PREPEND(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = true; +} + +char *job_dbus_path(Job *j) { + char *p; + + assert(j); + + if (asprintf(&p, "/org/freedesktop/systemd1/job/%"PRIu32, j->id) < 0) + return NULL; + + return p; +} + +int job_serialize(Job *j, FILE *f) { + assert(j); + assert(f); + + (void) serialize_item_format(f, "job-id", "%u", j->id); + (void) serialize_item(f, "job-type", job_type_to_string(j->type)); + (void) serialize_item(f, "job-state", job_state_to_string(j->state)); + (void) serialize_bool(f, "job-irreversible", j->irreversible); + (void) serialize_bool(f, "job-sent-dbus-new-signal", j->sent_dbus_new_signal); + (void) serialize_bool(f, "job-ignore-order", j->ignore_order); + + if (j->begin_usec > 0) + (void) serialize_usec(f, "job-begin", j->begin_usec); + if (j->begin_running_usec > 0) + (void) serialize_usec(f, "job-begin-running", j->begin_running_usec); + + bus_track_serialize(j->bus_track, f, "subscribed"); + + /* End marker */ + fputc('\n', f); + return 0; +} + +int job_deserialize(Job *j, FILE *f) { + int r; + + assert(j); + assert(f); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l, *v; + size_t k; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + return 0; + + l = strstrip(line); + + /* End marker */ + if (isempty(l)) + return 0; + + k = strcspn(l, "="); + + if (l[k] == '=') { + l[k] = 0; + v = l+k+1; + } else + v = l+k; + + if (streq(l, "job-id")) { + + if (safe_atou32(v, &j->id) < 0) + log_debug("Failed to parse job id value: %s", v); + + } else if (streq(l, "job-type")) { + JobType t; + + t = job_type_from_string(v); + if (t < 0) + log_debug("Failed to parse job type: %s", v); + else if (t >= _JOB_TYPE_MAX_IN_TRANSACTION) + log_debug("Cannot deserialize job of type: %s", v); + else + j->type = t; + + } else if (streq(l, 
"job-state")) { + JobState s; + + s = job_state_from_string(v); + if (s < 0) + log_debug("Failed to parse job state: %s", v); + else + job_set_state(j, s); + + } else if (streq(l, "job-irreversible")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job irreversible flag: %s", v); + else + j->irreversible = j->irreversible || b; + + } else if (streq(l, "job-sent-dbus-new-signal")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job sent_dbus_new_signal flag: %s", v); + else + j->sent_dbus_new_signal = j->sent_dbus_new_signal || b; + + } else if (streq(l, "job-ignore-order")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_debug("Failed to parse job ignore_order flag: %s", v); + else + j->ignore_order = j->ignore_order || b; + + } else if (streq(l, "job-begin")) + (void) deserialize_usec(v, &j->begin_usec); + + else if (streq(l, "job-begin-running")) + (void) deserialize_usec(v, &j->begin_running_usec); + + else if (streq(l, "subscribed")) { + if (strv_extend(&j->deserialized_clients, v) < 0) + return log_oom(); + } else + log_debug("Unknown job serialization key: %s", l); + } +} + +int job_coldplug(Job *j) { + int r; + usec_t timeout_time = USEC_INFINITY; + + assert(j); + + /* After deserialization is complete and the bus connection + * set up again, let's start watching our subscribers again */ + (void) bus_job_coldplug_bus_track(j); + + if (j->state == JOB_WAITING) + job_add_to_run_queue(j); + + /* Maybe due to new dependencies we don't actually need this job anymore? */ + job_add_to_gc_queue(j); + + /* Create timer only when job began or began running and the respective timeout is finite. 
+ * Follow logic of job_start_timer() if both timeouts are finite */ + if (j->begin_usec == 0) + return 0; + + if (j->unit->job_timeout != USEC_INFINITY) + timeout_time = usec_add(j->begin_usec, j->unit->job_timeout); + + if (timestamp_is_set(j->begin_running_usec)) + timeout_time = MIN(timeout_time, usec_add(j->begin_running_usec, j->unit->job_running_timeout)); + + if (timeout_time == USEC_INFINITY) + return 0; + + j->timer_event_source = sd_event_source_unref(j->timer_event_source); + + r = sd_event_add_time( + j->manager->event, + &j->timer_event_source, + CLOCK_MONOTONIC, + timeout_time, 0, + job_dispatch_timer, j); + if (r < 0) + log_debug_errno(r, "Failed to restart timeout for job: %m"); + + (void) sd_event_source_set_description(j->timer_event_source, "job-timeout"); + + return r; +} + +void job_shutdown_magic(Job *j) { + assert(j); + + /* The shutdown target gets some special treatment here: we + * tell the kernel to begin with flushing its disk caches, to + * optimize shutdown time a bit. Ideally we wouldn't hardcode + * this magic into PID 1. However all other processes aren't + * options either since they'd exit much sooner than PID 1 and + * asynchronous sync() would cause their exit to be + * delayed. 
*/ + + if (j->type != JOB_START) + return; + + if (!MANAGER_IS_SYSTEM(j->unit->manager)) + return; + + if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET)) + return; + + /* In case messages on console has been disabled on boot */ + j->unit->manager->no_console_output = false; + + if (detect_container() > 0) + return; + + (void) asynchronous_sync(NULL); +} + +int job_get_timeout(Job *j, usec_t *timeout) { + usec_t x = USEC_INFINITY, y = USEC_INFINITY; + Unit *u = j->unit; + int r; + + assert(u); + + if (j->timer_event_source) { + r = sd_event_source_get_time(j->timer_event_source, &x); + if (r < 0) + return r; + } + + if (UNIT_VTABLE(u)->get_timeout) { + r = UNIT_VTABLE(u)->get_timeout(u, &y); + if (r < 0) + return r; + } + + if (x == USEC_INFINITY && y == USEC_INFINITY) + return 0; + + *timeout = MIN(x, y); + return 1; +} + +bool job_may_gc(Job *j) { + Unit *other; + void *v; + + assert(j); + + /* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their + * own and just track external state. For now the only unit type that qualifies for this are .device units. + * Returns true if the job can be collected. */ + + if (!UNIT_VTABLE(j->unit)->gc_jobs) + return false; + + if (sd_bus_track_count(j->bus_track) > 0) + return false; + + /* FIXME: So this is a bit ugly: for now we don't properly track references made via private bus connections + * (because it's nasty, as sd_bus_track doesn't apply to it). We simply remember that the job was once + * referenced by one, and reset this whenever we notice that no private bus connections are around. This means + * the GC is a bit too conservative when it comes to jobs created by private bus connections. 
*/ + if (j->ref_by_private_bus) { + if (set_isempty(j->unit->manager->private_buses)) + j->ref_by_private_bus = false; + else + return false; + } + + if (j->type == JOB_NOP) + return false; + + /* The logic is inverse to job_is_runnable, we cannot GC as long as we block any job. */ + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE]) + if (other->job && job_compare(j, other->job, UNIT_BEFORE) < 0) + return false; + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER]) + if (other->job && job_compare(j, other->job, UNIT_AFTER) < 0) + return false; + + return true; +} + +void job_add_to_gc_queue(Job *j) { + assert(j); + + if (j->in_gc_queue) + return; + + if (!job_may_gc(j)) + return; + + LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j); + j->in_gc_queue = true; +} + +static int job_compare_id(Job * const *a, Job * const *b) { + return CMP((*a)->id, (*b)->id); +} + +static size_t sort_job_list(Job **list, size_t n) { + Job *previous = NULL; + size_t a, b; + + /* Order by numeric IDs */ + typesafe_qsort(list, n, job_compare_id); + + /* Filter out duplicates */ + for (a = 0, b = 0; a < n; a++) { + + if (previous == list[a]) + continue; + + previous = list[b++] = list[a]; + } + + return b; +} + +int job_get_before(Job *j, Job*** ret) { + _cleanup_free_ Job** list = NULL; + size_t n = 0, n_allocated = 0; + Unit *other = NULL; + void *v; + + /* Returns a list of all pending jobs that need to finish before this job may be started. 
*/ + + assert(j); + assert(ret); + + if (j->ignore_order) { + *ret = NULL; + return 0; + } + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER]) { + if (!other->job) + continue; + if (job_compare(j, other->job, UNIT_AFTER) <= 0) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE]) { + if (!other->job) + continue; + if (job_compare(j, other->job, UNIT_BEFORE) <= 0) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + n = sort_job_list(list, n); + + *ret = TAKE_PTR(list); + + return (int) n; +} + +int job_get_after(Job *j, Job*** ret) { + _cleanup_free_ Job** list = NULL; + size_t n = 0, n_allocated = 0; + Unit *other = NULL; + void *v; + + assert(j); + assert(ret); + + /* Returns a list of all pending jobs that are waiting for this job to finish. */ + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_BEFORE]) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (job_compare(j, other->job, UNIT_BEFORE) >= 0) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + HASHMAP_FOREACH_KEY(v, other, j->unit->dependencies[UNIT_AFTER]) { + if (!other->job) + continue; + + if (other->job->ignore_order) + continue; + + if (job_compare(j, other->job, UNIT_AFTER) >= 0) + continue; + + if (!GREEDY_REALLOC(list, n_allocated, n+1)) + return -ENOMEM; + list[n++] = other->job; + } + + n = sort_job_list(list, n); + + *ret = TAKE_PTR(list); + + return (int) n; +} + +static const char* const job_state_table[_JOB_STATE_MAX] = { + [JOB_WAITING] = "waiting", + [JOB_RUNNING] = "running", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_state, JobState); + +static const char* const job_type_table[_JOB_TYPE_MAX] = { + [JOB_START] = "start", + [JOB_VERIFY_ACTIVE] = "verify-active", + [JOB_STOP] = "stop", + 
[JOB_RELOAD] = "reload", + [JOB_RELOAD_OR_START] = "reload-or-start", + [JOB_RESTART] = "restart", + [JOB_TRY_RESTART] = "try-restart", + [JOB_TRY_RELOAD] = "try-reload", + [JOB_NOP] = "nop", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_type, JobType); + +static const char* const job_mode_table[_JOB_MODE_MAX] = { + [JOB_FAIL] = "fail", + [JOB_REPLACE] = "replace", + [JOB_REPLACE_IRREVERSIBLY] = "replace-irreversibly", + [JOB_ISOLATE] = "isolate", + [JOB_FLUSH] = "flush", + [JOB_IGNORE_DEPENDENCIES] = "ignore-dependencies", + [JOB_IGNORE_REQUIREMENTS] = "ignore-requirements", + [JOB_TRIGGERING] = "triggering", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode); + +static const char* const job_result_table[_JOB_RESULT_MAX] = { + [JOB_DONE] = "done", + [JOB_CANCELED] = "canceled", + [JOB_TIMEOUT] = "timeout", + [JOB_FAILED] = "failed", + [JOB_DEPENDENCY] = "dependency", + [JOB_SKIPPED] = "skipped", + [JOB_INVALID] = "invalid", + [JOB_ASSERT] = "assert", + [JOB_UNSUPPORTED] = "unsupported", + [JOB_COLLECTED] = "collected", + [JOB_ONCE] = "once", +}; + +DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult); + +const char* job_type_to_access_method(JobType t) { + assert(t >= 0); + assert(t < _JOB_TYPE_MAX); + + if (IN_SET(t, JOB_START, JOB_RESTART, JOB_TRY_RESTART)) + return "start"; + else if (t == JOB_STOP) + return "stop"; + else + return "reload"; +} + +/* + * assume_dep assumed dependency between units (a is before/after b) + * + * Returns + * 0 jobs are independent, + * >0 a should run after b, + * <0 a should run before b, + * + * The logic means that for a service a and a service b where b.After=a: + * + * start a + start b → 1st step start a, 2nd step start b + * start a + stop b → 1st step stop b, 2nd step start a + * stop a + start b → 1st step stop a, 2nd step start b + * stop a + stop b → 1st step stop b, 2nd step stop a + * + * This has the side effect that restarts are properly + * synchronized too. 
+ */ +int job_compare(Job *a, Job *b, UnitDependency assume_dep) { + assert(a->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(b->type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(IN_SET(assume_dep, UNIT_AFTER, UNIT_BEFORE)); + + /* Trivial cases first */ + if (a->type == JOB_NOP || b->type == JOB_NOP) + return 0; + + if (a->ignore_order || b->ignore_order) + return 0; + + if (assume_dep == UNIT_AFTER) + return -job_compare(b, a, UNIT_BEFORE); + + /* Let's make it simple, JOB_STOP goes always first (in case both ua and ub stop, + * then ub's stop goes first anyway). + * JOB_RESTART is JOB_STOP in disguise (before it is patched to JOB_START). */ + if (IN_SET(b->type, JOB_STOP, JOB_RESTART)) + return 1; + else + return -1; +} diff --git a/src/core/job.h b/src/core/job.h new file mode 100644 index 0000000..1b3ddc7 --- /dev/null +++ b/src/core/job.h @@ -0,0 +1,243 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "sd-event.h" + +#include "list.h" +#include "unit-name.h" + +typedef struct Job Job; +typedef struct JobDependency JobDependency; +typedef enum JobType JobType; +typedef enum JobState JobState; +typedef enum JobMode JobMode; +typedef enum JobResult JobResult; + +/* Be careful when changing the job types! Adjust job_merging_table[] accordingly! */ +enum JobType { + JOB_START, /* if a unit does not support being started, we'll just wait until it becomes active */ + JOB_VERIFY_ACTIVE, + + JOB_STOP, + + JOB_RELOAD, /* if running, reload */ + + /* Note that restarts are first treated like JOB_STOP, but + * then instead of finishing are patched to become + * JOB_START. */ + JOB_RESTART, /* If running, stop. Then start unconditionally. */ + + _JOB_TYPE_MAX_MERGING, + + /* JOB_NOP can enter into a transaction, but as it won't pull in + * any dependencies and it uses the special 'nop_job' slot in Unit, + * it won't have to merge with anything (except possibly into another + * JOB_NOP, previously installed). 
JOB_NOP is special-cased in + * job_type_is_*() functions so that the transaction can be + * activated. */ + JOB_NOP = _JOB_TYPE_MAX_MERGING, /* do nothing */ + + _JOB_TYPE_MAX_IN_TRANSACTION, + + /* JOB_TRY_RESTART can never appear in a transaction, because + * it always collapses into JOB_RESTART or JOB_NOP before entering. + * Thus we never need to merge it with anything. */ + JOB_TRY_RESTART = _JOB_TYPE_MAX_IN_TRANSACTION, /* if running, stop and then start */ + + /* Similar to JOB_TRY_RESTART but collapses to JOB_RELOAD or JOB_NOP */ + JOB_TRY_RELOAD, + + /* JOB_RELOAD_OR_START won't enter into a transaction and cannot result + * from transaction merging (there's no way for JOB_RELOAD and + * JOB_START to meet in one transaction). It can result from a merge + * during job installation, but then it will immediately collapse into + * one of the two simpler types. */ + JOB_RELOAD_OR_START, /* if running, reload, otherwise start */ + + _JOB_TYPE_MAX, + _JOB_TYPE_INVALID = -1 +}; + +enum JobState { + JOB_WAITING, + JOB_RUNNING, + _JOB_STATE_MAX, + _JOB_STATE_INVALID = -1 +}; + +enum JobMode { + JOB_FAIL, /* Fail if a conflicting job is already queued */ + JOB_REPLACE, /* Replace an existing conflicting job */ + JOB_REPLACE_IRREVERSIBLY,/* Like JOB_REPLACE + produce irreversible jobs */ + JOB_ISOLATE, /* Start a unit, and stop all others */ + JOB_FLUSH, /* Flush out all other queued jobs when queueing this one */ + JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */ + JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */ + JOB_TRIGGERING, /* Adds TRIGGERED_BY dependencies to the same transaction */ + _JOB_MODE_MAX, + _JOB_MODE_INVALID = -1 +}; + +enum JobResult { + JOB_DONE, /* Job completed successfully (or skipped due to a failed ConditionXYZ=) */ + JOB_CANCELED, /* Job canceled by a conflicting job installation or by explicit cancel request */ + JOB_TIMEOUT, /* Job timeout elapsed */ + JOB_FAILED, /* Job failed */ + 
JOB_DEPENDENCY, /* A required dependency job did not result in JOB_DONE */ + JOB_SKIPPED, /* Negative result of JOB_VERIFY_ACTIVE or skip due to ExecCondition= */ + JOB_INVALID, /* JOB_RELOAD of inactive unit */ + JOB_ASSERT, /* Couldn't start a unit, because an assert didn't hold */ + JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */ + JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */ + JOB_ONCE, /* Unit was started before, and hence can't be started again */ + _JOB_RESULT_MAX, + _JOB_RESULT_INVALID = -1 +}; + +#include "unit.h" + +struct JobDependency { + /* Encodes that the 'subject' job needs the 'object' job in + * some way. This structure is used only while building a transaction. */ + Job *subject; + Job *object; + + LIST_FIELDS(JobDependency, subject); + LIST_FIELDS(JobDependency, object); + + bool matters:1; + bool conflicts:1; +}; + +struct Job { + Manager *manager; + Unit *unit; + + LIST_FIELDS(Job, transaction); + LIST_FIELDS(Job, dbus_queue); + LIST_FIELDS(Job, gc_queue); + + LIST_HEAD(JobDependency, subject_list); + LIST_HEAD(JobDependency, object_list); + + /* Used for graph algs as a "I have been here" marker */ + Job* marker; + unsigned generation; + + uint32_t id; + + JobType type; + JobState state; + + sd_event_source *timer_event_source; + usec_t begin_usec; + usec_t begin_running_usec; + + /* + * This tracks where to send signals, and also which clients + * are allowed to call DBus methods on the job (other than + * root). + * + * There can be more than one client, because of job merging. 
+ */ + sd_bus_track *bus_track; + char **deserialized_clients; + + JobResult result; + + unsigned run_queue_idx; + + bool installed:1; + bool in_run_queue:1; + bool matters_to_anchor:1; + bool in_dbus_queue:1; + bool sent_dbus_new_signal:1; + bool ignore_order:1; + bool irreversible:1; + bool in_gc_queue:1; + bool ref_by_private_bus:1; +}; + +Job* job_new(Unit *unit, JobType type); +Job* job_new_raw(Unit *unit); +void job_unlink(Job *job); +Job* job_free(Job *job); +Job* job_install(Job *j); +int job_install_deserialized(Job *j); +void job_uninstall(Job *j); +void job_dump(Job *j, FILE *f, const char *prefix); +int job_serialize(Job *j, FILE *f); +int job_deserialize(Job *j, FILE *f); +int job_coldplug(Job *j); + +JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts); +void job_dependency_free(JobDependency *l); + +int job_merge(Job *j, Job *other); + +JobType job_type_lookup_merge(JobType a, JobType b) _pure_; + +_pure_ static inline bool job_type_is_mergeable(JobType a, JobType b) { + return job_type_lookup_merge(a, b) >= 0; +} + +_pure_ static inline bool job_type_is_conflicting(JobType a, JobType b) { + return a != JOB_NOP && b != JOB_NOP && !job_type_is_mergeable(a, b); +} + +_pure_ static inline bool job_type_is_superset(JobType a, JobType b) { + /* Checks whether operation a is a "superset" of b in its actions */ + if (b == JOB_NOP) + return true; + if (a == JOB_NOP) + return false; + return a == job_type_lookup_merge(a, b); +} + +bool job_type_is_redundant(JobType a, UnitActiveState b) _pure_; + +/* Collapses a state-dependent job type into a simpler type by observing + * the state of the unit which it is going to be applied to. 
*/ +JobType job_type_collapse(JobType t, Unit *u); + +int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u); + +void job_add_to_run_queue(Job *j); +void job_add_to_dbus_queue(Job *j); + +int job_start_timer(Job *j, bool job_running); + +int job_run_and_invalidate(Job *j); +int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already); + +char *job_dbus_path(Job *j); + +void job_shutdown_magic(Job *j); + +int job_get_timeout(Job *j, usec_t *timeout) _pure_; + +bool job_may_gc(Job *j); +void job_add_to_gc_queue(Job *j); + +int job_get_before(Job *j, Job*** ret); +int job_get_after(Job *j, Job*** ret); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Job*, job_free); + +const char* job_type_to_string(JobType t) _const_; +JobType job_type_from_string(const char *s) _pure_; + +const char* job_state_to_string(JobState t) _const_; +JobState job_state_from_string(const char *s) _pure_; + +const char* job_mode_to_string(JobMode t) _const_; +JobMode job_mode_from_string(const char *s) _pure_; + +const char* job_result_to_string(JobResult t) _const_; +JobResult job_result_from_string(const char *s) _pure_; + +const char* job_type_to_access_method(JobType t); + +int job_compare(Job *a, Job *b, UnitDependency assume_dep); diff --git a/src/core/kill.c b/src/core/kill.c new file mode 100644 index 0000000..e858ae9 --- /dev/null +++ b/src/core/kill.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "kill.h" +#include "signal-util.h" +#include "string-table.h" +#include "util.h" + +void kill_context_init(KillContext *c) { + assert(c); + + c->kill_signal = SIGTERM; + /* restart_kill_signal is unset by default and we fall back to kill_signal */ + c->final_kill_signal = SIGKILL; + c->send_sigkill = true; + c->send_sighup = false; + c->watchdog_signal = SIGABRT; +} + +void kill_context_dump(KillContext *c, FILE *f, const char *prefix) { + assert(c); + + prefix = strempty(prefix); + + fprintf(f, + "%sKillMode: %s\n" + "%sKillSignal: SIG%s\n" 
+ "%sRestartKillSignal: SIG%s\n" + "%sFinalKillSignal: SIG%s\n" + "%sSendSIGKILL: %s\n" + "%sSendSIGHUP: %s\n", + prefix, kill_mode_to_string(c->kill_mode), + prefix, signal_to_string(c->kill_signal), + prefix, signal_to_string(restart_kill_signal(c)), + prefix, signal_to_string(c->final_kill_signal), + prefix, yes_no(c->send_sigkill), + prefix, yes_no(c->send_sighup)); +} + +static const char* const kill_mode_table[_KILL_MODE_MAX] = { + [KILL_CONTROL_GROUP] = "control-group", + [KILL_PROCESS] = "process", + [KILL_MIXED] = "mixed", + [KILL_NONE] = "none", +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_mode, KillMode); + +static const char* const kill_who_table[_KILL_WHO_MAX] = { + [KILL_MAIN] = "main", + [KILL_CONTROL] = "control", + [KILL_ALL] = "all", + [KILL_MAIN_FAIL] = "main-fail", + [KILL_CONTROL_FAIL] = "control-fail", + [KILL_ALL_FAIL] = "all-fail", +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho); diff --git a/src/core/kill.h b/src/core/kill.h new file mode 100644 index 0000000..012e433 --- /dev/null +++ b/src/core/kill.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct KillContext KillContext; + +#include <stdbool.h> +#include <stdio.h> + +#include "macro.h" + +typedef enum KillMode { + /* The kill mode is a property of a unit. 
*/ + KILL_CONTROL_GROUP = 0, + KILL_PROCESS, + KILL_MIXED, + KILL_NONE, + _KILL_MODE_MAX, + _KILL_MODE_INVALID = -1 +} KillMode; + +struct KillContext { + KillMode kill_mode; + int kill_signal; + int restart_kill_signal; + int final_kill_signal; + int watchdog_signal; + bool send_sigkill; + bool send_sighup; +}; + +typedef enum KillWho { + /* Kill who is a property of an operation */ + KILL_MAIN, + KILL_CONTROL, + KILL_ALL, + KILL_MAIN_FAIL, + KILL_CONTROL_FAIL, + KILL_ALL_FAIL, + _KILL_WHO_MAX, + _KILL_WHO_INVALID = -1 +} KillWho; + +void kill_context_init(KillContext *c); +void kill_context_dump(KillContext *c, FILE *f, const char *prefix); + +const char *kill_mode_to_string(KillMode k) _const_; +KillMode kill_mode_from_string(const char *s) _pure_; + +const char *kill_who_to_string(KillWho k) _const_; +KillWho kill_who_from_string(const char *s) _pure_; + +static inline int restart_kill_signal(const KillContext *c) { + if (c->restart_kill_signal != 0) + return c->restart_kill_signal; + return c->kill_signal; +} diff --git a/src/core/killall.c b/src/core/killall.c new file mode 100644 index 0000000..6f60f09 --- /dev/null +++ b/src/core/killall.c @@ -0,0 +1,283 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include <errno.h> +#include <signal.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "def.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "killall.h" +#include "parse-util.h" +#include "process-util.h" +#include "set.h" +#include "string-util.h" +#include "terminal-util.h" +#include "util.h" + +static bool ignore_proc(pid_t pid, bool warn_rootfs) { + _cleanup_fclose_ FILE *f = NULL; + const char *p; + char c = 0; + uid_t uid; + int r; + + /* We are PID 1, let's not commit suicide */ + if (pid <= 1) + return true; + + /* Ignore kernel threads */ + r = is_kernel_thread(pid); + if (r != 0) + return true; /* also ignore 
processes where we can't determine this */ + + r = get_process_uid(pid, &uid); + if (r < 0) + return true; /* not really, but better safe than sorry */ + + /* Non-root processes otherwise are always subject to be killed */ + if (uid != 0) + return false; + + p = procfs_file_alloca(pid, "cmdline"); + f = fopen(p, "re"); + if (!f) + return true; /* not really, but has the desired effect */ + + /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for + * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as + * actual kernel threads are already filtered out above. */ + (void) fread(&c, 1, 1, f); + + /* Processes with argv[0][0] = '@' we ignore from the killing spree. + * + * https://systemd.io/ROOT_STORAGE_DAEMONS */ + if (c != '@') + return false; + + if (warn_rootfs && + pid_from_same_root_fs(pid) == 0) { + + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(pid, &comm); + + log_notice("Process " PID_FMT " (%s) has been marked to be excluded from killing. It is " + "running from the root file system, and thus likely to block re-mounting of the " + "root file system to read-only. 
Please consider moving it into an initrd file " + "system instead.", pid, strna(comm)); + } + + return true; +} + +static void log_children_no_yet_killed(Set *pids) { + _cleanup_free_ char *lst_child = NULL; + void *p; + + SET_FOREACH(p, pids) { + _cleanup_free_ char *s = NULL; + + if (get_process_comm(PTR_TO_PID(p), &s) < 0) + (void) asprintf(&s, PID_FMT, PTR_TO_PID(p)); + + if (!strextend(&lst_child, ", ", s, NULL)) { + log_oom(); + return; + } + } + + if (isempty(lst_child)) + return; + + log_warning("Waiting for process: %s", lst_child + 2); +} + +static int wait_for_children(Set *pids, sigset_t *mask, usec_t timeout) { + usec_t until, date_log_child, n; + + assert(mask); + + /* Return the number of children remaining in the pids set: That correspond to the number + * of processes still "alive" after the timeout */ + + if (set_isempty(pids)) + return 0; + + n = now(CLOCK_MONOTONIC); + until = usec_add(n, timeout); + date_log_child = usec_add(n, 10u * USEC_PER_SEC); + if (date_log_child > until) + date_log_child = usec_add(n, timeout / 2u); + + for (;;) { + struct timespec ts; + int k; + void *p; + + /* First, let the kernel inform us about killed + * children. Most processes will probably be our + * children, but some are not (might be our + * grandchildren instead...). */ + for (;;) { + pid_t pid; + + pid = waitpid(-1, NULL, WNOHANG); + if (pid == 0) + break; + if (pid < 0) { + if (errno == ECHILD) + break; + + return log_error_errno(errno, "waitpid() failed: %m"); + } + + (void) set_remove(pids, PID_TO_PTR(pid)); + } + + /* Now explicitly check who might be remaining, who + * might not be our child. */ + SET_FOREACH(p, pids) { + + /* kill(pid, 0) sends no signal, but it tells + * us whether the process still exists. 
*/ + if (kill(PTR_TO_PID(p), 0) == 0) + continue; + + if (errno != ESRCH) + continue; + + set_remove(pids, p); + } + + if (set_isempty(pids)) + return 0; + + n = now(CLOCK_MONOTONIC); + if (date_log_child > 0 && n >= date_log_child) { + log_children_no_yet_killed(pids); + /* Log the children not yet killed only once */ + date_log_child = 0; + } + + if (n >= until) + return set_size(pids); + + if (date_log_child > 0) + timespec_store(&ts, MIN(until - n, date_log_child - n)); + else + timespec_store(&ts, until - n); + + k = sigtimedwait(mask, NULL, &ts); + if (k != SIGCHLD) { + + if (k < 0 && errno != EAGAIN) + return log_error_errno(errno, "sigtimedwait() failed: %m"); + + if (k >= 0) + log_warning("sigtimedwait() returned unexpected signal."); + } + } +} + +static int killall(int sig, Set *pids, bool send_sighup) { + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *d; + int n_killed = 0; + + /* Send the specified signal to all remaining processes, if not excluded by ignore_proc(). + * Returns the number of processes to which the specified signal was sent */ + + dir = opendir("/proc"); + if (!dir) + return log_warning_errno(errno, "opendir(/proc) failed: %m"); + + FOREACH_DIRENT_ALL(d, dir, break) { + pid_t pid; + int r; + + if (!IN_SET(d->d_type, DT_DIR, DT_UNKNOWN)) + continue; + + if (parse_pid(d->d_name, &pid) < 0) + continue; + + if (ignore_proc(pid, sig == SIGKILL && !in_initrd())) + continue; + + if (sig == SIGKILL) { + _cleanup_free_ char *s = NULL; + + (void) get_process_comm(pid, &s); + log_notice("Sending SIGKILL to PID "PID_FMT" (%s).", pid, strna(s)); + } + + if (kill(pid, sig) >= 0) { + n_killed++; + if (pids) { + r = set_put(pids, PID_TO_PTR(pid)); + if (r < 0) + log_oom(); + } + } else if (errno != ENOENT) + log_warning_errno(errno, "Could not kill %d: %m", pid); + + if (send_sighup) { + /* Optionally, also send a SIGHUP signal, but + only if the process has a controlling + tty. 
This is useful to allow handling of + shells which ignore SIGTERM but react to + SIGHUP. We do not send this to processes that + have no controlling TTY since we don't want to + trigger reloads of daemon processes. Also we + make sure to only send this after SIGTERM so + that SIGTERM is always first in the queue. */ + + if (get_ctty_devnr(pid, NULL) >= 0) + /* it's OK if the process is gone, just ignore the result */ + (void) kill(pid, SIGHUP); + } + } + + return n_killed; +} + +int broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout) { + int n_children_left; + sigset_t mask, oldmask; + _cleanup_set_free_ Set *pids = NULL; + + /* Send the specified signal to all remaining processes, if not excluded by ignore_proc(). + * Return: + * - The number of processes still "alive" after the timeout (that should have been killed) + * if the function needs to wait for the end of the processes (wait_for_exit). + * - Otherwise, the number of processes to which the specified signal was sent */ + + if (wait_for_exit) + pids = set_new(NULL); + + assert_se(sigemptyset(&mask) == 0); + assert_se(sigaddset(&mask, SIGCHLD) == 0); + assert_se(sigprocmask(SIG_BLOCK, &mask, &oldmask) == 0); + + if (kill(-1, SIGSTOP) < 0 && errno != ESRCH) + log_warning_errno(errno, "kill(-1, SIGSTOP) failed: %m"); + + n_children_left = killall(sig, pids, send_sighup); + + if (kill(-1, SIGCONT) < 0 && errno != ESRCH) + log_warning_errno(errno, "kill(-1, SIGCONT) failed: %m"); + + if (wait_for_exit && n_children_left > 0) + n_children_left = wait_for_children(pids, &mask, timeout); + + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0); + + return n_children_left; +} diff --git a/src/core/killall.h b/src/core/killall.h new file mode 100644 index 0000000..d8ef96f --- /dev/null +++ b/src/core/killall.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "time-util.h" + +int broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, 
usec_t timeout); diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c new file mode 100644 index 0000000..8a7f828 --- /dev/null +++ b/src/core/kmod-setup.c @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <ftw.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bus-util.h" +#include "capability-util.h" +#include "fileio.h" +#include "kmod-setup.h" +#include "macro.h" +#include "string-util.h" + +#if HAVE_KMOD +#include "module-util.h" + +static void systemd_kmod_log( + void *data, + int priority, + const char *file, int line, + const char *fn, + const char *format, + va_list args) { + + /* library logging is enabled at debug only */ + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_DEBUG, 0, file, line, fn, format, args); + REENABLE_WARNING; +} + +static int has_virtio_rng_nftw_cb( + const char *fpath, + const struct stat *sb, + int tflag, + struct FTW *ftwbuf) { + + _cleanup_free_ char *alias = NULL; + int r; + + if ((FTW_D == tflag) && (ftwbuf->level > 2)) + return FTW_SKIP_SUBTREE; + + if (FTW_F != tflag) + return FTW_CONTINUE; + + if (!endswith(fpath, "/modalias")) + return FTW_CONTINUE; + + r = read_one_line_file(fpath, &alias); + if (r < 0) + return FTW_SKIP_SIBLINGS; + + if (startswith(alias, "pci:v00001AF4d00001005")) + return FTW_STOP; + + if (startswith(alias, "pci:v00001AF4d00001044")) + return FTW_STOP; + + return FTW_SKIP_SIBLINGS; +} + +static bool has_virtio_rng(void) { + return (nftw("/sys/devices/pci0000:00", has_virtio_rng_nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL) == FTW_STOP); +} +#endif + +int kmod_setup(void) { +#if HAVE_KMOD + + static const struct { + const char *module; + const char *path; + bool warn_if_unavailable:1; + bool warn_if_module:1; + bool (*condition_fn)(void); + } kmod_table[] = { + /* This one we need to load explicitly, since auto-loading on use doesn't work + * before udev created the ghost device nodes, and we need it earlier than that. 
*/ + { "autofs4", "/sys/class/misc/autofs", true, false, NULL }, + + /* This one we need to load explicitly, since auto-loading of IPv6 is not done when + * we try to configure ::1 on the loopback device. */ + { "ipv6", "/sys/module/ipv6", false, true, NULL }, + + /* This should never be a module */ + { "unix", "/proc/net/unix", true, true, NULL }, + +#if HAVE_LIBIPTC + /* netfilter is needed by networkd, nspawn among others, and cannot be autoloaded */ + { "ip_tables", "/proc/net/ip_tables_names", false, false, NULL }, +#endif + /* virtio_rng would be loaded by udev later, but real entropy might be needed very early */ + { "virtio_rng", NULL, false, false, has_virtio_rng }, + }; + _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL; + unsigned i; + + if (have_effective_cap(CAP_SYS_MODULE) == 0) + return 0; + + for (i = 0; i < ELEMENTSOF(kmod_table); i++) { + if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0) + continue; + + if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn()) + continue; + + if (kmod_table[i].warn_if_module) + log_debug("Your kernel apparently lacks built-in %s support. Might be " + "a good idea to compile it in. 
We'll now try to work around " + "this by loading the module...", kmod_table[i].module); + + if (!ctx) { + ctx = kmod_new(NULL, NULL); + if (!ctx) + return log_oom(); + + kmod_set_log_fn(ctx, systemd_kmod_log, NULL); + kmod_load_resources(ctx); + } + + (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable); + } + +#endif + return 0; +} diff --git a/src/core/kmod-setup.h b/src/core/kmod-setup.h new file mode 100644 index 0000000..1c842d3 --- /dev/null +++ b/src/core/kmod-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int kmod_setup(void); diff --git a/src/core/load-dropin.c b/src/core/load-dropin.c new file mode 100644 index 0000000..d1c85e2 --- /dev/null +++ b/src/core/load-dropin.c @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "fs-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static int process_deps(Unit *u, UnitDependency dependency, const char *dir_suffix) { + _cleanup_strv_free_ char **paths = NULL; + char **p; + int r; + + r = unit_file_find_dropin_paths(NULL, + u->manager->lookup_paths.search_path, + u->manager->unit_path_cache, + dir_suffix, NULL, + u->id, u->aliases, + &paths); + if (r < 0) + return r; + + STRV_FOREACH(p, paths) { + _cleanup_free_ char *target = NULL; + const char *entry; + + entry = basename(*p); + + if (null_or_empty_path(*p) > 0) { + /* an error usually means an invalid symlink, which is not a mask */ + log_unit_debug(u, "%s dependency on %s is masked by %s, ignoring.", + unit_dependency_to_string(dependency), entry, *p); + continue; + } + + r = is_symlink(*p); + if (r < 0) { + log_unit_warning_errno(u, r, "%s dropin %s unreadable, ignoring: %m", + unit_dependency_to_string(dependency), *p); + continue; + } + if (r == 0) { + log_unit_warning(u, "%s 
dependency dropin %s is not a symlink, ignoring.", + unit_dependency_to_string(dependency), *p); + continue; + } + + if (!unit_name_is_valid(entry, UNIT_NAME_ANY)) { + log_unit_warning(u, "%s dependency dropin %s is not a valid unit name, ignoring.", + unit_dependency_to_string(dependency), *p); + continue; + } + + r = readlink_malloc(*p, &target); + if (r < 0) { + log_unit_warning_errno(u, r, "readlink(\"%s\") failed, ignoring: %m", *p); + continue; + } + + /* We don't treat this as an error, especially because we didn't check this for a + * long time. Nevertheless, we warn, because such mismatch can be mighty confusing. */ + r = unit_symlink_name_compatible(entry, basename(target), u->instance); + if (r < 0) { + log_unit_warning_errno(u, r, "Can't check if names %s and %s are compatible, ignoring: %m", + entry, basename(target)); + continue; + } + if (r == 0) + log_unit_warning(u, "%s dependency dropin %s target %s has different name", + unit_dependency_to_string(dependency), *p, target); + + r = unit_add_dependency_by_name(u, dependency, entry, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add %s dependency on %s, ignoring: %m", + unit_dependency_to_string(dependency), entry); + } + + return 0; +} + /* Loads the drop-in configuration of unit u: first the .wants and .requires dependency directories via process_deps(), then every path reported by unit_find_dropin_paths(), each of which is handed to config_parse(). Returns the negative result of process_deps() unchanged, the result of log_oom() if extending u->dropin_paths fails, and 0 in every other case. */ +int unit_load_dropin(Unit *u) { + _cleanup_strv_free_ char **l = NULL; + char **f; + int r; + + assert(u); + + /* Load dependencies from .wants and .requires directories; a negative result from either call is returned to the caller as-is. */ + r = process_deps(u, UNIT_WANTS, ".wants"); + if (r < 0) + return r; + + r = process_deps(u, UNIT_REQUIRES, ".requires"); + if (r < 0) + return r; + + /* Load .conf dropins. Any lookup result <= 0 ends the function with 0, so a negative lookup result is not propagated from here (NOTE(review): presumably deliberate best-effort behaviour, confirm before changing). */ + r = unit_find_dropin_paths(u, &l); + if (r <= 0) + return 0; + /* Store l in u->dropin_paths via TAKE_PTR() when the unit has no list yet, otherwise extend the existing list with l (third argument true; presumably duplicate filtering, confirm against strv_extend_strv()). */ + if (!u->dropin_paths) + u->dropin_paths = TAKE_PTR(l); + else { + r = strv_extend_strv(&u->dropin_paths, l, true); + if (r < 0) + return log_oom(); + } + /* Parse every entry of u->dropin_paths, not only the entries found in this call; the config_parse() result is explicitly discarded with (void), and the address of u->dropin_mtime is passed as the last argument. */ + STRV_FOREACH(f, u->dropin_paths) + (void) config_parse( + u->id, *f, NULL, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + 0, + u, 
+ &u->dropin_mtime); + + return 0; +} diff --git a/src/core/load-dropin.h b/src/core/load-dropin.h new file mode 100644 index 0000000..f0b87d3 --- /dev/null +++ b/src/core/load-dropin.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "dropin.h" +#include "unit.h" + +/* Read service data supplementary drop-in directories. unit_find_dropin_paths() asserts that u is set and forwards to unit_file_find_dropin_paths() with NULL as the first argument, the search path and unit path cache taken from u->manager, the literal suffixes .d and .conf, and the id and aliases of u. The paths argument is handed through unchanged and the return value of the callee is returned as-is. */ + +static inline int unit_find_dropin_paths(Unit *u, char ***paths) { + assert(u); + + return unit_file_find_dropin_paths(NULL, + u->manager->lookup_paths.search_path, + u->manager->unit_path_cache, + ".d", ".conf", + u->id, u->aliases, + paths); +} + +int unit_load_dropin(Unit *u); diff --git a/src/core/load-fragment-gperf-nulstr.awk b/src/core/load-fragment-gperf-nulstr.awk new file mode 100644 index 0000000..44bc1fb --- /dev/null +++ b/src/core/load-fragment-gperf-nulstr.awk @@ -0,0 +1,14 @@ +BEGIN{ + keywords=0 ; FS="," ; + print "extern const char load_fragment_gperf_nulstr[];" ; + print "const char load_fragment_gperf_nulstr[] =" +} +keyword==1 { + print "\"" $1 "\\0\"" +} +/%%/ { + keyword=1 +} +END { + print ";" +} diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 new file mode 100644 index 0000000..946862c --- /dev/null +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -0,0 +1,509 @@ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include <stddef.h> +#include "all-units.h" +#include "conf-parser.h" +#include "load-fragment.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name load_fragment_gperf_hash +%define lookup-function-name load_fragment_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +m4_dnl Define the context options only once +m4_define(`EXEC_CONTEXT_CONFIG_ITEMS', +`$1.WorkingDirectory, config_parse_working_directory, 0, offsetof($1, exec_context) +$1.RootDirectory, 
config_parse_unit_path_printf, true, offsetof($1, exec_context.root_directory) +$1.RootImage, config_parse_unit_path_printf, true, offsetof($1, exec_context.root_image) +$1.RootImageOptions, config_parse_root_image_options, 0, offsetof($1, exec_context) +$1.RootHash, config_parse_exec_root_hash, 0, offsetof($1, exec_context) +$1.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof($1, exec_context) +$1.RootVerity, config_parse_unit_path_printf, true, offsetof($1, exec_context.root_verity) +$1.MountImages, config_parse_mount_images, 0, offsetof($1, exec_context) +$1.User, config_parse_user_group_compat, 0, offsetof($1, exec_context.user) +$1.Group, config_parse_user_group_compat, 0, offsetof($1, exec_context.group) +$1.SupplementaryGroups, config_parse_user_group_strv_compat, 0, offsetof($1, exec_context.supplementary_groups) +$1.Nice, config_parse_exec_nice, 0, offsetof($1, exec_context) +$1.OOMScoreAdjust, config_parse_exec_oom_score_adjust, 0, offsetof($1, exec_context) +$1.CoredumpFilter, config_parse_exec_coredump_filter, 0, offsetof($1, exec_context) +$1.IOSchedulingClass, config_parse_exec_io_class, 0, offsetof($1, exec_context) +$1.IOSchedulingPriority, config_parse_exec_io_priority, 0, offsetof($1, exec_context) +$1.CPUSchedulingPolicy, config_parse_exec_cpu_sched_policy, 0, offsetof($1, exec_context) +$1.CPUSchedulingPriority, config_parse_exec_cpu_sched_prio, 0, offsetof($1, exec_context) +$1.CPUSchedulingResetOnFork, config_parse_bool, 0, offsetof($1, exec_context.cpu_sched_reset_on_fork) +$1.CPUAffinity, config_parse_exec_cpu_affinity, 0, offsetof($1, exec_context) +$1.NUMAPolicy, config_parse_numa_policy, 0, offsetof($1, exec_context.numa_policy.type) +$1.NUMAMask, config_parse_numa_mask, 0, offsetof($1, exec_context.numa_policy) +$1.UMask, config_parse_mode, 0, offsetof($1, exec_context.umask) +$1.Environment, config_parse_environ, 0, offsetof($1, exec_context.environment) +$1.EnvironmentFile, config_parse_unit_env_file, 0, offsetof($1, 
exec_context.environment_files) +$1.PassEnvironment, config_parse_pass_environ, 0, offsetof($1, exec_context.pass_environment) +$1.UnsetEnvironment, config_parse_unset_environ, 0, offsetof($1, exec_context.unset_environment) +$1.DynamicUser, config_parse_bool, true, offsetof($1, exec_context.dynamic_user) +$1.RemoveIPC, config_parse_bool, 0, offsetof($1, exec_context.remove_ipc) +$1.StandardInput, config_parse_exec_input, 0, offsetof($1, exec_context) +$1.StandardOutput, config_parse_exec_output, 0, offsetof($1, exec_context) +$1.StandardError, config_parse_exec_output, 0, offsetof($1, exec_context) +$1.StandardInputText, config_parse_exec_input_text, 0, offsetof($1, exec_context) +$1.StandardInputData, config_parse_exec_input_data, 0, offsetof($1, exec_context) +$1.TTYPath, config_parse_unit_path_printf, 0, offsetof($1, exec_context.tty_path) +$1.TTYReset, config_parse_bool, 0, offsetof($1, exec_context.tty_reset) +$1.TTYVHangup, config_parse_bool, 0, offsetof($1, exec_context.tty_vhangup) +$1.TTYVTDisallocate, config_parse_bool, 0, offsetof($1, exec_context.tty_vt_disallocate) +$1.SyslogIdentifier, config_parse_unit_string_printf, 0, offsetof($1, exec_context.syslog_identifier) +$1.SyslogFacility, config_parse_log_facility, 0, offsetof($1, exec_context.syslog_priority) +$1.SyslogLevel, config_parse_log_level, 0, offsetof($1, exec_context.syslog_priority) +$1.SyslogLevelPrefix, config_parse_bool, 0, offsetof($1, exec_context.syslog_level_prefix) +$1.LogLevelMax, config_parse_log_level, 0, offsetof($1, exec_context.log_level_max) +$1.LogRateLimitIntervalSec, config_parse_sec, 0, offsetof($1, exec_context.log_ratelimit_interval_usec) +$1.LogRateLimitBurst, config_parse_unsigned, 0, offsetof($1, exec_context.log_ratelimit_burst) +$1.LogExtraFields, config_parse_log_extra_fields, 0, offsetof($1, exec_context) +$1.Capabilities, config_parse_warn_compat, DISABLED_LEGACY, offsetof($1, exec_context) +$1.SecureBits, config_parse_exec_secure_bits, 0, offsetof($1, 
exec_context.secure_bits) +$1.CapabilityBoundingSet, config_parse_capability_set, 0, offsetof($1, exec_context.capability_bounding_set) +$1.AmbientCapabilities, config_parse_capability_set, 0, offsetof($1, exec_context.capability_ambient_set) +$1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec) +$1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges) +$1.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof($1, exec_context.keyring_mode) +$1.ProtectProc, config_parse_protect_proc, 0, offsetof($1, exec_context.protect_proc) +$1.ProcSubset, config_parse_proc_subset, 0, offsetof($1, exec_context.proc_subset) +m4_ifdef(`HAVE_SECCOMP', +`$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context) +$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) +$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context) +$1.SystemCallLog, config_parse_syscall_log, 0, offsetof($1, exec_context) +$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) +$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context) +$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) +$1.RestrictSUIDSGID, config_parse_bool, 0, offsetof($1, exec_context.restrict_suid_sgid) +$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context) +$1.LockPersonality, config_parse_bool, 0, offsetof($1, exec_context.lock_personality)', +`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.SystemCallLog, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 
+$1.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictSUIDSGID, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +$1.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof($1, exec_context.rlimit) +$1.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit) +$1.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof($1, exec_context.rlimit) +$1.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof($1, exec_context.rlimit) +$1.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof($1, exec_context.rlimit) +$1.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof($1, exec_context.rlimit) +$1.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof($1, exec_context.rlimit) +$1.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof($1, exec_context.rlimit) +$1.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof($1, exec_context.rlimit) +$1.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof($1, exec_context.rlimit) +$1.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof($1, exec_context.rlimit) +$1.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof($1, exec_context.rlimit) +$1.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof($1, exec_context.rlimit) +$1.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof($1, exec_context.rlimit) +$1.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof($1, exec_context.rlimit) +$1.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof($1, exec_context.rlimit) +$1.ReadWriteDirectories, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_write_paths) +$1.ReadOnlyDirectories, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths) +$1.InaccessibleDirectories, 
config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) +$1.ReadWritePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_write_paths) +$1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths) +$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) +$1.BindPaths, config_parse_bind_paths, 0, offsetof($1, exec_context) +$1.BindReadOnlyPaths, config_parse_bind_paths, 0, offsetof($1, exec_context) +$1.TemporaryFileSystem, config_parse_temporary_filesystems, 0, offsetof($1, exec_context) +$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) +$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) +$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) +$1.ProtectKernelLogs, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_logs) +$1.ProtectClock, config_parse_bool, 0, offsetof($1, exec_context.protect_clock) +$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) +$1.NetworkNamespacePath, config_parse_unit_path_printf, 0, offsetof($1, exec_context.network_namespace_path) +$1.LogNamespace, config_parse_log_namespace, 0, offsetof($1, exec_context) +$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) +$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) +$1.PrivateMounts, config_parse_bool, 0, offsetof($1, exec_context.private_mounts) +$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context.protect_system) +$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context.protect_home) +$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context.mount_flags) +$1.MountAPIVFS, config_parse_bool, 
0, offsetof($1, exec_context.mount_apivfs) +$1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality) +$1.RuntimeDirectoryPreserve, config_parse_runtime_preserve_mode, 0, offsetof($1, exec_context.runtime_directory_preserve_mode) +$1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode) +$1.RuntimeDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_RUNTIME].paths) +$1.StateDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_STATE].mode) +$1.StateDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_STATE].paths) +$1.CacheDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CACHE].mode) +$1.CacheDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CACHE].paths) +$1.LogsDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_LOGS].mode) +$1.LogsDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_LOGS].paths) +$1.ConfigurationDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].mode) +$1.ConfigurationDirectory, config_parse_exec_directories, 0, offsetof($1, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].paths) +$1.SetCredential, config_parse_set_credential, 0, offsetof($1, exec_context) +$1.LoadCredential, config_parse_load_credential, 0, offsetof($1, exec_context) +$1.TimeoutCleanSec, config_parse_sec, 0, offsetof($1, exec_context.timeout_clean_usec) +$1.ProtectHostname, config_parse_bool, 0, offsetof($1, exec_context.protect_hostname) +m4_ifdef(`HAVE_PAM', +`$1.PAMName, config_parse_unit_string_printf, 0, offsetof($1, exec_context.pam_name)', +`$1.PAMName, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +$1.IgnoreSIGPIPE, 
config_parse_bool, 0, offsetof($1, exec_context.ignore_sigpipe) +$1.UtmpIdentifier, config_parse_unit_string_printf, 0, offsetof($1, exec_context.utmp_id) +$1.UtmpMode, config_parse_exec_utmp_mode, 0, offsetof($1, exec_context.utmp_mode) +m4_ifdef(`HAVE_SELINUX', +`$1.SELinuxContext, config_parse_exec_selinux_context, 0, offsetof($1, exec_context)', +`$1.SELinuxContext, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +m4_ifdef(`HAVE_APPARMOR', +`$1.AppArmorProfile, config_parse_exec_apparmor_profile, 0, offsetof($1, exec_context)', +`$1.AppArmorProfile, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +m4_ifdef(`ENABLE_SMACK', +`$1.SmackProcessLabel, config_parse_exec_smack_process_label, 0, offsetof($1, exec_context)', +`$1.SmackProcessLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')' +)m4_dnl +m4_define(`KILL_CONTEXT_CONFIG_ITEMS', +`$1.SendSIGKILL, config_parse_bool, 0, offsetof($1, kill_context.send_sigkill) +$1.SendSIGHUP, config_parse_bool, 0, offsetof($1, kill_context.send_sighup) +$1.KillMode, config_parse_kill_mode, 0, offsetof($1, kill_context.kill_mode) +$1.KillSignal, config_parse_signal, 0, offsetof($1, kill_context.kill_signal) +$1.RestartKillSignal, config_parse_signal, 0, offsetof($1, kill_context.restart_kill_signal) +$1.FinalKillSignal, config_parse_signal, 0, offsetof($1, kill_context.final_kill_signal) +$1.WatchdogSignal, config_parse_signal, 0, offsetof($1, kill_context.watchdog_signal)' +)m4_dnl +m4_define(`CGROUP_CONTEXT_CONFIG_ITEMS', +`$1.Slice, config_parse_unit_slice, 0, 0 +$1.AllowedCPUs, config_parse_allowed_cpus, 0, offsetof($1, cgroup_context) +$1.AllowedMemoryNodes, config_parse_allowed_mems, 0, offsetof($1, cgroup_context) +$1.CPUAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.cpu_accounting) +$1.CPUWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.cpu_weight) +$1.StartupCPUWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.startup_cpu_weight) +$1.CPUShares, 
config_parse_cpu_shares, 0, offsetof($1, cgroup_context.cpu_shares) +$1.StartupCPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.startup_cpu_shares) +$1.CPUQuota, config_parse_cpu_quota, 0, offsetof($1, cgroup_context) +$1.CPUQuotaPeriodSec, config_parse_sec_def_infinity, 0, offsetof($1, cgroup_context.cpu_quota_period_usec) +$1.MemoryAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.memory_accounting) +$1.MemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.DefaultMemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.DefaultMemoryLow, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryLow, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryHigh, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryMax, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemorySwapMax, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.MemoryLimit, config_parse_memory_limit, 0, offsetof($1, cgroup_context) +$1.DeviceAllow, config_parse_device_allow, 0, offsetof($1, cgroup_context) +$1.DevicePolicy, config_parse_device_policy, 0, offsetof($1, cgroup_context.device_policy) +$1.IOAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.io_accounting) +$1.IOWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.io_weight) +$1.StartupIOWeight, config_parse_cg_weight, 0, offsetof($1, cgroup_context.startup_io_weight) +$1.IODeviceWeight, config_parse_io_device_weight, 0, offsetof($1, cgroup_context) +$1.IOReadBandwidthMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IOWriteBandwidthMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IOReadIOPSMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IOWriteIOPSMax, config_parse_io_limit, 0, offsetof($1, cgroup_context) +$1.IODeviceLatencyTargetSec, config_parse_io_device_latency, 0, offsetof($1, cgroup_context) +$1.BlockIOAccounting, 
config_parse_bool, 0, offsetof($1, cgroup_context.blockio_accounting) +$1.BlockIOWeight, config_parse_blockio_weight, 0, offsetof($1, cgroup_context.blockio_weight) +$1.StartupBlockIOWeight, config_parse_blockio_weight, 0, offsetof($1, cgroup_context.startup_blockio_weight) +$1.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof($1, cgroup_context) +$1.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) +$1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) +$1.TasksAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.tasks_accounting) +$1.TasksMax, config_parse_tasks_max, 0, offsetof($1, cgroup_context.tasks_max) +$1.Delegate, config_parse_delegate, 0, offsetof($1, cgroup_context) +$1.DisableControllers, config_parse_disable_controllers, 0, offsetof($1, cgroup_context) +$1.IPAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.ip_accounting) +$1.IPAddressAllow, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_allow) +$1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny) +$1.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_ingress) +$1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_egress) +$1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap) +$1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) +$1.ManagedOOMMemoryPressureLimitPercent, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit) +$1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' +)m4_dnl +Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation) +Unit.SourcePath, 
config_parse_unit_path_printf, 0, offsetof(Unit, source_path) +Unit.Requires, config_parse_unit_deps, UNIT_REQUIRES, 0 +Unit.Requisite, config_parse_unit_deps, UNIT_REQUISITE, 0 +Unit.Wants, config_parse_unit_deps, UNIT_WANTS, 0 +Unit.BindsTo, config_parse_unit_deps, UNIT_BINDS_TO, 0 +Unit.BindTo, config_parse_unit_deps, UNIT_BINDS_TO, 0 +Unit.Conflicts, config_parse_unit_deps, UNIT_CONFLICTS, 0 +Unit.Before, config_parse_unit_deps, UNIT_BEFORE, 0 +Unit.After, config_parse_unit_deps, UNIT_AFTER, 0 +Unit.OnFailure, config_parse_unit_deps, UNIT_ON_FAILURE, 0 +Unit.PropagatesReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0 +Unit.PropagateReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0 +Unit.ReloadPropagatedFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0 +Unit.PropagateReloadFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0 +Unit.PartOf, config_parse_unit_deps, UNIT_PART_OF, 0 +Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0 +Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0 +Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0 +Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0 +Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded) +Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start) +Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop) +Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate) +Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies) +Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode) +m4_dnl The following is a legacy alias name for compatibility +Unit.OnFailureIsolate, config_parse_job_mode_isolate, 0, offsetof(Unit, on_failure_job_mode) +Unit.IgnoreOnIsolate, config_parse_bool, 0, offsetof(Unit, ignore_on_isolate) +Unit.IgnoreOnSnapshot, 
config_parse_warn_compat, DISABLED_LEGACY, 0 +Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0 +Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0 +Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action) +Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg) +Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) +m4_dnl The following is a legacy alias name for compatibility +Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) +Unit.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst) +Unit.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action) +Unit.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action) +Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action) +Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status) +Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status) +Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions) +Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions) +Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions) +Unit.ConditionPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, conditions) +Unit.ConditionPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, conditions) +Unit.ConditionPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, conditions) 
+Unit.ConditionPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, conditions) +Unit.ConditionDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, conditions) +Unit.ConditionFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, conditions) +Unit.ConditionFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, conditions) +Unit.ConditionNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, conditions) +Unit.ConditionFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, conditions) +Unit.ConditionArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, conditions) +Unit.ConditionVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, conditions) +Unit.ConditionHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, conditions) +Unit.ConditionKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, conditions) +Unit.ConditionKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, conditions) +Unit.ConditionSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, conditions) +Unit.ConditionCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, conditions) +Unit.ConditionACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, conditions) +Unit.ConditionMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, conditions) +Unit.ConditionCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, conditions) +Unit.ConditionEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, conditions) +Unit.ConditionUser, config_parse_unit_condition_string, 
CONDITION_USER, offsetof(Unit, conditions) +Unit.ConditionGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, conditions) +Unit.ConditionControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, conditions) +Unit.AssertPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, asserts) +Unit.AssertPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, asserts) +Unit.AssertPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, asserts) +Unit.AssertPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, asserts) +Unit.AssertPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, asserts) +Unit.AssertPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, asserts) +Unit.AssertPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, asserts) +Unit.AssertDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, asserts) +Unit.AssertFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, asserts) +Unit.AssertFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, asserts) +Unit.AssertNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, asserts) +Unit.AssertFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, asserts) +Unit.AssertArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, asserts) +Unit.AssertVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, asserts) +Unit.AssertHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, asserts) 
+Unit.AssertKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, asserts) +Unit.AssertKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, asserts) +Unit.AssertSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, asserts) +Unit.AssertCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, asserts) +Unit.AssertACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, asserts) +Unit.AssertMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, asserts) +Unit.AssertCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, asserts) +Unit.AssertEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, asserts) +Unit.AssertUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, asserts) +Unit.AssertGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, asserts) +Unit.AssertControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, asserts) +Unit.CollectMode, config_parse_collect_mode, 0, offsetof(Unit, collect_mode) +m4_dnl +Service.PIDFile, config_parse_pid_file, 0, offsetof(Service, pid_file) +Service.ExecCondition, config_parse_exec, SERVICE_EXEC_CONDITION, offsetof(Service, exec_command) +Service.ExecStartPre, config_parse_exec, SERVICE_EXEC_START_PRE, offsetof(Service, exec_command) +Service.ExecStart, config_parse_exec, SERVICE_EXEC_START, offsetof(Service, exec_command) +Service.ExecStartPost, config_parse_exec, SERVICE_EXEC_START_POST, offsetof(Service, exec_command) +Service.ExecReload, config_parse_exec, SERVICE_EXEC_RELOAD, offsetof(Service, exec_command) +Service.ExecStop, config_parse_exec, SERVICE_EXEC_STOP, offsetof(Service, exec_command) +Service.ExecStopPost, config_parse_exec, SERVICE_EXEC_STOP_POST, offsetof(Service, exec_command) 
+Service.RestartSec, config_parse_sec, 0, offsetof(Service, restart_usec) +Service.TimeoutSec, config_parse_service_timeout, 0, 0 +Service.TimeoutStartSec, config_parse_service_timeout, 0, 0 +Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec) +Service.TimeoutAbortSec, config_parse_service_timeout_abort, 0, 0 +Service.TimeoutStartFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_start_failure_mode) +Service.TimeoutStopFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_stop_failure_mode) +Service.RuntimeMaxSec, config_parse_sec, 0, offsetof(Service, runtime_max_usec) +Service.WatchdogSec, config_parse_sec, 0, offsetof(Service, watchdog_usec) +m4_dnl The following five only exist for compatibility, they moved into Unit, see above +Service.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) +Service.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst) +Service.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action) +Service.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action) +Service.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Service.Type, config_parse_service_type, 0, offsetof(Service, type) +Service.Restart, config_parse_service_restart, 0, offsetof(Service, restart) +Service.PermissionsStartOnly, config_parse_bool, 0, offsetof(Service, permissions_start_only) +Service.RootDirectoryStartOnly, config_parse_bool, 0, offsetof(Service, root_directory_start_only) +Service.RemainAfterExit, config_parse_bool, 0, offsetof(Service, remain_after_exit) +Service.GuessMainPID, config_parse_bool, 0, offsetof(Service, guess_main_pid) +Service.RestartPreventExitStatus, config_parse_set_status, 0, offsetof(Service, restart_prevent_status) +Service.RestartForceExitStatus, config_parse_set_status, 0, offsetof(Service, 
restart_force_status) +Service.SuccessExitStatus, config_parse_set_status, 0, offsetof(Service, success_status) +Service.SysVStartPriority, config_parse_warn_compat, DISABLED_LEGACY, 0 +Service.NonBlocking, config_parse_bool, 0, offsetof(Service, exec_context.non_blocking) +Service.BusName, config_parse_bus_name, 0, offsetof(Service, bus_name) +Service.FileDescriptorStoreMax, config_parse_unsigned, 0, offsetof(Service, n_fd_store_max) +Service.NotifyAccess, config_parse_notify_access, 0, offsetof(Service, notify_access) +Service.Sockets, config_parse_service_sockets, 0, 0 +Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0 +Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors) +Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings) +Service.OOMPolicy, config_parse_oom_policy, 0, offsetof(Service, oom_policy) +EXEC_CONTEXT_CONFIG_ITEMS(Service)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Service)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Service)m4_dnl +m4_dnl +Socket.ListenStream, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenDatagram, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenSequentialPacket, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenFIFO, config_parse_socket_listen, SOCKET_FIFO, 0 +Socket.ListenNetlink, config_parse_socket_listen, SOCKET_SOCKET, 0 +Socket.ListenSpecial, config_parse_socket_listen, SOCKET_SPECIAL, 0 +Socket.ListenMessageQueue, config_parse_socket_listen, SOCKET_MQUEUE, 0 +Socket.ListenUSBFunction, config_parse_socket_listen, SOCKET_USB_FUNCTION, 0 +Socket.SocketProtocol, config_parse_socket_protocol, 0, offsetof(Socket, socket_protocol) +Socket.BindIPv6Only, config_parse_socket_bind, 0, offsetof(Socket, bind_ipv6_only) +Socket.Backlog, config_parse_unsigned, 0, offsetof(Socket, backlog) +Socket.BindToDevice, config_parse_socket_bindtodevice, 0, 0 +Socket.ExecStartPre, config_parse_exec, 
SOCKET_EXEC_START_PRE, offsetof(Socket, exec_command) +Socket.ExecStartPost, config_parse_exec, SOCKET_EXEC_START_POST, offsetof(Socket, exec_command) +Socket.ExecStopPre, config_parse_exec, SOCKET_EXEC_STOP_PRE, offsetof(Socket, exec_command) +Socket.ExecStopPost, config_parse_exec, SOCKET_EXEC_STOP_POST, offsetof(Socket, exec_command) +Socket.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Socket, timeout_usec) +Socket.SocketUser, config_parse_user_group_compat, 0, offsetof(Socket, user) +Socket.SocketGroup, config_parse_user_group_compat, 0, offsetof(Socket, group) +Socket.SocketMode, config_parse_mode, 0, offsetof(Socket, socket_mode) +Socket.DirectoryMode, config_parse_mode, 0, offsetof(Socket, directory_mode) +Socket.Accept, config_parse_bool, 0, offsetof(Socket, accept) +Socket.FlushPending, config_parse_bool, 0, offsetof(Socket, flush_pending) +Socket.Writable, config_parse_bool, 0, offsetof(Socket, writable) +Socket.MaxConnections, config_parse_unsigned, 0, offsetof(Socket, max_connections) +Socket.MaxConnectionsPerSource, config_parse_unsigned, 0, offsetof(Socket, max_connections_per_source) +Socket.KeepAlive, config_parse_bool, 0, offsetof(Socket, keep_alive) +Socket.KeepAliveTimeSec, config_parse_sec, 0, offsetof(Socket, keep_alive_time) +Socket.KeepAliveIntervalSec, config_parse_sec, 0, offsetof(Socket, keep_alive_interval) +Socket.KeepAliveProbes, config_parse_unsigned, 0, offsetof(Socket, keep_alive_cnt) +Socket.DeferAcceptSec, config_parse_sec, 0, offsetof(Socket, defer_accept) +Socket.NoDelay, config_parse_bool, 0, offsetof(Socket, no_delay) +Socket.Priority, config_parse_int, 0, offsetof(Socket, priority) +Socket.ReceiveBuffer, config_parse_iec_size, 0, offsetof(Socket, receive_buffer) +Socket.SendBuffer, config_parse_iec_size, 0, offsetof(Socket, send_buffer) +Socket.IPTOS, config_parse_ip_tos, 0, offsetof(Socket, ip_tos) +Socket.IPTTL, config_parse_int, 0, offsetof(Socket, ip_ttl) +Socket.Mark, config_parse_int, 0, offsetof(Socket, mark) 
+Socket.PipeSize, config_parse_iec_size, 0, offsetof(Socket, pipe_size) +Socket.FreeBind, config_parse_bool, 0, offsetof(Socket, free_bind) +Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent) +Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast) +Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred) +Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec) +Socket.PassPacketInfo, config_parse_bool, 0, offsetof(Socket, pass_pktinfo) +Socket.Timestamping, config_parse_socket_timestamping, 0, offsetof(Socket, timestamping) +Socket.TCPCongestion, config_parse_string, 0, offsetof(Socket, tcp_congestion) +Socket.ReusePort, config_parse_bool, 0, offsetof(Socket, reuse_port) +Socket.MessageQueueMaxMessages, config_parse_long, 0, offsetof(Socket, mq_maxmsg) +Socket.MessageQueueMessageSize, config_parse_long, 0, offsetof(Socket, mq_msgsize) +Socket.RemoveOnStop, config_parse_bool, 0, offsetof(Socket, remove_on_stop) +Socket.Symlinks, config_parse_unit_path_strv_printf, 0, offsetof(Socket, symlinks) +Socket.FileDescriptorName, config_parse_fdname, 0, 0 +Socket.Service, config_parse_socket_service, 0, 0 +Socket.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, trigger_limit.interval) +Socket.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Socket, trigger_limit.burst) +m4_ifdef(`ENABLE_SMACK', +`Socket.SmackLabel, config_parse_unit_string_printf, 0, offsetof(Socket, smack) +Socket.SmackLabelIPIn, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_in) +Socket.SmackLabelIPOut, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_out)', +`Socket.SmackLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +Socket.SmackLabelIPIn, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +Socket.SmackLabelIPOut, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +m4_ifdef(`HAVE_SELINUX', +`Socket.SELinuxContextFromNet, config_parse_bool, 0, offsetof(Socket, 
selinux_context_from_net)', +`Socket.SELinuxContextFromNet, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +EXEC_CONTEXT_CONFIG_ITEMS(Socket)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Socket)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Socket)m4_dnl +m4_dnl +Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what) +Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where) +Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options) +Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype) +Mount.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Mount, timeout_usec) +Mount.DirectoryMode, config_parse_mode, 0, offsetof(Mount, directory_mode) +Mount.SloppyOptions, config_parse_bool, 0, offsetof(Mount, sloppy_options) +Mount.LazyUnmount, config_parse_bool, 0, offsetof(Mount, lazy_unmount) +Mount.ForceUnmount, config_parse_bool, 0, offsetof(Mount, force_unmount) +Mount.ReadWriteOnly, config_parse_bool, 0, offsetof(Mount, read_write_only) +EXEC_CONTEXT_CONFIG_ITEMS(Mount)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Mount)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Mount)m4_dnl +m4_dnl +Automount.Where, config_parse_unit_path_printf, 0, offsetof(Automount, where) +Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode) +Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec) +m4_dnl +Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what) +Swap.Priority, config_parse_swap_priority, 0, 0 +Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options) +Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec) +EXEC_CONTEXT_CONFIG_ITEMS(Swap)m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Swap)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Swap)m4_dnl +m4_dnl +Timer.OnCalendar, config_parse_timer, TIMER_CALENDAR, 0 +Timer.OnActiveSec, config_parse_timer, TIMER_ACTIVE, 0 
+Timer.OnBootSec, config_parse_timer, TIMER_BOOT, 0 +Timer.OnStartupSec, config_parse_timer, TIMER_STARTUP, 0 +Timer.OnUnitActiveSec, config_parse_timer, TIMER_UNIT_ACTIVE, 0 +Timer.OnUnitInactiveSec, config_parse_timer, TIMER_UNIT_INACTIVE, 0 +Timer.OnClockChange, config_parse_bool, 0, offsetof(Timer, on_clock_change) +Timer.OnTimezoneChange, config_parse_bool, 0, offsetof(Timer, on_timezone_change) +Timer.Persistent, config_parse_bool, 0, offsetof(Timer, persistent) +Timer.WakeSystem, config_parse_bool, 0, offsetof(Timer, wake_system) +Timer.RemainAfterElapse, config_parse_bool, 0, offsetof(Timer, remain_after_elapse) +Timer.FixedRandomDelay, config_parse_bool, 0, offsetof(Timer, fixed_random_delay) +Timer.AccuracySec, config_parse_sec, 0, offsetof(Timer, accuracy_usec) +Timer.RandomizedDelaySec, config_parse_sec, 0, offsetof(Timer, random_usec) +Timer.Unit, config_parse_trigger_unit, 0, 0 +m4_dnl +Path.PathExists, config_parse_path_spec, 0, 0 +Path.PathExistsGlob, config_parse_path_spec, 0, 0 +Path.PathChanged, config_parse_path_spec, 0, 0 +Path.PathModified, config_parse_path_spec, 0, 0 +Path.DirectoryNotEmpty, config_parse_path_spec, 0, 0 +Path.Unit, config_parse_trigger_unit, 0, 0 +Path.MakeDirectory, config_parse_bool, 0, offsetof(Path, make_directory) +Path.DirectoryMode, config_parse_mode, 0, offsetof(Path, directory_mode) +m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Slice)m4_dnl +m4_dnl +CGROUP_CONTEXT_CONFIG_ITEMS(Scope)m4_dnl +KILL_CONTEXT_CONFIG_ITEMS(Scope)m4_dnl +Scope.RuntimeMaxSec, config_parse_sec, 0, offsetof(Scope, runtime_max_usec) +Scope.TimeoutStopSec, config_parse_sec, 0, offsetof(Scope, timeout_stop_usec) +m4_dnl The [Install] section is ignored here. 
+Install.Alias, NULL, 0, 0 +Install.WantedBy, NULL, 0, 0 +Install.RequiredBy, NULL, 0, 0 +Install.Also, NULL, 0, 0 +Install.DefaultInstance, NULL, 0, 0 diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c new file mode 100644 index 0000000..4964249 --- /dev/null +++ b/src/core/load-fragment.c @@ -0,0 +1,5851 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2012 Holger Hans Peter Freyther +***/ + +#include <errno.h> +#include <fcntl.h> +#include <linux/fs.h> +#include <linux/oom.h> +#if HAVE_SECCOMP +#include <seccomp.h> +#endif +#include <sched.h> +#include <sys/resource.h> + +#include "sd-messages.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "all-units.h" +#include "bpf-firewall.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cgroup-setup.h" +#include "conf-parser.h" +#include "core-varlink.h" +#include "cpu-set-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "ioprio.h" +#include "ip-protocol-list.h" +#include "journal-file.h" +#include "limits-util.h" +#include "load-fragment.h" +#include "log.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "securebits-util.h" +#include "signal-util.h" +#include "socket-netlink.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "time-util.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "user-util.h" +#include "utf8.h" +#include "web-util.h" + +static int parse_socket_protocol(const char *s) { + int r; + + r = parse_ip_protocol(s); + if (r < 0) + return r; + if (!IN_SET(r, IPPROTO_UDPLITE, IPPROTO_SCTP)) + return 
-EPROTONOSUPPORT; + + return r; +} + +int parse_crash_chvt(const char *value, int *data) { + int b; + + if (safe_atoi(value, data) >= 0) + return 0; + + b = parse_boolean(value); + if (b < 0) + return b; + + if (b > 0) + *data = 0; /* switch to where kmsg goes */ + else + *data = -1; /* turn off switching */ + + return 0; +} + +int parse_confirm_spawn(const char *value, char **console) { + char *s; + int r; + + r = value ? parse_boolean(value) : 1; + if (r == 0) { + *console = NULL; + return 0; + } else if (r > 0) /* on with default tty */ + s = strdup("/dev/console"); + else if (is_path(value)) /* on with fully qualified path */ + s = strdup(value); + else /* on with only a tty file name, not a fully qualified path */ + s = path_join("/dev/", value); + if (!s) + return -ENOMEM; + + *console = s; + return 0; +} + +DEFINE_CONFIG_PARSE(config_parse_socket_protocol, parse_socket_protocol, "Failed to parse socket protocol"); +DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Failed to parse secure bits"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access 
specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_home, protect_home, ProtectHome, "Failed to parse protect home value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_system, protect_system, ProtectSystem, "Failed to parse protect system value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_runtime_preserve_mode, exec_preserve_mode, ExecPreserveMode, "Failed to parse runtime directory preserve mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy"); +DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); +DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); +DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); +DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares"); +DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_flags, mount_propagation_flags_from_string, unsigned long, "Failed to parse mount flag"); +DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat, "Failed to parse status unit format"); +DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, 
socket_timestamping_from_string_harder, SocketTimestamping, "Failed to parse timestamping precision"); + +int config_parse_unit_deps( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + UnitDependency d = ltype; + Unit *u = userdata; + + assert(filename); + assert(lvalue); + assert(rvalue); + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + int r; + + r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_name_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + r = unit_add_dependency_by_name(u, d, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + } +} + +int config_parse_obsolete_unit_deps( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit dependency type %s= is obsolete, replacing by %s=, please update your unit file", lvalue, unit_dependency_to_string(ltype)); + + return config_parse_unit_deps(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_unit_string_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = 
NULL; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_strv_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + return config_parse_strv(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_path_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + const Unit *u = userdata; + int r; + bool fatal = ltype; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + /* Let's not bother with anything that is too long */ + if (strlen(rvalue) >= PATH_MAX) { + log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, 0, + "%s value too long%s.", + lvalue, fatal ? "" : ", ignoring"); + return fatal ? -ENAMETOOLONG : 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, fatal ? 
"" : ", ignoring"); + return fatal ? -ENOEXEC : 0; + } + + return config_parse_path(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_unit_path_strv_printf( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***x = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + *x = strv_free(*x); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_consume(x, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } +} + +static int patch_var_run( + const char *unit, + const char *filename, + unsigned line, + const char *lvalue, + char **path) { + + const char *e; + char *z; + + e = path_startswith(*path, "/var/run/"); + if (!e) + return 0; + + z = path_join("/run/", e); + if (!z) + return log_oom(); + + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "%s= references a path below legacy directory /var/run/, updating %s → %s; " + "please update the unit file accordingly.", lvalue, *path, z); + + free_and_replace(*path, z); + + return 1; +} + +int config_parse_socket_listen( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + 
const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ SocketPort *p = NULL; + SocketPort *tail; + Socket *s; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + s = SOCKET(data); + + if (isempty(rvalue)) { + /* An empty assignment removes all ports */ + socket_free_ports(s); + return 0; + } + + p = new0(SocketPort, 1); + if (!p) + return log_oom(); + + if (ltype != SOCKET_SOCKET) { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (ltype == SOCKET_FIFO) { + r = patch_var_run(unit, filename, line, lvalue, &k); + if (r < 0) + return r; + } + + free_and_replace(p->path, k); + p->type = ltype; + + } else if (streq(lvalue, "ListenNetlink")) { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = socket_address_parse_netlink(&p->address, k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k); + return 0; + } + + p->type = SOCKET_SOCKET; + + } else { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(UNIT(s), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */ + r = patch_var_run(unit, filename, line, lvalue, &k); + if (r < 0) + return r; + } + + r = socket_address_parse_and_warn(&p->address, k); + if (r < 0) { + if (r != -EAFNOSUPPORT) + log_syntax(unit, 
LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k); + return 0; + } + + if (streq(lvalue, "ListenStream")) + p->address.type = SOCK_STREAM; + else if (streq(lvalue, "ListenDatagram")) + p->address.type = SOCK_DGRAM; + else { + assert(streq(lvalue, "ListenSequentialPacket")); + p->address.type = SOCK_SEQPACKET; + } + + if (socket_address_family(&p->address) != AF_LOCAL && p->address.type == SOCK_SEQPACKET) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Address family not supported, ignoring: %s", rvalue); + return 0; + } + + p->type = SOCKET_SOCKET; + } + + p->fd = -1; + p->auxiliary_fds = NULL; + p->n_auxiliary_fds = 0; + p->socket = s; + + LIST_FIND_TAIL(port, s->ports, tail); + LIST_INSERT_AFTER(port, s->ports, tail, p); + + p = NULL; + + return 0; +} + +int config_parse_exec_nice( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int priority, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->nice_set = false; + return 0; + } + + r = parse_nice(rvalue, &priority); + if (r < 0) { + if (r == -ERANGE) + log_syntax(unit, LOG_WARNING, filename, line, r, "Nice priority out of range, ignoring: %s", rvalue); + else + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse nice priority '%s', ignoring: %m", rvalue); + return 0; + } + + c->nice = priority; + c->nice_set = true; + + return 0; +} + +int config_parse_exec_oom_score_adjust( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int oa, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + 
c->oom_score_adjust_set = false; + return 0; + } + + r = parse_oom_score_adjust(rvalue, &oa); + if (r < 0) { + if (r == -ERANGE) + log_syntax(unit, LOG_WARNING, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue); + else + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue); + return 0; + } + + c->oom_score_adjust = oa; + c->oom_score_adjust_set = true; + + return 0; +} + +/* Config parser callback for CoredumpFilter=. 'data' is the ExecContext to update. + * + * An empty rvalue resets the mask to 0 and clears coredump_filter_set. Otherwise the value is + * parsed with coredump_filter_mask_from_string() and OR-ed into the existing mask, so repeated + * assignments accumulate, and coredump_filter_set is raised. + * + * Always returns 0: a value that fails to parse is logged as a warning and ignored, leaving the + * context unchanged. */ +int config_parse_exec_coredump_filter( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->coredump_filter = 0; + c->coredump_filter_set = false; + return 0; + } + + uint64_t f; + r = coredump_filter_mask_from_string(rvalue, &f); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse the CoredumpFilter=%s, ignoring: %m", rvalue); + return 0; + } + + c->coredump_filter |= f; + /* Flag the coredump filter itself as configured; oom_score_adjust_set belongs to + * OOMScoreAdjust= and must not be touched here. */ + c->coredump_filter_set = true; + return 0; +} + +int config_parse_kill_mode( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + KillMode *k = data, m; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + *k = KILL_CONTROL_GROUP; + return 0; + } + + m = kill_mode_from_string(rvalue); + if (m < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse kill mode specification, ignoring: %s", rvalue); + return 0; + } + + if (m == KILL_NONE) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unit configured to use KillMode=none. 
" + "This is unsafe, as it disables systemd's process lifecycle management for the service. " + "Please update your service to use a safer KillMode=, such as 'mixed' or 'control-group'. " + "Support for KillMode=none is deprecated and will eventually be removed."); + + *k = m; + return 0; +} + +int config_parse_exec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecCommand **e = data; + const Unit *u = userdata; + const char *p; + bool semicolon; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(e); + + e += ltype; + + if (isempty(rvalue)) { + /* An empty assignment resets the list */ + *e = exec_command_free_list(*e); + return 0; + } + + p = rvalue; + do { + _cleanup_free_ char *path = NULL, *firstword = NULL; + ExecCommandFlags flags = 0; + bool ignore = false, separate_argv0 = false; + _cleanup_free_ ExecCommand *nce = NULL; + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0, nbufsize = 0; + const char *f; + + semicolon = false; + + r = extract_first_word_and_warn(&p, &firstword, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); + if (r <= 0) + return 0; + + /* A lone ";" is a separator. Let's make sure we don't treat it as an executable name. */ + if (streq(firstword, ";")) { + semicolon = true; + continue; + } + + f = firstword; + for (;;) { + /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't + * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of + * argv[0]; if it's prefixed with :, we will not do environment variable substitution; + * if it's prefixed with +, it will be run with full privileges and no sandboxing; if + * it's prefixed with '!' 
we apply sandboxing, but do not change user/group credentials; if + * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient + * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most + * other sandboxing, with some special exceptions for changing UID. + * + * The idea is that '!!' may be used to write services that can take benefit of systemd's + * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to + * privilege dropping within the daemon if the kernel does not offer that. */ + + if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) { + flags |= EXEC_COMMAND_IGNORE_FAILURE; + ignore = true; + } else if (*f == '@' && !separate_argv0) + separate_argv0 = true; + else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND)) + flags |= EXEC_COMMAND_NO_ENV_EXPAND; + else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) + flags |= EXEC_COMMAND_FULLY_PRIVILEGED; + else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) + flags |= EXEC_COMMAND_NO_SETUID; + else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) { + flags &= ~EXEC_COMMAND_NO_SETUID; + flags |= EXEC_COMMAND_AMBIENT_MAGIC; + } else + break; + f++; + } + + r = unit_full_printf(u, f, &path); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + f, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + if (isempty(path)) { + /* First word is either "-" or "@" with no command. */ + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Empty path in command line%s: '%s'", + ignore ? ", ignoring" : "", rvalue); + return ignore ? 0 : -ENOEXEC; + } + if (!string_is_safe(path)) { + log_syntax(unit, ignore ? 
LOG_WARNING : LOG_ERR, filename, line, 0, + "Executable name contains special characters%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + if (endswith(path, "/")) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Executable path specifies a directory%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + + if (!path_is_absolute(path) && !filename_is_valid(path)) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Neither a valid executable name nor an absolute path%s: %s", + ignore ? ", ignoring" : "", path); + return ignore ? 0 : -ENOEXEC; + } + + if (!separate_argv0) { + char *w = NULL; + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + w = strdup(path); + if (!w) + return log_oom(); + n[nlen++] = w; + n[nlen] = NULL; + } + + path_simplify(path, false); + + while (!isempty(p)) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + /* Check explicitly for an unquoted semicolon as + * command separator token. */ + if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) { + p++; + p += strspn(p, WHITESPACE); + semicolon = true; + break; + } + + /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc. + * extract_first_word() would return the same for all of those. */ + if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) { + char *w; + + p += 2; + p += strspn(p, WHITESPACE); + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + w = strdup(";"); + if (!w) + return log_oom(); + n[nlen++] = w; + n[nlen] = NULL; + continue; + } + + r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); + if (r == 0) + break; + if (r < 0) + return ignore ? 0 : -ENOEXEC; + + r = unit_full_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, ignore ? 
LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in %s%s: %m", + word, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(resolved); + n[nlen] = NULL; + } + + if (!n || !n[0]) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, + "Empty executable name or zeroeth argument%s: %s", + ignore ? ", ignoring" : "", rvalue); + return ignore ? 0 : -ENOEXEC; + } + + nce = new0(ExecCommand, 1); + if (!nce) + return log_oom(); + + nce->argv = TAKE_PTR(n); + nce->path = TAKE_PTR(path); + nce->flags = flags; + + exec_command_append_list(e, nce); + + /* Do not _cleanup_free_ these. */ + nce = NULL; + + rvalue = p; + } while (semicolon); + + return 0; +} + +int config_parse_socket_bindtodevice( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Socket *s = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue) || streq(rvalue, "*")) { + s->bind_to_device = mfree(s->bind_to_device); + return 0; + } + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", rvalue); + return 0; + } + + if (free_and_strdup(&s->bind_to_device, rvalue) < 0) + return log_oom(); + + return 0; +} + +int config_parse_exec_input( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + const char *n; + ExecInput ei; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + n = startswith(rvalue, "fd:"); + if (n) { + _cleanup_free_ char *resolved = NULL; + + r = 
unit_full_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n); + return 0; + } + + if (isempty(resolved)) + resolved = mfree(resolved); + else if (!fdname_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved); + return 0; + } + + free_and_replace(c->stdio_fdname[STDIN_FILENO], resolved); + + ei = EXEC_INPUT_NAMED_FD; + + } else if ((n = startswith(rvalue, "file:"))) { + _cleanup_free_ char *resolved = NULL; + + r = unit_full_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + free_and_replace(c->stdio_file[STDIN_FILENO], resolved); + + ei = EXEC_INPUT_FILE; + + } else { + ei = exec_input_from_string(rvalue); + if (ei < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse input specifier, ignoring: %s", rvalue); + return 0; + } + } + + c->std_input = ei; + return 0; +} + +int config_parse_exec_input_text( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *unescaped = NULL, *resolved = NULL; + ExecContext *c = data; + const Unit *u = userdata; + size_t sz; + void *p; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + return 0; + } + + r = cunescape(rvalue, 0, &unescaped); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to decode C escaped text '%s', 
ignoring: %m", rvalue); + return 0; + } + + r = unit_full_printf(u, unescaped, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", unescaped); + return 0; + } + + sz = strlen(resolved); + if (c->stdin_data_size + sz + 1 < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz + 1 > EXEC_STDIN_DATA_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Standard input data too large (%zu), maximum of %zu permitted, ignoring.", + c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); + return 0; + } + + p = realloc(c->stdin_data, c->stdin_data_size + sz + 1); + if (!p) + return log_oom(); + + *((char*) mempcpy((char*) p + c->stdin_data_size, resolved, sz)) = '\n'; + + c->stdin_data = p; + c->stdin_data_size += sz + 1; + + return 0; +} + +int config_parse_exec_input_data( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *p = NULL; + ExecContext *c = data; + size_t sz; + void *q; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->stdin_data = mfree(c->stdin_data); + c->stdin_data_size = 0; + return 0; + } + + r = unbase64mem(rvalue, (size_t) -1, &p, &sz); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to decode base64 data, ignoring: %s", rvalue); + return 0; + } + + assert(sz > 0); + + if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */ + c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Standard input data too large (%zu), maximum of %zu permitted, ignoring.", + c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); + return 0; + } + + q = realloc(c->stdin_data, 
c->stdin_data_size + sz); + if (!q) + return log_oom(); + + memcpy((uint8_t*) q + c->stdin_data_size, p, sz); + + c->stdin_data = q; + c->stdin_data_size += sz; + + return 0; +} + +int config_parse_exec_output( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + const char *n; + ExecContext *c = data; + const Unit *u = userdata; + bool obsolete = false; + ExecOutput eo; + int r; + + assert(data); + assert(filename); + assert(line); + assert(lvalue); + assert(rvalue); + + n = startswith(rvalue, "fd:"); + if (n) { + r = unit_full_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n); + return 0; + } + + if (isempty(resolved)) + resolved = mfree(resolved); + else if (!fdname_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved); + return 0; + } + + eo = EXEC_OUTPUT_NAMED_FD; + + } else if (streq(rvalue, "syslog")) { + eo = EXEC_OUTPUT_JOURNAL; + obsolete = true; + + } else if (streq(rvalue, "syslog+console")) { + eo = EXEC_OUTPUT_JOURNAL_AND_CONSOLE; + obsolete = true; + + } else if ((n = startswith(rvalue, "file:"))) { + + r = unit_full_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + eo = EXEC_OUTPUT_FILE; + + } else if ((n = startswith(rvalue, "append:"))) { + + r = unit_full_printf(u, n, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n); + return 0; + } + + r = 
path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue); + if (r < 0) + return 0; + + eo = EXEC_OUTPUT_FILE_APPEND; + } else { + eo = exec_output_from_string(rvalue); + if (eo < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse output specifier, ignoring: %s", rvalue); + return 0; + } + } + + if (obsolete) + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "Standard output type %s is obsolete, automatically updating to %s. Please update your unit file, and consider removing the setting altogether.", + rvalue, exec_output_to_string(eo)); + + if (streq(lvalue, "StandardOutput")) { + if (eo == EXEC_OUTPUT_NAMED_FD) + free_and_replace(c->stdio_fdname[STDOUT_FILENO], resolved); + else + free_and_replace(c->stdio_file[STDOUT_FILENO], resolved); + + c->std_output = eo; + + } else { + assert(streq(lvalue, "StandardError")); + + if (eo == EXEC_OUTPUT_NAMED_FD) + free_and_replace(c->stdio_fdname[STDERR_FILENO], resolved); + else + free_and_replace(c->stdio_file[STDERR_FILENO], resolved); + + c->std_error = eo; + } + + return 0; +} + +int config_parse_exec_io_class(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int x; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->ioprio_set = false; + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0); + return 0; + } + + x = ioprio_class_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse IO scheduling class, ignoring: %s", rvalue); + return 0; + } + + c->ioprio = IOPRIO_PRIO_VALUE(x, IOPRIO_PRIO_DATA(c->ioprio)); + c->ioprio_set = true; + + return 0; +} + +int config_parse_exec_io_priority(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned 
section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int i, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->ioprio_set = false; + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0); + return 0; + } + + r = ioprio_parse_priority(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse IO priority, ignoring: %s", rvalue); + return 0; + } + + c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_PRIO_CLASS(c->ioprio), i); + c->ioprio_set = true; + + return 0; +} + +int config_parse_exec_cpu_sched_policy(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int x; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->cpu_sched_set = false; + c->cpu_sched_policy = SCHED_OTHER; + c->cpu_sched_priority = 0; + return 0; + } + + x = sched_policy_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse CPU scheduling policy, ignoring: %s", rvalue); + return 0; + } + + c->cpu_sched_policy = x; + /* Moving to or from real-time policy? 
We need to adjust the priority */ + c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(x), sched_get_priority_max(x)); + c->cpu_sched_set = true; + + return 0; +} + +int config_parse_exec_mount_apivfs(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int k; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->mount_apivfs_set = false; + c->mount_apivfs = false; + return 0; + } + + k = parse_boolean(rvalue); + if (k < 0) { + log_syntax(unit, LOG_WARNING, filename, line, k, + "Failed to parse boolean value, ignoring: %s", + rvalue); + return 0; + } + + c->mount_apivfs_set = true; + c->mount_apivfs = k; + return 0; +} + +int config_parse_numa_mask(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + int r; + NUMAPolicy *p = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(rvalue, "all")) { + r = numa_mask_add_all(&p->nodes); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create NUMA mask representing \"all\" NUMA nodes, ignoring: %m"); + } else { + r = parse_cpu_set_extend(rvalue, &p->nodes, true, unit, filename, line, lvalue); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue); + } + + return 0; +} + +int config_parse_exec_cpu_sched_prio(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int i, min, max, r; + + assert(filename); + assert(lvalue); + 
assert(rvalue); + assert(data); + + r = safe_atoi(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CPU scheduling priority, ignoring: %s", rvalue); + return 0; + } + + /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0 */ + min = sched_get_priority_min(c->cpu_sched_policy); + max = sched_get_priority_max(c->cpu_sched_policy); + + if (i < min || i > max) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "CPU scheduling priority is out of range, ignoring: %s", rvalue); + return 0; + } + + c->cpu_sched_priority = i; + c->cpu_sched_set = true; + + return 0; +} + +int config_parse_root_image_options( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_strv_free_ char **l = NULL; + char **first = NULL, **second = NULL; + ExecContext *c = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->root_image_options = mount_options_free_all(c->root_image_options); + return 0; + } + + r = strv_split_colon_pairs(&l, rvalue); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + + STRV_FOREACH_PAIR(first, second, l) { + MountOptions *o = NULL; + _cleanup_free_ char *mount_options_resolved = NULL; + const char *mount_options = NULL, *partition = "root"; + PartitionDesignator partition_designator; + + /* Format is either 'root:foo' or 'foo' (root is implied) */ + if (!isempty(*second)) { + partition = *first; + mount_options = *second; + } else + mount_options = *first; + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) { + 
log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid partition name %s, ignoring", partition); + continue; + } + r = unit_full_printf(u, mount_options, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + + /* empty spaces/separators only */ + if (LIST_IS_EMPTY(options)) + c->root_image_options = mount_options_free_all(c->root_image_options); + else + LIST_JOIN(mount_options, c->root_image_options, options); + + return 0; +} + +int config_parse_exec_root_hash( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *roothash_decoded = NULL; + ExecContext *c = data; + size_t roothash_decoded_size = 0; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->root_hash_path = mfree(c->root_hash_path); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + return 0; + } + + if (path_is_absolute(rvalue)) { + /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */ + _cleanup_free_ char *p = NULL; + + p = strdup(rvalue); + if (!p) + return -ENOMEM; + + free_and_replace(c->root_hash_path, p); + c->root_hash = mfree(c->root_hash); + c->root_hash_size = 0; + return 0; + } + + /* We have a roothash to decode, eg: RootHash=012345789abcdef */ + r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to 
decode RootHash=, ignoring: %s", rvalue); + return 0; + } + if (roothash_decoded_size < sizeof(sd_id128_t)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "RootHash= is too short, ignoring: %s", rvalue); + return 0; + } + + free_and_replace(c->root_hash, roothash_decoded); + c->root_hash_size = roothash_decoded_size; + c->root_hash_path = mfree(c->root_hash_path); + + return 0; +} + +int config_parse_exec_root_hash_sig( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ void *roothash_sig_decoded = NULL; + char *value; + ExecContext *c = data; + size_t roothash_sig_decoded_size = 0; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + if (isempty(rvalue)) { + /* Reset if the empty string is assigned */ + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + return 0; + } + + if (path_is_absolute(rvalue)) { + /* We have the path to a roothash signature to load and decode, eg: RootHashSignature=/foo/bar.roothash.p7s */ + _cleanup_free_ char *p = NULL; + + p = strdup(rvalue); + if (!p) + return log_oom(); + + free_and_replace(c->root_hash_sig_path, p); + c->root_hash_sig = mfree(c->root_hash_sig); + c->root_hash_sig_size = 0; + return 0; + } + + if (!(value = startswith(rvalue, "base64:"))) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to decode RootHashSignature=, not a path but doesn't start with 'base64:', ignoring: %s", rvalue); + return 0; + } + + /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */ + r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHashSignature=, ignoring: %s", rvalue); + return 0; + } 
+ + free_and_replace(c->root_hash_sig, roothash_sig_decoded); + c->root_hash_sig_size = roothash_sig_decoded_size; + c->root_hash_sig_path = mfree(c->root_hash_sig_path); + + return 0; +} + +int config_parse_exec_cpu_affinity(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(rvalue, "numa")) { + c->cpu_affinity_from_numa = true; + cpu_set_reset(&c->cpu_set); + + return 0; + } + + r = parse_cpu_set_extend(rvalue, &c->cpu_set, true, unit, filename, line, lvalue); + if (r >= 0) + c->cpu_affinity_from_numa = false; + + return r; +} + +int config_parse_capability_set( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *capability_set = data; + uint64_t sum = 0, initial = 0; + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (streq(lvalue, "CapabilityBoundingSet")) + initial = CAP_ALL; /* initialized to all bits on */ + /* else "AmbientCapabilities" initialized to all bits off */ + + r = capability_set_from_string(rvalue, &sum); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= specifier '%s', ignoring: %m", lvalue, rvalue); + return 0; + } + + if (sum == 0 || *capability_set == initial) + /* "", "~" or uninitialized data -> replace */ + *capability_set = invert ? 
~sum : sum; + else { + /* previous data -> merge */ + if (invert) + *capability_set &= ~sum; + else + *capability_set |= sum; + } + + return 0; +} + +int config_parse_exec_selinux_context( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->selinux_context = mfree(c->selinux_context); + c->selinux_context_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->selinux_context, k); + c->selinux_context_ignore = ignore; + + return 0; +} + +int config_parse_exec_apparmor_profile( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->apparmor_profile = mfree(c->apparmor_profile); + c->apparmor_profile_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 
0 : -ENOEXEC; + } + + free_and_replace(c->apparmor_profile, k); + c->apparmor_profile_ignore = ignore; + + return 0; +} + +int config_parse_exec_smack_process_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + bool ignore; + char *k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + c->smack_process_label = mfree(c->smack_process_label); + c->smack_process_label_ignore = false; + return 0; + } + + if (rvalue[0] == '-') { + ignore = true; + rvalue++; + } else + ignore = false; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in '%s'%s: %m", + rvalue, ignore ? ", ignoring" : ""); + return ignore ? 0 : -ENOEXEC; + } + + free_and_replace(c->smack_process_label, k); + c->smack_process_label_ignore = ignore; + + return 0; +} + +int config_parse_timer( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + _cleanup_free_ char *k = NULL; + const Unit *u = userdata; + Timer *t = data; + usec_t usec = 0; + TimerValue *v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets list */ + timer_free_values(t); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (ltype == TIMER_CALENDAR) { + r = calendar_spec_from_string(k, &c); + if (r < 0) { + 
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse calendar specification, ignoring: %s", k); + return 0; + } + } else { + r = parse_sec(k, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", k); + return 0; + } + } + + v = new(TimerValue, 1); + if (!v) + return log_oom(); + + *v = (TimerValue) { + .base = ltype, + .value = usec, + .calendar_spec = TAKE_PTR(c), + }; + + LIST_PREPEND(value, t->values, v); + + return 0; +} + +int config_parse_trigger_unit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Unit *u = data; + UnitType type; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (!hashmap_isempty(u->dependencies[UNIT_TRIGGERS])) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Multiple units to trigger specified, ignoring: %s", rvalue); + return 0; + } + + r = unit_name_printf(u, rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + type = unit_name_to_type(p); + if (type < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit type not valid, ignoring: %s", rvalue); + return 0; + } + if (unit_has_name(u, p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Units cannot trigger themselves, ignoring: %s", rvalue); + return 0; + } + + r = unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_TRIGGERS, p, true, UNIT_DEPENDENCY_FILE); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add trigger on %s, ignoring: %m", p); + return 0; + } + + return 0; +} + +int config_parse_path_spec(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char 
*lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Path *p = data; + PathSpec *s; + PathType b; + _cleanup_free_ char *k = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment clears list */ + path_free_specs(p); + return 0; + } + + b = path_type_from_string(lvalue); + if (b < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse path type, ignoring: %s", lvalue); + return 0; + } + + r = unit_full_printf(UNIT(p), rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + s = new0(PathSpec, 1); + if (!s) + return log_oom(); + + s->unit = UNIT(p); + s->path = TAKE_PTR(k); + s->type = b; + s->inotify_fd = -1; + + LIST_PREPEND(spec, p->specs, s); + + return 0; +} + +int config_parse_socket_service( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *p = NULL; + Socket *s = data; + Unit *x; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = unit_name_printf(UNIT(s), rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!endswith(p, ".service")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type service, ignoring: %s", rvalue); + return 0; + } + + r = manager_load_unit(UNIT(s)->manager, p, NULL, &error, &x); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load unit %s, ignoring: 
%s", rvalue, bus_error_message(&error, r)); + return 0; + } + + unit_ref_set(&s->service, UNIT(s), x); + + return 0; +} + +int config_parse_fdname( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Socket *s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + s->fdname = mfree(s->fdname); + return 0; + } + + r = unit_full_printf(UNIT(s), rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (!fdname_is_valid(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", p); + return 0; + } + + return free_and_replace(s->fdname, p); +} + +int config_parse_service_sockets( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Trailing garbage in sockets, ignoring: %s", rvalue); + return 0; + } + + r = unit_name_printf(UNIT(s), word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + if (!endswith(k, ".socket")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type socket, ignoring: %s", k); + continue; + } + + r = 
unit_add_two_dependencies_by_name(UNIT(s), UNIT_WANTS, UNIT_AFTER, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + + r = unit_add_dependency_by_name(UNIT(s), UNIT_TRIGGERED_BY, k, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k); + } +} + +int config_parse_bus_name( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!sd_bus_service_name_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid bus name, ignoring: %s", k); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + +int config_parse_service_timeout( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = userdata; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(s); + + /* This is called for two cases: TimeoutSec= and TimeoutStartSec=. */ + + /* Traditionally, these options accepted 0 to disable the timeouts. However, a timeout of 0 suggests it happens + * immediately, hence fix this to become USEC_INFINITY instead. This is in-line with how we internally handle + * all other timeouts. 
*/ + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue); + return 0; + } + + s->start_timeout_defined = true; + s->timeout_start_usec = usec; + + if (streq(lvalue, "TimeoutSec")) + s->timeout_stop_usec = usec; + + return 0; +} + +int config_parse_timeout_abort( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t *ret = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(ret); + + /* Note: apart from setting the arg, this returns an extra bit of information in the return value. */ + + if (isempty(rvalue)) { + *ret = 0; + return 0; /* "not set" */ + } + + r = parse_sec(rvalue, ret); + if (r < 0) + return log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= setting, ignoring: %s", lvalue, rvalue); + + return 1; /* "set" */ +} + +int config_parse_service_timeout_abort( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Service *s = userdata; + int r; + + assert(s); + + r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue, + &s->timeout_abort_usec, s); + if (r >= 0) + s->timeout_abort_set = r; + return 0; +} + +int config_parse_sec_fix_0( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t *usec = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(usec); + + /* This is pretty much like config_parse_sec(), except that this treats a time of 0 as infinity, for + * compatibility 
with older versions of systemd where 0 instead of infinity was used as indicator to turn off a + * timeout. */ + + r = parse_sec_fix_0(rvalue, usec); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue); + + return 0; +} + +int config_parse_user_group_compat( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + char **user = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + *user = mfree(*user); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue); + return -ENOEXEC; + } + + if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k); + return -ENOEXEC; + } + + if (strstr(lvalue, "User") && streq(k, NOBODY_USER_NAME)) + log_struct(LOG_NOTICE, + "MESSAGE=%s:%u: Special user %s configured, this is not safe!", filename, line, k, + "UNIT=%s", unit, + "MESSAGE_ID=" SD_MESSAGE_NOBODY_USER_UNSUITABLE_STR, + "OFFENDING_USER=%s", k, + "CONFIG_FILE=%s", filename, + "CONFIG_LINE=%u", line); + + return free_and_replace(*user, k); +} + +int config_parse_user_group_strv_compat( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***users = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + *users = strv_free(*users); + return 0; + } + + for (const 
char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax: %s", rvalue); + return -ENOEXEC; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", word); + return -ENOEXEC; + } + + if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k); + return -ENOEXEC; + } + + r = strv_push(users, k); + if (r < 0) + return log_oom(); + + k = NULL; + } +} + +int config_parse_working_directory( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + bool missing_ok; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(c); + assert(u); + + if (isempty(rvalue)) { + c->working_directory_home = false; + c->working_directory = mfree(c->working_directory); + return 0; + } + + if (rvalue[0] == '-') { + missing_ok = true; + rvalue++; + } else + missing_ok = false; + + if (streq(rvalue, "~")) { + c->working_directory_home = true; + c->working_directory = mfree(c->working_directory); + } else { + _cleanup_free_ char *k = NULL; + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, missing_ok ? LOG_WARNING : LOG_ERR, filename, line, r, + "Failed to resolve unit specifiers in working directory path '%s'%s: %m", + rvalue, missing_ok ? ", ignoring" : ""); + return missing_ok ? 0 : -ENOEXEC; + } + + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 
0 : PATH_CHECK_FATAL), unit, filename, line, lvalue); + if (r < 0) + return missing_ok ? 0 : -ENOEXEC; + + c->working_directory_home = false; + free_and_replace(c->working_directory, k); + } + + c->working_directory_missing_ok = missing_ok; + return 0; +} + +int config_parse_unit_env_file(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***env = data; + const Unit *u = userdata; + _cleanup_free_ char *n = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment frees the list */ + *env = strv_free(*env); + return 0; + } + + r = unit_full_printf(u, rvalue, &n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(n[0] == '-' ? n + 1 : n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_push(env, n); + if (r < 0) + return log_oom(); + + n = NULL; + + return 0; +} + +int config_parse_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + char ***env = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *env = strv_free(*env); + return 0; + } + + for (const char *p = rvalue;; ) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + 
return 0; + } + + if (u) { + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_assignment_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid environment assignment, ignoring: %s", k); + continue; + } + + r = strv_env_replace(env, k); + if (r < 0) + return log_oom(); + + k = NULL; + } +} + +int config_parse_pass_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0, nbufsize = 0; + char*** passenv = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *passenv = strv_free(*passenv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + + if (u) { + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_name_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid environment name for %s, ignoring: %s", lvalue, k); + continue; + } + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(k); + n[nlen] = NULL; + } + + if (n) { + r = strv_extend_strv(passenv, n, true); + if (r < 0) + return r; + } + + 
return 0; +} + +int config_parse_unset_environ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **n = NULL; + size_t nlen = 0, nbufsize = 0; + char*** unsetenv = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *unsetenv = strv_free(*unsetenv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Trailing garbage in %s, ignoring: %s", lvalue, rvalue); + break; + } + + if (u) { + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + } else + k = TAKE_PTR(word); + + if (!env_assignment_is_valid(k) && !env_name_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid environment name or assignment %s, ignoring: %s", lvalue, k); + continue; + } + + if (!GREEDY_REALLOC(n, nbufsize, nlen + 2)) + return log_oom(); + + n[nlen++] = TAKE_PTR(k); + n[nlen] = NULL; + } + + if (n) { + r = strv_extend_strv(unsetenv, n, true); + if (r < 0) + return r; + } + + return 0; +} + +int config_parse_log_extra_fields( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(c); + + if (isempty(rvalue)) { 
+ exec_context_free_log_extra_fields(c); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + struct iovec *t; + const char *eq; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", word); + continue; + } + + eq = strchr(k, '='); + if (!eq) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field lacks '=' character, ignoring: %s", k); + continue; + } + + if (!journal_field_valid(k, eq-k, false)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field name is invalid, ignoring: %s", k); + continue; + } + + t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec)); + if (!t) + return log_oom(); + + c->log_extra_fields = t; + c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE_STRING(k); + + k = NULL; + } +} + +int config_parse_log_namespace( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + ExecContext *c = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(c); + + if (isempty(rvalue)) { + c->log_namespace = mfree(c->log_namespace); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + if (!log_namespace_name_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Specified log namespace name is not 
valid, ignoring: %s", k); + return 0; + } + + free_and_replace(c->log_namespace, k); + return 0; +} + +int config_parse_unit_condition_path( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *p = NULL; + Condition **list = data, *c; + ConditionType t = ltype; + bool trigger, negate; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = rvalue[0] == '|'; + if (trigger) + rvalue++; + + negate = rvalue[0] == '!'; + if (negate) + rvalue++; + + r = unit_full_printf(u, rvalue, &p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(p, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + c = condition_new(t, p, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_condition_string( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *s = NULL; + Condition **list = data, *c; + ConditionType t = ltype; + bool trigger, negate; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *list = condition_free_list(*list); + return 0; + } + + trigger = *rvalue == '|'; + if (trigger) + rvalue += 1 + strspn(rvalue + 1, WHITESPACE); + + negate = *rvalue == '!'; + if (negate) + rvalue += 1 + strspn(rvalue + 
1, WHITESPACE); + + r = unit_full_printf(u, rvalue, &s); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + c = condition_new(t, s, trigger, negate); + if (!c) + return log_oom(); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_unit_requires_mounts_for( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved); + continue; + } + } +} + +int config_parse_documentation(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = userdata; + int r; + char **a, **b; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + 
u->documentation = strv_free(u->documentation); + return 0; + } + + r = config_parse_unit_strv_printf(unit, filename, line, section, section_line, lvalue, ltype, + rvalue, data, userdata); + if (r < 0) + return r; + + for (a = b = u->documentation; a && *a; a++) { + + if (documentation_url_is_valid(*a)) + *(b++) = *a; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid URL, ignoring: %s", *a); + free(*a); + } + } + if (b) + *b = NULL; + + return r; +} + +#if HAVE_SECCOMP +int config_parse_syscall_filter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + _unused_ const Unit *u = userdata; + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->syscall_filter = hashmap_free(c->syscall_filter); + c->syscall_allow_list = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->syscall_filter) { + c->syscall_filter = hashmap_new(NULL); + if (!c->syscall_filter) + return log_oom(); + + if (invert) + /* Allow everything but the ones listed */ + c->syscall_allow_list = false; + else { + /* Allow nothing but the ones listed */ + c->syscall_allow_list = true; + + /* Accept default syscalls if we are on a allow_list */ + r = seccomp_parse_syscall_filter( + "@default", -1, c->syscall_filter, + SECCOMP_PARSE_PERMISSIVE|SECCOMP_PARSE_ALLOW_LIST, + unit, + NULL, 0); + if (r < 0) + return r; + } + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *name = NULL; + int num; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + 
r = parse_syscall_and_errno(word, &name, &num); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse syscall:errno, ignoring: %s", word); + continue; + } + + r = seccomp_parse_syscall_filter( + name, num, c->syscall_filter, + SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE| + (invert ? SECCOMP_PARSE_INVERT : 0)| + (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + unit, filename, line); + if (r < 0) + return r; + } +} + +int config_parse_syscall_log( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + _unused_ const Unit *u = userdata; + bool invert = false; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->syscall_log = hashmap_free(c->syscall_log); + c->syscall_log_allow_list = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->syscall_log) { + c->syscall_log = hashmap_new(NULL); + if (!c->syscall_log) + return log_oom(); + + if (invert) + /* Log everything but the ones listed */ + c->syscall_log_allow_list = false; + else + /* Log nothing but the ones listed */ + c->syscall_log_allow_list = true; + } + + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL, *name = NULL; + int num; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = parse_syscall_and_errno(word, &name, &num); + if (r < 0 || num >= 0) { /* errno code not allowed */ + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse syscall, ignoring: %s", word); + continue; + } + + r = seccomp_parse_syscall_filter( + name, 0, 
c->syscall_log, + SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE| + (invert ? SECCOMP_PARSE_INVERT : 0)| + (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0), + unit, filename, line); + if (r < 0) + return r; + } +} + +int config_parse_syscall_archs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **archs = data; + int r; + + if (isempty(rvalue)) { + *archs = set_free(*archs); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + uint32_t a; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = seccomp_arch_from_string(word, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse system call architecture \"%s\", ignoring: %m", word); + continue; + } + + r = set_ensure_put(archs, NULL, UINT32_TO_PTR(a + 1)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_syscall_errno( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + int e; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "kill")) { + /* Empty assignment resets to KILL */ + c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL; + return 0; + } + + e = parse_errno(rvalue); + if (e <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse error number, ignoring: %s", rvalue); + return 0; + } + + c->syscall_errno = e; + return 0; +} + +int config_parse_address_families( + const char *unit, + const char *filename, + unsigned line, + const 
char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + bool invert = false; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->address_families = set_free(c->address_families); + c->address_families_allow_list = false; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + if (!c->address_families) { + c->address_families = set_new(NULL); + if (!c->address_families) + return log_oom(); + + c->address_families_allow_list = !invert; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + int af; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + af = af_from_name(word); + if (af < 0) { + log_syntax(unit, LOG_WARNING, filename, line, af, + "Failed to parse address family, ignoring: %s", word); + continue; + } + + /* If we previously wanted to forbid an address family and now + * we want to allow it, then just remove it from the list. + */ + if (!invert == c->address_families_allow_list) { + r = set_put(c->address_families, INT_TO_PTR(af)); + if (r < 0) + return log_oom(); + } else + set_remove(c->address_families, INT_TO_PTR(af)); + } +} + +int config_parse_restrict_namespaces( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + unsigned long flags; + bool invert = false; + int r; + + if (isempty(rvalue)) { + /* Reset to the default. 
*/ + c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL; + return 0; + } + + /* Boolean parameter ignores the previous settings */ + r = parse_boolean(rvalue); + if (r > 0) { + c->restrict_namespaces = 0; + return 0; + } else if (r == 0) { + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; + return 0; + } + + if (rvalue[0] == '~') { + invert = true; + rvalue++; + } + + /* Not a boolean argument, in this case it's a list of namespace types. */ + r = namespace_flags_from_string(rvalue, &flags); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue); + return 0; + } + + if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL) + /* Initial assignment. Just set the value. */ + c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags; + else + /* Merge the value with the previous one. */ + SET_FLAG(c->restrict_namespaces, flags, !invert); + + return 0; +} +#endif + +int config_parse_unit_slice( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *k = NULL; + Unit *u = userdata, *slice; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = unit_name_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue); + return 0; + } + + r = manager_load_unit(u->manager, k, NULL, &error, &slice); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load slice unit %s, ignoring: %s", k, bus_error_message(&error, r)); + return 0; + } + + r = unit_set_slice(u, slice); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to assign slice %s to unit %s, ignoring: %m", slice->id, u->id); + return 
0; + } + + return 0; +} + +int config_parse_cpu_quota( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + c->cpu_quota_per_sec_usec = USEC_INFINITY; + return 0; + } + + r = parse_permille_unbounded(rvalue); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid CPU quota '%s', ignoring.", rvalue); + return 0; + } + + c->cpu_quota_per_sec_usec = ((usec_t) r * USEC_PER_SEC) / 1000U; + return 0; +} + +int config_parse_allowed_cpus( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + + (void) parse_cpu_set_extend(rvalue, &c->cpuset_cpus, true, unit, filename, line, lvalue); + + return 0; +} + +int config_parse_allowed_mems( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + + (void) parse_cpu_set_extend(rvalue, &c->cpuset_mems, true, unit, filename, line, lvalue); + + return 0; +} + +int config_parse_memory_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + uint64_t bytes = CGROUP_LIMIT_MAX; + int r; + + if (isempty(rvalue) && STR_IN_SET(lvalue, "DefaultMemoryLow", + "DefaultMemoryMin", + "MemoryLow", + "MemoryMin")) + bytes = CGROUP_LIMIT_MIN; + else if (!isempty(rvalue) && !streq(rvalue, "infinity")) { + + r = parse_permille(rvalue); + if (r 
< 0) { + r = parse_size(rvalue, 1024, &bytes); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid memory limit '%s', ignoring: %m", rvalue); + return 0; + } + } else + bytes = physical_memory_scale(r, 1000U); + + if (bytes >= UINT64_MAX || + (bytes <= 0 && !STR_IN_SET(lvalue, "MemorySwapMax", "MemoryLow", "MemoryMin", "DefaultMemoryLow", "DefaultMemoryMin"))) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Memory limit '%s' out of range, ignoring.", rvalue); + return 0; + } + } + + if (streq(lvalue, "DefaultMemoryLow")) { + c->default_memory_low = bytes; + c->default_memory_low_set = true; + } else if (streq(lvalue, "DefaultMemoryMin")) { + c->default_memory_min = bytes; + c->default_memory_min_set = true; + } else if (streq(lvalue, "MemoryMin")) { + c->memory_min = bytes; + c->memory_min_set = true; + } else if (streq(lvalue, "MemoryLow")) { + c->memory_low = bytes; + c->memory_low_set = true; + } else if (streq(lvalue, "MemoryHigh")) + c->memory_high = bytes; + else if (streq(lvalue, "MemoryMax")) + c->memory_max = bytes; + else if (streq(lvalue, "MemorySwapMax")) + c->memory_swap_max = bytes; + else if (streq(lvalue, "MemoryLimit")) + c->memory_limit = bytes; + else + return -EINVAL; + + return 0; +} + +int config_parse_tasks_max( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + TasksMax *tasks_max = data; + uint64_t v; + int r; + + if (isempty(rvalue)) { + *tasks_max = u ? 
u->manager->default_tasks_max : TASKS_MAX_UNSET; + return 0; + } + + if (streq(rvalue, "infinity")) { + *tasks_max = TASKS_MAX_UNSET; + return 0; + } + + r = parse_permille(rvalue); + if (r >= 0) + *tasks_max = (TasksMax) { r, 1000U }; /* r‰ */ + else { + r = safe_atou64(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue); + return 0; + } + + if (v <= 0 || v >= UINT64_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue); + return 0; + } + + *tasks_max = (TasksMax) { v }; + } + + return 0; +} + +int config_parse_delegate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_delegate) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Delegate= setting not supported for this unit type, ignoring."); + return 0; + } + + /* We either accept a boolean value, which may be used to turn on delegation for all controllers, or turn it + * off for all. Or it takes a list of controller names, in which case we add the specified controllers to the + * mask to delegate. */ + + if (isempty(rvalue)) { + /* An empty string resets controllers and set Delegate=yes. 
*/ + c->delegate = true; + c->delegate_controllers = 0; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + CGroupMask mask = 0; + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + CGroupController cc; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + cc = cgroup_controller_from_string(word); + if (cc < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid controller name '%s', ignoring", word); + continue; + } + + mask |= CGROUP_CONTROLLER_TO_MASK(cc); + } + + c->delegate = true; + c->delegate_controllers |= mask; + + } else if (r > 0) { + c->delegate = true; + c->delegate_controllers = _CGROUP_MASK_ALL; + } else { + c->delegate = false; + c->delegate_controllers = 0; + } + + return 0; +} + +int config_parse_managed_oom_mode( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + ManagedOOMMode *mode = data, m; + UnitType t; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *mode = MANAGED_OOM_AUTO; + return 0; + } + + m = managed_oom_mode_from_string(rvalue); + if (m < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + *mode = m; + return 0; +} + +int config_parse_managed_oom_mem_pressure_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + int 
*limit = data; + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *limit = 0; + return 0; + } + + r = parse_percent(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse limit percent value, ignoring: %s", rvalue); + return 0; + } + + *limit = r; + return 0; +} + +int config_parse_device_allow( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupContext *c = data; + const char *p = rvalue; + int r; + + if (isempty(rvalue)) { + while (c->device_allow) + cgroup_context_free_device_allow(c, c->device_allow); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device path and rights from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + if (!STARTSWITH_SET(resolved, "block-", "char-")) { + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (!valid_device_node_path(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid device node path '%s', ignoring.", resolved); + return 0; + } + } + + if (!isempty(p) && !in_charset(p, "rwm")) { + log_syntax(unit, 
LOG_WARNING, filename, line, 0, "Invalid device rights '%s', ignoring.", p); + return 0; + } + + return cgroup_add_device_allow(c, resolved, p); +} + +int config_parse_io_device_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceWeight *w; + CGroupContext *c = data; + const char *p = rvalue; + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + while (c->io_device_weights) + cgroup_context_free_io_device_weight(c, c->io_device_weights); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device path and weight from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cg_weight_parse(p, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "IO weight '%s' invalid, ignoring: %m", p); + return 0; + } + + assert(u != CGROUP_WEIGHT_INVALID); + + w = new0(CGroupIODeviceWeight, 1); + if (!w) + return log_oom(); + + w->path = TAKE_PTR(resolved); + w->weight = u; + + LIST_PREPEND(device_weights, c->io_device_weights, w); + return 0; +} + +int config_parse_io_device_latency( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char 
*lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceLatency *l; + CGroupContext *c = data; + const char *p = rvalue; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + while (c->io_device_latencies) + cgroup_context_free_io_device_latency(c, c->io_device_latencies); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device path and latency from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = parse_sec(p, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", p); + return 0; + } + + l = new0(CGroupIODeviceLatency, 1); + if (!l) + return log_oom(); + + l->path = TAKE_PTR(resolved); + l->target_usec = usec; + + LIST_PREPEND(device_latencies, c->io_device_latencies, l); + return 0; +} + +int config_parse_io_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupIODeviceLimit *l = NULL, *t; + CGroupContext *c = data; + CGroupIOLimitType type; + const char *p = rvalue; + uint64_t num; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); 
+ + type = cgroup_io_limit_type_from_string(lvalue); + assert(type >= 0); + + if (isempty(rvalue)) { + LIST_FOREACH(device_limits, l, c->io_device_limits) + l->limits[type] = cgroup_io_limit_defaults[type]; + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (streq("infinity", p)) + num = CGROUP_LIMIT_MAX; + else { + r = parse_size(p, 1000, &num); + if (r < 0 || num <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid IO limit '%s', ignoring.", p); + return 0; + } + } + + LIST_FOREACH(device_limits, t, c->io_device_limits) { + if (path_equal(resolved, t->path)) { + l = t; + break; + } + } + + if (!l) { + CGroupIOLimitType ttype; + + l = new0(CGroupIODeviceLimit, 1); + if (!l) + return log_oom(); + + l->path = TAKE_PTR(resolved); + for (ttype = 0; ttype < _CGROUP_IO_LIMIT_TYPE_MAX; ttype++) + l->limits[ttype] = cgroup_io_limit_defaults[ttype]; + + LIST_PREPEND(device_limits, c->io_device_limits, l); + } + + l->limits[type] = num; + + return 0; +} + +int config_parse_blockio_device_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupBlockIODeviceWeight *w; + CGroupContext 
*c = data; + const char *p = rvalue; + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + while (c->blockio_device_weights) + cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights); + + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device node and weight from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cg_blkio_weight_parse(p, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid block IO weight '%s', ignoring: %m", p); + return 0; + } + + assert(u != CGROUP_BLKIO_WEIGHT_INVALID); + + w = new0(CGroupBlockIODeviceWeight, 1); + if (!w) + return log_oom(); + + w->path = TAKE_PTR(resolved); + w->weight = u; + + LIST_PREPEND(device_weights, c->blockio_device_weights, w); + return 0; +} + +int config_parse_blockio_bandwidth( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *path = NULL, *resolved = NULL; + CGroupBlockIODeviceBandwidth *b = NULL, *t; + CGroupContext *c = data; + const char *p = rvalue; + uint64_t bytes; + bool read; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + read = streq("BlockIOReadBandwidth", lvalue); + + if (isempty(rvalue)) { + LIST_FOREACH(device_bandwidths, 
b, c->blockio_device_bandwidths) { + b->rbps = CGROUP_LIMIT_MAX; + b->wbps = CGROUP_LIMIT_MAX; + } + return 0; + } + + r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue); + return 0; + } + + r = unit_full_printf(userdata, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in '%s', ignoring: %m", path); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = parse_size(p, 1000, &bytes); + if (r < 0 || bytes <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid Block IO Bandwidth '%s', ignoring.", p); + return 0; + } + + LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) { + if (path_equal(resolved, t->path)) { + b = t; + break; + } + } + + if (!t) { + b = new0(CGroupBlockIODeviceBandwidth, 1); + if (!b) + return log_oom(); + + b->path = TAKE_PTR(resolved); + b->rbps = CGROUP_LIMIT_MAX; + b->wbps = CGROUP_LIMIT_MAX; + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, b); + } + + if (read) + b->rbps = bytes; + else + b->wbps = bytes; + + return 0; +} + +int config_parse_job_mode_isolate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + JobMode *m = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse boolean, ignoring: %s", rvalue); + return 0; + } + + log_notice("%s is deprecated. 
Please use OnFailureJobMode= instead", lvalue); + + *m = r ? JOB_ISOLATE : JOB_REPLACE; + return 0; +} + +int config_parse_exec_directories( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char***rt = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *rt = strv_free(*rt); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); + continue; + } + + r = path_simplify_and_warn(k, PATH_CHECK_RELATIVE, unit, filename, line, lvalue); + if (r < 0) + continue; + + if (path_startswith(k, "private")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s= path can't be 'private', ignoring assignment: %s", lvalue, word); + continue; + } + + r = strv_push(rt, k); + if (r < 0) + return log_oom(); + k = NULL; + } +} + +int config_parse_set_credential( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *word = NULL, *k = NULL, *unescaped = NULL; + ExecContext *context = data; + ExecSetCredential *old; + Unit *u = userdata; + const char *p; + int r, l; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(context); + + if (isempty(rvalue)) { + 
/* Empty assignment resets the list */ + context->set_credentials = hashmap_free(context->set_credentials); + return 0; + } + + p = rvalue; + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0 || !p) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); + return 0; + } + if (!credential_name_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k); + return 0; + } + + /* We support escape codes here, so that users can insert trailing \n if they like */ + l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped); + if (l < 0) { + log_syntax(unit, LOG_WARNING, filename, line, l, "Can't unescape \"%s\", ignoring: %m", p); + return 0; + } + + old = hashmap_get(context->set_credentials, k); + if (old) { + free_and_replace(old->data, unescaped); + old->size = l; + } else { + _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL; + + sc = new0(ExecSetCredential, 1); + if (!sc) + return log_oom(); + + sc->id = TAKE_PTR(k); + sc->data = TAKE_PTR(unescaped); + sc->size = l; + + r = hashmap_ensure_allocated(&context->set_credentials, &exec_set_credential_hash_ops); + if (r < 0) + return r; + + r = hashmap_put(context->set_credentials, sc->id, sc); + if (r < 0) + return log_oom(); + + TAKE_PTR(sc); + } + + return 0; +} + +int config_parse_load_credential( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *word = NULL, *k = NULL, *q = NULL; + ExecContext *context = data; + Unit *u = userdata; + const char *p; + int r; + + assert(filename); + 
assert(lvalue); + assert(rvalue); + assert(context); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + context->load_credentials = strv_free(context->load_credentials); + return 0; + } + + p = rvalue; + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + /* p may be NULL when rvalue carries no ':' separated source part (config_parse_set_credential() guards the same call the same way); refuse that here rather than handing NULL to unit_full_printf() and the "%s" log argument below. */ + if (r <= 0 || !p) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = unit_full_printf(u, word, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); + return 0; + } + if (!credential_name_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k); + return 0; + } + r = unit_full_printf(u, p, &q); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", p); + return 0; + } + if (path_is_absolute(q) ? 
!path_is_normalized(q) : !credential_name_valid(q)) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Credential source \"%s\" not valid, ignoring.", q); + return 0; + } + + r = strv_consume_pair(&context->load_credentials, TAKE_PTR(k), TAKE_PTR(q)); + if (r < 0) + return log_oom(); + + return 0; +} + +int config_parse_set_status( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExitStatusSet *status_set = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(status_set); + + /* Empty assignment resets the list */ + if (isempty(rvalue)) { + exit_status_set_free(status_set); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + Bitmap *bitmap; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=%s, ignoring: %m", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + /* We need to call exit_status_from_string() first, because we want + * to parse numbers as exit statuses, not signals. 
*/ + + r = exit_status_from_string(word); + if (r >= 0) { + assert(r >= 0 && r < 256); + bitmap = &status_set->status; + } else { + r = signal_from_string(word); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse value, ignoring: %s", word); + continue; + } + bitmap = &status_set->signal; + } + + r = bitmap_set(bitmap, r); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to set signal or status %s, ignoring: %m", word); + } +} + +/* Parses a list of absolute paths, each optionally prefixed with "-" and/or "+" (in that order), resolves unit specifiers in them and appends the simplified entries, with their prefixes restored, to the string vector passed in data. An empty value resets the list. Returns 0 on success and for ignored entries, or the log_oom() result on allocation failure. */ +int config_parse_namespace_path_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + char*** sv = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *sv = strv_free(*sv); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL; + const char *w; + bool ignore_enoent = false, shall_prefix = false; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + + w = word; + if (startswith(w, "-")) { + ignore_enoent = true; + w++; + } + if (startswith(w, "+")) { + shall_prefix = true; + w++; + } + + r = unit_full_printf(u, w, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", w); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + joined = strjoin(ignore_enoent ? "-" : "", + shall_prefix ? 
"+" : "", + resolved); + if (!joined) /* strjoin() could not allocate; report it here, since pushing a NULL value would otherwise lose this entry without any error */ + return log_oom(); + + r = strv_push(sv, joined); + if (r < 0) + return log_oom(); + + joined = NULL; + } + + return 0; +} + +int config_parse_temporary_filesystems( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = userdata; + ExecContext *c = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems); + c->temporary_filesystems = NULL; + c->n_temporary_filesystems = 0; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL, *path = NULL, *resolved = NULL; + const char *w; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue); + return 0; + } + + w = word; + r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", word); + continue; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", word); + continue; + } + + r = unit_full_printf(u, path, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", path); + continue; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, resolved, w); + if (r < 0) + return log_oom(); + } +} + +int 
config_parse_bind_paths( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + bind_mount_free_many(c->bind_mounts, c->n_bind_mounts); + c->bind_mounts = NULL; + c->n_bind_mounts = 0; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *source = NULL, *destination = NULL; + _cleanup_free_ char *sresolved = NULL, *dresolved = NULL; + char *s = NULL, *d = NULL; + bool rbind = true, ignore_enoent = false; + + r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + + r = unit_full_printf(u, source, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", source); + continue; + } + + s = sresolved; + if (s[0] == '-') { + ignore_enoent = true; + s++; + } + + r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + /* Optionally, the destination is specified. 
*/ + if (p && p[-1] == ':') { + r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing argument after ':', ignoring: %s", s); + continue; + } + + r = unit_full_printf(u, destination, &dresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in \"%s\", ignoring: %m", destination); + continue; + } + + r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + d = dresolved; + + /* Optionally, there's also a short option string specified */ + if (p && p[-1] == ':') { + _cleanup_free_ char *options = NULL; + + r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s: %s", lvalue, rvalue); + return 0; + } + + if (isempty(options) || streq(options, "rbind")) + rbind = true; + else if (streq(options, "norbind")) + rbind = false; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid option string, ignoring setting: %s", options); + continue; + } + } + } else + d = s; + + r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts, + &(BindMount) { + .source = s, + .destination = d, + .read_only = !!strstr(lvalue, "ReadOnly"), + .recursive = rbind, + .ignore_enoent = ignore_enoent, + }); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int config_parse_mount_images( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = data; + const Unit *u = userdata; + 
int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL; + _cleanup_free_ char *sresolved = NULL, *dresolved = NULL; + const char *q = NULL; + char *s = NULL; + bool permissive = false; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + q = tuple; + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax in %s=, ignoring: %s", lvalue, tuple); + return 0; + } + if (r == 0) + continue; + + r = unit_full_printf(u, first, &sresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit specifiers in \"%s\", ignoring: %m", first); + continue; + } + + s = sresolved; + if (s[0] == '-') { + permissive = true; + s++; + } + + r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + if (isempty(second)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing destination in %s, ignoring: %s", lvalue, rvalue); + continue; + } + + r = unit_full_printf(u, second, &dresolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve specifiers in \"%s\", ignoring: %m", second); + continue; + } + + r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r 
< 0) + continue; + + for (;;) { + _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL; + MountOptions *o = NULL; + PartitionDesignator partition_designator; + + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q); + return 0; + } + if (r == 0) + break; + /* Single set of options, applying to the root partition/single filesystem */ + if (r == 1) { + r = unit_full_printf(u, partition, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", first); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = PARTITION_ROOT, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, o); + + break; + } + + partition_designator = partition_designator_from_string(partition); + if (partition_designator < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid partition name %s, ignoring", partition); + continue; + } + r = unit_full_printf(u, mount_options, &mount_options_resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options); + continue; + } + + o = new(MountOptions, 1); + if (!o) + return log_oom(); + *o = (MountOptions) { + .partition_designator = partition_designator, + .options = TAKE_PTR(mount_options_resolved), + }; + LIST_APPEND(mount_options, options, o); + } + + r = mount_image_add(&c->mount_images, &c->n_mount_images, + &(MountImage) { + .source = s, + .destination = dresolved, + .mount_options = options, + .ignore_enoent = permissive, + }); + if (r < 0) + return log_oom(); + } +} + +int config_parse_job_timeout_sec( + const char* unit, 
+ const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = data; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobTimeoutSec= parameter, ignoring: %s", rvalue); + return 0; + } + + /* If the user explicitly changed JobTimeoutSec= also change JobRunningTimeoutSec=, for compatibility with old + * versions. If JobRunningTimeoutSec= was explicitly set, avoid this however as whatever the user picked should + * count. */ + + if (!u->job_running_timeout_set) + u->job_running_timeout = usec; + + u->job_timeout = usec; + + return 0; +} + +int config_parse_job_running_timeout_sec( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Unit *u = data; + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + r = parse_sec_fix_0(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobRunningTimeoutSec= parameter, ignoring: %s", rvalue); + return 0; + } + + u->job_running_timeout = usec; + u->job_running_timeout_set = true; + + return 0; +} + +int config_parse_emergency_action( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *m = NULL; + EmergencyAction *x = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (unit) + m = ((Unit*) userdata)->manager; + else + m = data; + + r = parse_emergency_action(rvalue, MANAGER_IS_SYSTEM(m), x); + if (r 
< 0) { + if (r == -EOPNOTSUPP && MANAGER_IS_USER(m)) { + /* Compat mode: remove for systemd 241. */ + + log_syntax(unit, LOG_INFO, filename, line, r, + "%s= in user mode specified as \"%s\", using \"exit-force\" instead.", + lvalue, rvalue); + *x = EMERGENCY_ACTION_EXIT_FORCE; + return 0; + } + + if (r == -EOPNOTSUPP) + log_syntax(unit, LOG_WARNING, filename, line, r, + "%s= specified as %s mode action, ignoring: %s", + lvalue, MANAGER_IS_SYSTEM(m) ? "user" : "system", rvalue); + else + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_pid_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL, *n = NULL; + const Unit *u = userdata; + char **s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(u); + + if (isempty(rvalue)) { + /* An empty assignment removes already set value. 
*/ + *s = mfree(*s); + return 0; + } + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + /* If this is a relative path make it absolute by prefixing the /run */ + n = path_make_absolute(k, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + if (!n) + return log_oom(); + + /* Check that the result is a sensible path */ + r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return r; + + r = patch_var_run(unit, filename, line, lvalue, &n); + if (r < 0) + return r; + + free_and_replace(*s, n); + return 0; +} + +int config_parse_exit_status( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *exit_status = data, r; + uint8_t u; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(exit_status); + + if (isempty(rvalue)) { + *exit_status = -1; + return 0; + } + + r = safe_atou8(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse exit status '%s', ignoring: %m", rvalue); + return 0; + } + + *exit_status = u; + return 0; +} + +int config_parse_disable_controllers( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + CGroupContext *c = data; + CGroupMask disabled_mask; + + /* 1. If empty, make all controllers eligible for use again. + * 2. If non-empty, merge all listed controllers, space separated. 
*/ + + if (isempty(rvalue)) { + c->disable_controllers = 0; + return 0; + } + + r = cg_mask_from_string(rvalue, &disabled_mask); + if (r < 0 || disabled_mask <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue); + return 0; + } + + c->disable_controllers |= disabled_mask; + + return 0; +} + +int config_parse_ip_filter_bpf_progs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + const Unit *u = userdata; + char ***paths = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(paths); + + if (isempty(rvalue)) { + *paths = strv_free(*paths); + return 0; + } + + r = unit_full_printf(u, rvalue, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (strv_contains(*paths, resolved)) + return 0; + + r = strv_extend(paths, resolved); + if (r < 0) + return log_oom(); + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) { + static bool warned = false; + + log_full(warned ? LOG_DEBUG : LOG_WARNING, + "File %s:%u configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters.\n" + "Starting this unit will fail! 
(This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue); + + warned = true; + } + + return 0; +} + +static int merge_by_names(Unit **u, Set *names, const char *id) { + char *k; + int r; + + assert(u); + assert(*u); + + /* Let's try to add in all names that are aliases of this unit */ + while ((k = set_steal_first(names))) { + _cleanup_free_ _unused_ char *free_k = k; + + /* First try to merge in the other name into our unit */ + r = unit_merge_by_name(*u, k); + if (r < 0) { + Unit *other; + + /* Hmm, we couldn't merge the other unit into ours? Then let's try it the other way + * round. */ + + other = manager_get_unit((*u)->manager, k); + if (!other) + return r; /* return previous failure */ + + r = unit_merge(other, *u); + if (r < 0) + return r; + + *u = other; + return merge_by_names(u, names, NULL); + } + + if (streq_ptr(id, k)) + unit_choose_id(*u, id); + } + + return 0; +} + +int unit_load_fragment(Unit *u) { + const char *fragment; + _cleanup_set_free_free_ Set *names = NULL; + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + assert(u->id); + + if (u->transient) { + u->load_state = UNIT_LOADED; + return 0; + } + + /* Possibly rebuild the fragment map to catch new units */ + r = unit_file_build_name_map(&u->manager->lookup_paths, + &u->manager->unit_cache_timestamp_hash, + &u->manager->unit_id_map, + &u->manager->unit_name_map, + &u->manager->unit_path_cache); + if (r < 0) + return log_error_errno(r, "Failed to rebuild name map: %m"); + + r = unit_file_find_fragment(u->manager->unit_id_map, + u->manager->unit_name_map, + u->id, + &fragment, + &names); + if (r < 0 && r != -ENOENT) + return r; + + if (fragment) { + /* Open the file, check if this is a mask, otherwise read. */ + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + + /* Try to open the file name. A symlink is OK, for example for linked files or masks. 
We + * expect that all symlinks within the lookup paths have been already resolved, but we don't + * verify this here. */ + f = fopen(fragment, "re"); + if (!f) + return log_unit_notice_errno(u, errno, "Failed to open %s: %m", fragment); + + if (fstat(fileno(f), &st) < 0) + return -errno; + + r = free_and_strdup(&u->fragment_path, fragment); + if (r < 0) + return r; + + if (null_or_empty(&st)) { + /* Unit file is masked */ + + u->load_state = u->perpetual ? UNIT_LOADED : UNIT_MASKED; /* don't allow perpetual units to ever be masked */ + u->fragment_mtime = 0; + } else { + u->load_state = UNIT_LOADED; + u->fragment_mtime = timespec_load(&st.st_mtim); + + /* Now, parse the file contents */ + r = config_parse(u->id, fragment, f, + UNIT_VTABLE(u)->sections, + config_item_perf_lookup, load_fragment_gperf_lookup, + 0, + u, + NULL); + if (r == -ENOEXEC) + log_unit_notice_errno(u, r, "Unit configuration has fatal error, unit will not be started."); + if (r < 0) + return r; + } + } + + /* We do the merge dance here because for some unit types, the unit might have aliases which are not + * declared in the file system. In particular, this is true (and frequent) for device and swap units. + */ + Unit *merged; + const char *id = u->id; + _cleanup_free_ char *free_id = NULL; + + if (fragment) { + id = basename(fragment); + if (unit_name_is_valid(id, UNIT_NAME_TEMPLATE)) { + assert(u->instance); /* If we're not trying to use a template for non-instanced unit, + * this must be set. 
*/ + + r = unit_name_replace_instance(id, u->instance, &free_id); + if (r < 0) + return log_debug_errno(r, "Failed to build id (%s + %s): %m", id, u->instance); + id = free_id; + } + } + + merged = u; + r = merge_by_names(&merged, names, id); + if (r < 0) + return r; + + if (merged != u) + u->load_state = UNIT_MERGED; + + return 0; +} + +void unit_dump_config_items(FILE *f) { + static const struct { + const ConfigParserCallback callback; + const char *rvalue; + } table[] = { + { config_parse_warn_compat, "NOTSUPPORTED" }, + { config_parse_int, "INTEGER" }, + { config_parse_unsigned, "UNSIGNED" }, + { config_parse_iec_size, "SIZE" }, + { config_parse_iec_uint64, "SIZE" }, + { config_parse_si_uint64, "SIZE" }, + { config_parse_bool, "BOOLEAN" }, + { config_parse_string, "STRING" }, + { config_parse_path, "PATH" }, + { config_parse_unit_path_printf, "PATH" }, + { config_parse_strv, "STRING [...]" }, + { config_parse_exec_nice, "NICE" }, + { config_parse_exec_oom_score_adjust, "OOMSCOREADJUST" }, + { config_parse_exec_io_class, "IOCLASS" }, + { config_parse_exec_io_priority, "IOPRIORITY" }, + { config_parse_exec_cpu_sched_policy, "CPUSCHEDPOLICY" }, + { config_parse_exec_cpu_sched_prio, "CPUSCHEDPRIO" }, + { config_parse_exec_cpu_affinity, "CPUAFFINITY" }, + { config_parse_mode, "MODE" }, + { config_parse_unit_env_file, "FILE" }, + { config_parse_exec_output, "OUTPUT" }, + { config_parse_exec_input, "INPUT" }, + { config_parse_log_facility, "FACILITY" }, + { config_parse_log_level, "LEVEL" }, + { config_parse_exec_secure_bits, "SECUREBITS" }, + { config_parse_capability_set, "BOUNDINGSET" }, + { config_parse_rlimit, "LIMIT" }, + { config_parse_unit_deps, "UNIT [...]" }, + { config_parse_exec, "PATH [ARGUMENT [...]]" }, + { config_parse_service_type, "SERVICETYPE" }, + { config_parse_service_restart, "SERVICERESTART" }, + { config_parse_service_timeout_failure_mode, "TIMEOUTMODE" }, + { config_parse_kill_mode, "KILLMODE" }, + { config_parse_signal, "SIGNAL" }, + { 
config_parse_socket_listen, "SOCKET [...]" }, + { config_parse_socket_bind, "SOCKETBIND" }, + { config_parse_socket_bindtodevice, "NETWORKINTERFACE" }, + { config_parse_sec, "SECONDS" }, + { config_parse_nsec, "NANOSECONDS" }, + { config_parse_namespace_path_strv, "PATH [...]" }, + { config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" }, + { config_parse_unit_requires_mounts_for, "PATH [...]" }, + { config_parse_exec_mount_flags, "MOUNTFLAG [...]" }, + { config_parse_unit_string_printf, "STRING" }, + { config_parse_trigger_unit, "UNIT" }, + { config_parse_timer, "TIMER" }, + { config_parse_path_spec, "PATH" }, + { config_parse_notify_access, "ACCESS" }, + { config_parse_ip_tos, "TOS" }, + { config_parse_unit_condition_path, "CONDITION" }, + { config_parse_unit_condition_string, "CONDITION" }, + { config_parse_unit_slice, "SLICE" }, + { config_parse_documentation, "URL" }, + { config_parse_service_timeout, "SECONDS" }, + { config_parse_emergency_action, "ACTION" }, + { config_parse_set_status, "STATUS" }, + { config_parse_service_sockets, "SOCKETS" }, + { config_parse_environ, "ENVIRON" }, +#if HAVE_SECCOMP + { config_parse_syscall_filter, "SYSCALLS" }, + { config_parse_syscall_archs, "ARCHS" }, + { config_parse_syscall_errno, "ERRNO" }, + { config_parse_syscall_log, "SYSCALLS" }, + { config_parse_address_families, "FAMILIES" }, + { config_parse_restrict_namespaces, "NAMESPACES" }, +#endif + { config_parse_cpu_shares, "SHARES" }, + { config_parse_cg_weight, "WEIGHT" }, + { config_parse_memory_limit, "LIMIT" }, + { config_parse_device_allow, "DEVICE" }, + { config_parse_device_policy, "POLICY" }, + { config_parse_io_limit, "LIMIT" }, + { config_parse_io_device_weight, "DEVICEWEIGHT" }, + { config_parse_io_device_latency, "DEVICELATENCY" }, + { config_parse_blockio_bandwidth, "BANDWIDTH" }, + { config_parse_blockio_weight, "WEIGHT" }, + { config_parse_blockio_device_weight, "DEVICEWEIGHT" }, + { config_parse_long, "LONG" }, + { config_parse_socket_service, 
"SERVICE" }, +#if HAVE_SELINUX + { config_parse_exec_selinux_context, "LABEL" }, +#endif + { config_parse_job_mode, "MODE" }, + { config_parse_job_mode_isolate, "BOOLEAN" }, + { config_parse_personality, "PERSONALITY" }, + }; + + const char *prev = NULL; + const char *i; + + assert(f); + + NULSTR_FOREACH(i, load_fragment_gperf_nulstr) { + const char *rvalue = "OTHER", *lvalue; + const ConfigPerfItem *p; + const char *dot; + + assert_se(p = load_fragment_gperf_lookup(i, strlen(i))); + + /* Hide legacy settings */ + if (p->parse == config_parse_warn_compat && + p->ltype == DISABLED_LEGACY) + continue; + + for (size_t j = 0; j < ELEMENTSOF(table); j++) + if (p->parse == table[j].callback) { + rvalue = table[j].rvalue; + break; + } + + dot = strchr(i, '.'); + lvalue = dot ? dot + 1 : i; + + if (dot) { + size_t prefix_len = dot - i; + + if (!prev || !strneq(prev, i, prefix_len+1)) { + if (prev) + fputc('\n', f); + + fprintf(f, "[%.*s]\n", (int) prefix_len, i); + } + } + + fprintf(f, "%s=%s\n", lvalue, rvalue); + prev = i; + } +} + +int config_parse_cpu_affinity2( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CPUSet *affinity = data; + + assert(affinity); + + (void) parse_cpu_set_extend(rvalue, affinity, true, unit, filename, line, lvalue); + + return 0; +} + +int config_parse_show_status( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int k; + ShowStatus *b = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + k = parse_show_status(rvalue, b); + if (k < 0) + log_syntax(unit, LOG_WARNING, filename, line, k, "Failed to parse show status setting, ignoring: %s", rvalue); + + return 0; +} + +int config_parse_output_restricted( + const 
char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecOutput t, *eo = data; + bool obsolete = false; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(rvalue, "syslog")) { + t = EXEC_OUTPUT_JOURNAL; + obsolete = true; + } else if (streq(rvalue, "syslog+console")) { + t = EXEC_OUTPUT_JOURNAL_AND_CONSOLE; + obsolete = true; + } else { + t = exec_output_from_string(rvalue); + if (t < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse output type, ignoring: %s", rvalue); + return 0; + } + + if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Standard output types socket, fd:, file:, append: are not supported as defaults, ignoring: %s", rvalue); + return 0; + } + } + + if (obsolete) + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "Standard output type %s is obsolete, automatically updating to %s. 
Please update your configuration.", + rvalue, exec_output_to_string(t)); + + *eo = t; + return 0; +} + +int config_parse_crash_chvt( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = parse_crash_chvt(rvalue, data); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashChangeVT= setting, ignoring: %s", rvalue); + + return 0; +} + +int config_parse_swap_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Swap *s = userdata; + int r, priority; + + assert(s); + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + s->parameters_fragment.priority = -1; + s->parameters_fragment.priority_set = false; + return 0; + } + + r = safe_atoi(rvalue, &priority); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid swap priority '%s', ignoring.", rvalue); + return 0; + } + + if (priority < -1) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Sorry, swap priorities smaller than -1 may only be assigned by the kernel itself, ignoring: %s", rvalue); + return 0; + } + + if (priority > 32767) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Swap priority out of range, ignoring: %s", rvalue); + return 0; + } + + s->parameters_fragment.priority = priority; + s->parameters_fragment.priority_set = true; + return 0; +} diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h new file mode 100644 index 0000000..6b2175c --- /dev/null +++ b/src/core/load-fragment.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include 
"unit.h" + +/* Config-parsing helpers relevant only for sources under src/core/ */ +int parse_crash_chvt(const char *value, int *data); +int parse_confirm_spawn(const char *value, char **console); + +/* Read service data from .desktop file style configuration fragments */ + +int unit_load_fragment(Unit *u); + +void unit_dump_config_items(FILE *f); + +CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps); +CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_strv_printf); +CONFIG_PARSER_PROTOTYPE(config_parse_documentation); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_listen); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_protocol); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_bind); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_coredump_filter); +CONFIG_PARSER_PROTOTYPE(config_parse_exec); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_abort); +CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_failure_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_service_type); +CONFIG_PARSER_PROTOTYPE(config_parse_service_restart); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_output); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_text); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_data); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_class); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_prio); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_affinity); 
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_apivfs); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_secure_bits); +CONFIG_PARSER_PROTOTYPE(config_parse_root_image_options); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash_sig); +CONFIG_PARSER_PROTOTYPE(config_parse_capability_set); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_flags); +CONFIG_PARSER_PROTOTYPE(config_parse_timer); +CONFIG_PARSER_PROTOTYPE(config_parse_trigger_unit); +CONFIG_PARSER_PROTOTYPE(config_parse_path_spec); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_service); +CONFIG_PARSER_PROTOTYPE(config_parse_service_sockets); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_env_file); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_tos); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_path); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string); +CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_notify_access); +CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno); +CONFIG_PARSER_PROTOTYPE(config_parse_syscall_log); +CONFIG_PARSER_PROTOTYPE(config_parse_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_pass_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_unset_environ); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_slice); +CONFIG_PARSER_PROTOTYPE(config_parse_cg_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares); +CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); +CONFIG_PARSER_PROTOTYPE(config_parse_delegate); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); 
+CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); +CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency); +CONFIG_PARSER_PROTOTYPE(config_parse_io_device_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_io_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_device_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_blockio_bandwidth); +CONFIG_PARSER_PROTOTYPE(config_parse_job_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_job_mode_isolate); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_selinux_context); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_apparmor_profile); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_smack_process_label); +CONFIG_PARSER_PROTOTYPE(config_parse_address_families); +CONFIG_PARSER_PROTOTYPE(config_parse_runtime_preserve_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_directories); +CONFIG_PARSER_PROTOTYPE(config_parse_set_credential); +CONFIG_PARSER_PROTOTYPE(config_parse_load_credential); +CONFIG_PARSER_PROTOTYPE(config_parse_set_status); +CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); +CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); +CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpus); +CONFIG_PARSER_PROTOTYPE(config_parse_allowed_mems); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_home); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_system); +CONFIG_PARSER_PROTOTYPE(config_parse_bus_name); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_utmp_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_working_directory); +CONFIG_PARSER_PROTOTYPE(config_parse_fdname); +CONFIG_PARSER_PROTOTYPE(config_parse_sec_fix_0); +CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat); +CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat); +CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces); +CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths); +CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); 
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc); +CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset); +CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); +CONFIG_PARSER_PROTOTYPE(config_parse_log_namespace); +CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_pid_file); +CONFIG_PARSER_PROTOTYPE(config_parse_exit_status); +CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers); +CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs); +CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity2); +CONFIG_PARSER_PROTOTYPE(config_parse_show_status); +CONFIG_PARSER_PROTOTYPE(config_parse_status_unit_format); +CONFIG_PARSER_PROTOTYPE(config_parse_output_restricted); +CONFIG_PARSER_PROTOTYPE(config_parse_crash_chvt); +CONFIG_PARSER_PROTOTYPE(config_parse_timeout_abort); +CONFIG_PARSER_PROTOTYPE(config_parse_swap_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_mount_images); +CONFIG_PARSER_PROTOTYPE(config_parse_socket_timestamping); + +/* gperf prototypes */ +const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); +extern const char load_fragment_gperf_nulstr[]; diff --git a/src/core/locale-setup.c b/src/core/locale-setup.c new file mode 100644 index 0000000..64761dd --- /dev/null +++ b/src/core/locale-setup.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdlib.h> + +#include "env-file.h" +#include "env-util.h" +#include "locale-setup.h" +#include "locale-util.h" +#include "proc-cmdline.h" +#include "string-util.h" +#include "strv.h" +#include "util.h" +#include "virt.h" + +int locale_setup(char ***environment) { + _cleanup_(locale_variables_freep) char 
*variables[_VARIABLE_LC_MAX] = {}; + _cleanup_strv_free_ char **add = NULL; + LocaleVariable i; + int r; + + r = proc_cmdline_get_key_many(PROC_CMDLINE_STRIP_RD_PREFIX, + "locale.LANG", &variables[VARIABLE_LANG], + "locale.LANGUAGE", &variables[VARIABLE_LANGUAGE], + "locale.LC_CTYPE", &variables[VARIABLE_LC_CTYPE], + "locale.LC_NUMERIC", &variables[VARIABLE_LC_NUMERIC], + "locale.LC_TIME", &variables[VARIABLE_LC_TIME], + "locale.LC_COLLATE", &variables[VARIABLE_LC_COLLATE], + "locale.LC_MONETARY", &variables[VARIABLE_LC_MONETARY], + "locale.LC_MESSAGES", &variables[VARIABLE_LC_MESSAGES], + "locale.LC_PAPER", &variables[VARIABLE_LC_PAPER], + "locale.LC_NAME", &variables[VARIABLE_LC_NAME], + "locale.LC_ADDRESS", &variables[VARIABLE_LC_ADDRESS], + "locale.LC_TELEPHONE", &variables[VARIABLE_LC_TELEPHONE], + "locale.LC_MEASUREMENT", &variables[VARIABLE_LC_MEASUREMENT], + "locale.LC_IDENTIFICATION", &variables[VARIABLE_LC_IDENTIFICATION]); + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Failed to read /proc/cmdline: %m"); + + /* Hmm, nothing set on the kernel cmd line? 
Then let's try /etc/locale.conf */ + if (r <= 0) { + r = parse_env_file(NULL, "/etc/locale.conf", + "LANG", &variables[VARIABLE_LANG], + "LANGUAGE", &variables[VARIABLE_LANGUAGE], + "LC_CTYPE", &variables[VARIABLE_LC_CTYPE], + "LC_NUMERIC", &variables[VARIABLE_LC_NUMERIC], + "LC_TIME", &variables[VARIABLE_LC_TIME], + "LC_COLLATE", &variables[VARIABLE_LC_COLLATE], + "LC_MONETARY", &variables[VARIABLE_LC_MONETARY], + "LC_MESSAGES", &variables[VARIABLE_LC_MESSAGES], + "LC_PAPER", &variables[VARIABLE_LC_PAPER], + "LC_NAME", &variables[VARIABLE_LC_NAME], + "LC_ADDRESS", &variables[VARIABLE_LC_ADDRESS], + "LC_TELEPHONE", &variables[VARIABLE_LC_TELEPHONE], + "LC_MEASUREMENT", &variables[VARIABLE_LC_MEASUREMENT], + "LC_IDENTIFICATION", &variables[VARIABLE_LC_IDENTIFICATION]); + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Failed to read /etc/locale.conf: %m"); + } + + for (i = 0; i < _VARIABLE_LC_MAX; i++) { + char *s; + + if (!variables[i]) + continue; + + s = strjoin(locale_variable_to_string(i), "=", variables[i]); + if (!s) + return -ENOMEM; + + if (strv_consume(&add, s) < 0) + return -ENOMEM; + } + + if (strv_isempty(add)) { + /* If no locale is configured then default to compile-time default. 
*/ + + add = strv_new("LANG=" SYSTEMD_DEFAULT_LOCALE); + if (!add) + return -ENOMEM; + } + + if (strv_isempty(*environment)) + strv_free_and_replace(*environment, add); + else { + char **merged; + + merged = strv_env_merge(2, *environment, add); + if (!merged) + return -ENOMEM; + + strv_free_and_replace(*environment, merged); + } + + return 0; +} diff --git a/src/core/locale-setup.h b/src/core/locale-setup.h new file mode 100644 index 0000000..d554ad3 --- /dev/null +++ b/src/core/locale-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int locale_setup(char ***environment); diff --git a/src/core/loopback-setup.c b/src/core/loopback-setup.c new file mode 100644 index 0000000..76022ce --- /dev/null +++ b/src/core/loopback-setup.c @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <net/if.h> +#include <stdlib.h> + +#include "sd-netlink.h" + +#include "loopback-setup.h" +#include "missing_network.h" +#include "netlink-util.h" +#include "time-util.h" + +#define LOOPBACK_SETUP_TIMEOUT_USEC (5 * USEC_PER_SEC) +

/* Bookkeeping for one asynchronous rtnetlink request issued by loopback_setup(). */
struct state {
        unsigned n_messages;            /* requests enqueued whose reply has not been handled yet */
        int rcode;                      /* result recorded by generic_handler() */
        const char *error_message;      /* prefix of the debug message logged on failure */
        const char *success_message;    /* debug message logged on success */
};

/* Reply callback shared by all requests: accounts for the answered request, logs the outcome at
 * debug level and records the result code in the associated struct state. Always returns 0. */
static int generic_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
        struct state *st = userdata;
        int rcode;

        assert(st);
        assert(st->n_messages > 0);
        st->n_messages--;

        errno = 0;

        rcode = sd_netlink_message_get_errno(m);
        if (rcode >= 0)
                log_debug("%s", st->success_message);
        else
                log_debug_errno(rcode, "%s: %m", st->error_message);

        st->rcode = rcode;
        return 0;
}

/* Enqueues an RTM_SETLINK request that sets IFF_UP on the loopback interface. On success the
 * pending-message counter of 's' is incremented and 0 is returned; otherwise the negative error of
 * the first failing step is returned. */
static int start_loopback(sd_netlink *rtnl, struct state *s) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL;
        int r;

        assert(rtnl);
        assert(s);

        /* Each step only runs if all previous ones succeeded. */
        r = sd_rtnl_message_new_link(rtnl, &req, RTM_SETLINK, LOOPBACK_IFINDEX);
        if (r >= 0)
                r = sd_rtnl_message_link_set_flags(req, IFF_UP, IFF_UP);
        if (r >= 0)
                r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, LOOPBACK_SETUP_TIMEOUT_USEC, "systemd-start-loopback");
        if (r < 0)
                return r;

        s->n_messages++;
        return 0;
}

/* Enqueues an RTM_NEWADDR request that adds 127.0.0.1/8 (permanent, host scope) to the loopback
 * interface. On success the pending-message counter of 's' is incremented and 0 is returned;
 * otherwise the negative error of the first failing step is returned. */
static int add_ipv4_address(sd_netlink *rtnl, struct state *s) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL;
        int r;

        assert(rtnl);
        assert(s);

        /* Each step only runs if all previous ones succeeded. */
        r = sd_rtnl_message_new_addr(rtnl, &req, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET);
        if (r >= 0)
                r = sd_rtnl_message_addr_set_prefixlen(req, 8);
        if (r >= 0)
                r = sd_rtnl_message_addr_set_flags(req, IFA_F_PERMANENT);
        if (r >= 0)
                r = sd_rtnl_message_addr_set_scope(req, RT_SCOPE_HOST);
        if (r >= 0)
                r = sd_netlink_message_append_in_addr(req, IFA_LOCAL, &(struct in_addr) { .s_addr = htobe32(INADDR_LOOPBACK) } );
        if (r >= 0)
                r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv4");
        if (r < 0)
                return r;

        s->n_messages++;
        return 0;
}

/* Enqueues an RTM_NEWADDR request that adds ::1/128 (permanent, host scope) to the loopback
 * interface. On success the pending-message counter of 's' is incremented and 0 is returned;
 * otherwise the negative error of the first failing step is returned. */
static int add_ipv6_address(sd_netlink *rtnl, struct state *s) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL;
        int r;

        assert(rtnl);
        assert(s);

        /* Each step only runs if all previous ones succeeded. */
        r = sd_rtnl_message_new_addr(rtnl, &req, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET6);
        if (r >= 0)
                r = sd_rtnl_message_addr_set_prefixlen(req, 128);
        if (r >= 0)
                r = sd_rtnl_message_addr_set_flags(req, IFA_F_PERMANENT);
        if (r >= 0)
                r = sd_rtnl_message_addr_set_scope(req, RT_SCOPE_HOST);
        if (r >= 0)
                r = sd_netlink_message_append_in6_addr(req, IFA_LOCAL, &in6addr_loopback);
        if (r >= 0)
                r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv6");
        if (r < 0)
                return r;

        s->n_messages++;
        return 0;
}

/* Synchronously queries the loopback interface and reports whether IFF_UP is set. Any failure while
 * building, sending or decoding the request counts as "not up". */
static bool check_loopback(sd_netlink *rtnl) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL;
        unsigned flags;

        if (sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, LOOPBACK_IFINDEX) < 0)
                return false;

        if (sd_netlink_call(rtnl, req, USEC_INFINITY, &reply) < 0)
                return false;

        if (sd_rtnl_message_link_get_flags(reply, &flags) < 0)
                return false;

        return flags & IFF_UP;
}

/* Adds 127.0.0.1 and ::1 to the loopback interface and brings it up, waiting until all three
 * requests have been answered.
 *
 * Returns 0 on success, and also when bringing the interface up failed with -EPERM but the
 * interface is found to be up already. Returns a negative errno-style value otherwise. Failures to
 * add the addresses are not treated as errors. */
int loopback_setup(void) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
        struct state state_4 = {
                .error_message = "Failed to add address 127.0.0.1 to loopback interface",
                .success_message = "Successfully added address 127.0.0.1 to loopback interface",
        };
        struct state state_6 = {
                .error_message = "Failed to add address ::1 to loopback interface",
                .success_message = "Successfully added address ::1 to loopback interface",
        };
        struct state state_up = {
                .error_message = "Failed to bring loopback interface up",
                .success_message = "Successfully brought loopback interface up",
        };
        int r;

        r = sd_netlink_open(&rtnl);
        if (r < 0)
                return log_error_errno(r, "Failed to open netlink: %m");

        /* The kernel adds the loopback addresses implicitly when the device is set up. We add them
         * explicitly as well (possibly racing against the kernel), because that lets us wait
         * synchronously until the addresses are in place, see
         *
         * https://github.com/systemd/systemd/issues/5641 */

        r = add_ipv4_address(rtnl, &state_4);
        if (r < 0)
                return log_error_errno(r, "Failed to enqueue IPv4 loopback address add request: %m");

        r = add_ipv6_address(rtnl, &state_6);
        if (r < 0)
                return log_error_errno(r, "Failed to enqueue IPv6 loopback address add request: %m");

        r = start_loopback(rtnl, &state_up);
        if (r < 0)
                return log_error_errno(r, "Failed to enqueue loopback interface start request: %m");

        /* Pump the netlink connection until every enqueued request has been answered. */
        while (state_4.n_messages + state_6.n_messages + state_up.n_messages > 0) {
                r = sd_netlink_wait(rtnl, LOOPBACK_SETUP_TIMEOUT_USEC);
                if (r < 0)
                        return log_error_errno(r, "Failed to wait for netlink event: %m");

                r = sd_netlink_process(rtnl, NULL);
                if (r < 0)
                        return log_warning_errno(r, "Failed to process netlink event: %m");
        }

        /* Only the result of bringing the interface up matters; whether the addresses could be
         * added is deliberately not checked. */
        if (state_up.rcode == 0)
                return 0;

        /* If we lack the permissions to configure the loopback device but find it configured
         * already, exit cleanly, in order to support unprivileged containers. */
        if (state_up.rcode == -EPERM && check_loopback(rtnl))
                return 0;

        return log_warning_errno(state_up.rcode, "Failed to configure loopback device: %m");
}
diff --git a/src/core/loopback-setup.h b/src/core/loopback-setup.h new file mode 100644 index 0000000..a7ee2da --- /dev/null +++ b/src/core/loopback-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int loopback_setup(void); diff --git a/src/core/machine-id-setup.c b/src/core/machine-id-setup.c new file mode 100644 index 0000000..6d15f9c --- /dev/null +++ b/src/core/machine-id-setup.c @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sched.h> +#include <sys/mount.h> +#include <unistd.h> + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "log.h" +#include "machine-id-setup.h" +#include "macro.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "umask-util.h" +#include "util.h" +#include "virt.h" + +static int generate_machine_id(const char *root, sd_id128_t *ret) { + const char *dbus_machine_id; + _cleanup_close_ int fd = -1; + int r; + + assert(ret); + + /* First, try reading the D-Bus machine id, unless it is a symlink */ + dbus_machine_id = prefix_roota(root, "/var/lib/dbus/machine-id"); + fd = open(dbus_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd >= 0) { + if (id128_read_fd(fd, ID128_PLAIN, ret) >= 0) { + log_info("Initializing machine ID from D-Bus machine ID."); + return 0; + } + + fd = safe_close(fd); + } + + if (isempty(root) && running_in_chroot() <= 0) { + /* If that didn't work, see if we are running in a container, + * and a machine ID was passed in via $container_uuid the way + * libvirt/LXC does it */ + 
+ if (detect_container() > 0) { + _cleanup_free_ char *e = NULL; + + if (getenv_for_pid(1, "container_uuid", &e) > 0 && + sd_id128_from_string(e, ret) >= 0) { + log_info("Initializing machine ID from container UUID."); + return 0; + } + + } else if (detect_vm() == VIRTUALIZATION_KVM) { + + /* If we are not running in a container, see if we are + * running in qemu/kvm and a machine ID was passed in + * via -uuid on the qemu/kvm command line */ + + if (id128_read("/sys/class/dmi/id/product_uuid", ID128_UUID, ret) >= 0) { + log_info("Initializing machine ID from KVM UUID."); + return 0; + } + /* on POWER, it's exported here instead */ + if (id128_read("/sys/firmware/devicetree/base/vm,uuid", ID128_UUID, ret) >= 0) { + log_info("Initializing machine ID from KVM UUID."); + return 0; + } + } + } + + /* If that didn't work, generate a random machine id */ + r = sd_id128_randomize(ret); + if (r < 0) + return log_error_errno(r, "Failed to generate randomized machine ID: %m"); + + log_info("Initializing machine ID from random generator."); + return 0; +} + +int machine_id_setup(const char *root, bool force_transient, sd_id128_t machine_id, sd_id128_t *ret) { + const char *etc_machine_id, *run_machine_id; + _cleanup_close_ int fd = -1; + bool writable; + int r; + + etc_machine_id = prefix_roota(root, "/etc/machine-id"); + + RUN_WITH_UMASK(0000) { + /* We create this 0444, to indicate that this isn't really + * something you should ever modify. Of course, since the file + * will be owned by root it doesn't matter much, but maybe + * people look. 
*/ + + (void) mkdir_parents(etc_machine_id, 0755); + fd = open(etc_machine_id, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY, 0444); + if (fd < 0) { + int old_errno = errno; + + fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + if (old_errno == EROFS && errno == ENOENT) + return log_error_errno(errno, + "System cannot boot: Missing /etc/machine-id and /etc is mounted read-only.\n" + "Booting up is supported only when:\n" + "1) /etc/machine-id exists and is populated.\n" + "2) /etc/machine-id exists and is empty.\n" + "3) /etc/machine-id is missing and /etc is writable.\n"); + else + return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id); + } + + writable = false; + } else + writable = true; + } + + /* A we got a valid machine ID argument, that's what counts */ + if (sd_id128_is_null(machine_id)) { + + /* Try to read any existing machine ID */ + if (id128_read_fd(fd, ID128_PLAIN, ret) >= 0) + return 0; + + /* Hmm, so, the id currently stored is not useful, then let's generate one */ + r = generate_machine_id(root, &machine_id); + if (r < 0) + return r; + } + + if (writable) { + if (lseek(fd, 0, SEEK_SET) == (off_t) -1) + return log_error_errno(errno, "Failed to seek %s: %m", etc_machine_id); + + if (ftruncate(fd, 0) < 0) + return log_error_errno(errno, "Failed to truncate %s: %m", etc_machine_id); + + /* If the caller requested a transient machine-id, write the string "uninitialized\n" to + * disk and overmount it with a transient file. + * + * Otherwise write the machine-id directly to disk. 
*/ + if (force_transient) { + r = loop_write(fd, "uninitialized\n", strlen("uninitialized\n"), false); + if (r < 0) + return log_error_errno(r, "Failed to write uninitialized %s: %m", etc_machine_id); + + r = fsync_full(fd); + if (r < 0) + return log_error_errno(r, "Failed to sync %s: %m", etc_machine_id); + } else { + r = id128_write_fd(fd, ID128_PLAIN, machine_id, true); + if (r < 0) + return log_error_errno(r, "Failed to write %s: %m", etc_machine_id); + else + goto finish; + } + } + + fd = safe_close(fd); + + /* Hmm, we couldn't or shouldn't write the machine-id to /etc? + * So let's write it to /run/machine-id as a replacement */ + + run_machine_id = prefix_roota(root, "/run/machine-id"); + + RUN_WITH_UMASK(0022) + r = id128_write(run_machine_id, ID128_PLAIN, machine_id, false); + if (r < 0) { + (void) unlink(run_machine_id); + return log_error_errno(r, "Cannot write %s: %m", run_machine_id); + } + + /* And now, let's mount it over */ + r = mount_follow_verbose(LOG_ERR, run_machine_id, etc_machine_id, NULL, MS_BIND, NULL); + if (r < 0) { + (void) unlink(run_machine_id); + return r; + } + + log_full(force_transient ? LOG_DEBUG : LOG_INFO, "Installed transient %s file.", etc_machine_id); + + /* Mark the mount read-only */ + r = mount_follow_verbose(LOG_WARNING, NULL, etc_machine_id, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL); + if (r < 0) + return r; + +finish: + if (ret) + *ret = machine_id; + + return 0; +} + +int machine_id_commit(const char *root) { + _cleanup_close_ int fd = -1, initial_mntns_fd = -1; + const char *etc_machine_id, *sync_path; + sd_id128_t id; + int r; + + /* Before doing anything, sync everything to ensure any changes by first-boot units are persisted. + * + * First, explicitly sync the file systems we care about and check if it worked. 
*/ + FOREACH_STRING(sync_path, "/etc/", "/var/") { + r = syncfs_path(AT_FDCWD, sync_path); + if (r < 0) + return log_error_errno(r, "Cannot sync %s: %m", sync_path); + } + + /* Afterwards, sync() the rest too, but we can't check the return value for these. */ + sync(); + + /* Replaces a tmpfs bind mount of /etc/machine-id by a proper file, atomically. For this, the umount is removed + * in a mount namespace, a new file is created at the right place. Afterwards the mount is also removed in the + * original mount namespace, thus revealing the file that was just created. */ + + etc_machine_id = prefix_roota(root, "/etc/machine-id"); + + r = path_is_mount_point(etc_machine_id, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", etc_machine_id); + if (r == 0) { + log_debug("%s is not a mount point. Nothing to do.", etc_machine_id); + return 0; + } + + /* Read existing machine-id */ + fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id); + + r = fd_is_temporary_fs(fd); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is on a temporary file system: %m", etc_machine_id); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EROFS), + "%s is not on a temporary file system.", + etc_machine_id); + + r = id128_read_fd(fd, ID128_PLAIN, &id); + if (r < 0) + return log_error_errno(r, "We didn't find a valid machine ID in %s: %m", etc_machine_id); + + fd = safe_close(fd); + + /* Store current mount namespace */ + r = namespace_open(0, NULL, &initial_mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Can't fetch current mount namespace: %m"); + + /* Switch to a new mount namespace, isolate ourself and unmount etc_machine_id in our new namespace */ + r = detach_mount_namespace(); + if (r < 0) + return log_error_errno(r, "Failed to set up new mount namespace: %m"); + + r = umount_verbose(LOG_ERR, 
etc_machine_id, 0); + if (r < 0) + return r; + + /* Update a persistent version of etc_machine_id */ + r = id128_write(etc_machine_id, ID128_PLAIN, id, true); + if (r < 0) + return log_error_errno(r, "Cannot write %s. This is mandatory to get a persistent machine ID: %m", etc_machine_id); + + /* Return to initial namespace and proceed a lazy tmpfs unmount */ + r = namespace_enter(-1, initial_mntns_fd, -1, -1, -1); + if (r < 0) + return log_warning_errno(r, "Failed to switch back to initial mount namespace: %m.\nWe'll keep transient %s file until next reboot.", etc_machine_id); + + if (umount2(etc_machine_id, MNT_DETACH) < 0) + return log_warning_errno(errno, "Failed to unmount transient %s file: %m.\nWe keep that mount until next reboot.", etc_machine_id); + + return 0; +} diff --git a/src/core/machine-id-setup.h b/src/core/machine-id-setup.h new file mode 100644 index 0000000..cce5819 --- /dev/null +++ b/src/core/machine-id-setup.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +int machine_id_commit(const char *root); +int machine_id_setup(const char *root, bool force_transient, sd_id128_t requested, sd_id128_t *ret); diff --git a/src/core/macros.systemd.in b/src/core/macros.systemd.in new file mode 100644 index 0000000..1c40328 --- /dev/null +++ b/src/core/macros.systemd.in @@ -0,0 +1,165 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. 
+ +# RPM macros for packages installing systemd unit files + +%_systemd_util_dir @rootlibexecdir@ +%_unitdir @systemunitdir@ +%_userunitdir @userunitdir@ +%_presetdir @systempresetdir@ +%_userpresetdir @userpresetdir@ +%_udevhwdbdir @udevhwdbdir@ +%_udevrulesdir @udevrulesdir@ +%_journalcatalogdir @catalogdir@ +%_binfmtdir @binfmtdir@ +%_sysctldir @sysctldir@ +%_sysusersdir @sysusersdir@ +%_tmpfilesdir @tmpfilesdir@ +%_environmentdir @environmentdir@ +%_modulesloaddir @modulesloaddir@ +%_modprobedir @modprobedir@ +%_systemdgeneratordir @systemgeneratordir@ +%_systemdusergeneratordir @usergeneratordir@ +%_systemd_system_env_generator_dir @systemenvgeneratordir@ +%_systemd_user_env_generator_dir @userenvgeneratordir@ + +# Because we had one release with a typo... +# This is temporary (Remove after systemd 240 is released) +%_environmnentdir %{warn:Use %%_environmentdir instead}%_environmentdir + +%systemd_requires \ +Requires(post): systemd \ +Requires(preun): systemd \ +Requires(postun): systemd \ +%{nil} + +%systemd_ordering \ +OrderWithRequires(post): systemd \ +OrderWithRequires(preun): systemd \ +OrderWithRequires(postun): systemd \ +%{nil} + +%__systemd_someargs_0(:) %{error:The %%%1 macro requires some arguments} +%__systemd_twoargs_2() %{nil} + +%systemd_post() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_post}} \ +if [ $1 -eq 1 ] && [ -x @bindir@/systemctl ]; then \ + # Initial installation \ + @bindir@/systemctl --no-reload preset %{?*} || : \ +fi \ +%{nil} + +%systemd_user_post() %{expand:%systemd_post \\--global %%{?*}} + +%systemd_preun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_preun}} \ +if [ $1 -eq 0 ] && [ -x @bindir@/systemctl ]; then \ + # Package removal, not upgrade \ + @bindir@/systemctl --no-reload disable --now %{?*} || : \ +fi \ +%{nil} + +%systemd_user_preun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_preun}} \ +if [ $1 -eq 0 ] && [ -x @bindir@/systemctl 
]; then \ + # Package removal, not upgrade \ + @bindir@/systemctl --global disable %{?*} || : \ +fi \ +%{nil} + +%systemd_postun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_postun}} \ +%{nil} + +%systemd_user_postun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_postun}} \ +%{nil} + +%systemd_postun_with_restart() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_postun_with_restart}} \ +if [ $1 -ge 1 ] && [ -x @bindir@/systemctl ]; then \ + # Package upgrade, not uninstall \ + @bindir@/systemctl try-restart %{?*} || : \ +fi \ +%{nil} + +%systemd_user_postun_with_restart() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_postun_with_restart}} \ +%{nil} + +%udev_hwdb_update() %{nil} + +%udev_rules_update() %{nil} + +%journal_catalog_update() %{nil} + +# Deprecated. Use %tmpfiles_create_package instead +%tmpfiles_create() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# tmpfiles_create}} \ +[ -x @bindir@/systemd-tmpfiles ] && @bindir@/systemd-tmpfiles --create %{?*} || : \ +%{nil} + +# Deprecated. Use %sysusers_create_package instead +%sysusers_create() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# sysusers_create}} \ +[ -x @bindir@/systemd-sysusers ] && @bindir@/systemd-sysusers %{?*} || : \ +%{nil} + +%sysusers_create_inline() \ +[ -x @bindir@/systemd-sysusers ] && @bindir@/systemd-sysusers - <<SYSTEMD_INLINE_EOF || : \ +%{?*} \ +SYSTEMD_INLINE_EOF\ +%{nil} + +# This should be used by package installation scripts which require users or +# groups to be present before the files installed by the package are present on +# disk (for example because some files are owned by those users or groups). +# +# Example: +# Source1: %{name}-sysusers.conf +# ... 
+# %install +# install -D %SOURCE1 %{buildroot}%{_sysusersdir}/%{name}.conf +# %pre +# %sysusers_create_package %{name} %SOURCE1 +# %files +# %{_sysusersdir}/%{name}.conf +%sysusers_create_package() \ +%{expand:%%{?!__systemd_twoargs_%#:%%{error:The %%%%sysusers_create_package macro requires two arguments}}} \ +systemd-sysusers --replace=%_sysusersdir/%1.conf - <<SYSTEMD_INLINE_EOF || : \ +%(cat %2) \ +SYSTEMD_INLINE_EOF\ +%{nil} + +# This may be used by package installation scripts to create files according to +# their tmpfiles configuration from a package installation script, even before +# the files of that package are installed on disk. +# +# Example: +# Source1: %{name}-tmpfiles.conf +# ... +# %install +# install -D %SOURCE1 %{buildroot}%{_tmpfilesdir}/%{name}.conf +# %pre +# %tmpfiles_create_package %{name} %SOURCE1 +# %files +# %{_tmpfilesdir}/%{name}.conf +%tmpfiles_create_package() \ +%{expand:%%{?!__systemd_twoargs_%#:%%{error:The %%%%tmpfiles_create_package macro requires two arguments}}} \ +systemd-tmpfiles --replace=%_tmpfilesdir/%1.conf --create - <<SYSTEMD_INLINE_EOF || : \ +%(cat %2) \ +SYSTEMD_INLINE_EOF\ +%{nil} + +%sysctl_apply() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# sysctl_apply}} \ +[ -x @rootlibexecdir@/systemd-sysctl ] && @rootlibexecdir@/systemd-sysctl %{?*} || : \ +%{nil} + +%binfmt_apply() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# binfmt_apply}} \ +[ -x @rootlibexecdir@/systemd-binfmt ] && @rootlibexecdir@/systemd-binfmt %{?*} || : \ +%{nil} diff --git a/src/core/main.c b/src/core/main.c new file mode 100644 index 0000000..a280b75 --- /dev/null +++ b/src/core/main.c @@ -0,0 +1,2935 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <sys/mount.h> +#include <sys/prctl.h> +#include <sys/reboot.h> +#include <unistd.h> +#if HAVE_SECCOMP +#include <seccomp.h> +#endif +#if HAVE_VALGRIND_VALGRIND_H +#include <valgrind/valgrind.h> 
+#endif + +#include "sd-bus.h" +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "apparmor-setup.h" +#include "architecture.h" +#include "build.h" +#include "bus-error.h" +#include "bus-util.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "clock-util.h" +#include "conf-parser.h" +#include "cpu-set-util.h" +#include "dbus-manager.h" +#include "dbus.h" +#include "def.h" +#include "dev-setup.h" +#include "efi-random.h" +#include "efivars.h" +#include "emergency-action.h" +#include "env-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fdset.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "hostname-setup.h" +#include "ima-setup.h" +#include "killall.h" +#include "kmod-setup.h" +#include "limits-util.h" +#include "load-fragment.h" +#include "log.h" +#include "loopback-setup.h" +#include "machine-id-setup.h" +#include "manager.h" +#include "mkdir.h" +#include "mount-setup.h" +#include "os-util.h" +#include "pager.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "random-util.h" +#include "raw-clone.h" +#include "rlimit-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "selinux-setup.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "smack-setup.h" +#include "special.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "switch-root.h" +#include "sysctl-util.h" +#include "terminal-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "util.h" +#include "virt.h" +#include "watchdog.h" + +#if HAS_FEATURE_ADDRESS_SANITIZER +#include <sanitizer/lsan_interface.h> +#endif + +#define DEFAULT_TASKS_MAX ((TasksMax) { 15U, 100U }) /* 15% */ + +static enum { + ACTION_RUN, + ACTION_HELP, + ACTION_VERSION, + ACTION_TEST, + ACTION_DUMP_CONFIGURATION_ITEMS, + ACTION_DUMP_BUS_PROPERTIES, + 
ACTION_BUS_INTROSPECT, +} arg_action = ACTION_RUN; + +static const char *arg_bus_introspect = NULL; + +/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real + * defaults are assigned in reset_arguments() below. */ +static char *arg_default_unit; +static bool arg_system; +static bool arg_dump_core; +static int arg_crash_chvt; +static bool arg_crash_shell; +static bool arg_crash_reboot; +static char *arg_confirm_spawn; +static ShowStatus arg_show_status; +static StatusUnitFormat arg_status_unit_format; +static bool arg_switched_root; +static PagerFlags arg_pager_flags; +static bool arg_service_watchdogs; +static ExecOutput arg_default_std_output; +static ExecOutput arg_default_std_error; +static usec_t arg_default_restart_usec; +static usec_t arg_default_timeout_start_usec; +static usec_t arg_default_timeout_stop_usec; +static usec_t arg_default_timeout_abort_usec; +static bool arg_default_timeout_abort_set; +static usec_t arg_default_start_limit_interval; +static unsigned arg_default_start_limit_burst; +static usec_t arg_runtime_watchdog; +static usec_t arg_reboot_watchdog; +static usec_t arg_kexec_watchdog; +static char *arg_early_core_pattern; +static char *arg_watchdog_device; +static char **arg_default_environment; +static struct rlimit *arg_default_rlimit[_RLIMIT_MAX]; +static uint64_t arg_capability_bounding_set; +static bool arg_no_new_privs; +static nsec_t arg_timer_slack_nsec; +static usec_t arg_default_timer_accuracy_usec; +static Set* arg_syscall_archs; +static FILE* arg_serialization; +static int arg_default_cpu_accounting; +static bool arg_default_io_accounting; +static bool arg_default_ip_accounting; +static bool arg_default_blockio_accounting; +static bool arg_default_memory_accounting; +static bool arg_default_tasks_accounting; +static TasksMax arg_default_tasks_max; +static sd_id128_t arg_machine_id; +static EmergencyAction arg_cad_burst_action; +static OOMPolicy arg_default_oom_policy; +static CPUSet 
arg_cpu_affinity; +static NUMAPolicy arg_numa_policy; +static usec_t arg_clock_usec; +static void *arg_random_seed; +static size_t arg_random_seed_size; + +/* A copy of the original environment block */ +static char **saved_env = NULL; + +static int parse_configuration(const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock); + +_noreturn_ static void freeze_or_exit_or_reboot(void) { + + /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to + * the container manager, and thus inform it that something went wrong. */ + if (detect_container() > 0) { + log_emergency("Exiting PID 1..."); + _exit(EXIT_EXCEPTION); + } + + if (arg_crash_reboot) { + log_notice("Rebooting in 10s..."); + (void) sleep(10); + + log_notice("Rebooting now..."); + (void) reboot(RB_AUTOBOOT); + log_emergency_errno(errno, "Failed to reboot: %m"); + } + + log_emergency("Freezing execution."); + freeze(); +} + +_noreturn_ static void crash(int sig) { + struct sigaction sa; + pid_t pid; + + if (getpid_cached() != 1) + /* Pass this on immediately, if this is not PID 1 */ + (void) raise(sig); + else if (!arg_dump_core) + log_emergency("Caught <%s>, not dumping core.", signal_to_string(sig)); + else { + sa = (struct sigaction) { + .sa_handler = nop_signal_handler, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + + /* We want to wait for the core process, hence let's enable SIGCHLD */ + (void) sigaction(SIGCHLD, &sa, NULL); + + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_emergency_errno(errno, "Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig)); + else if (pid == 0) { + /* Enable default signal handler for core dump */ + + sa = (struct sigaction) { + .sa_handler = SIG_DFL, + }; + (void) sigaction(sig, &sa, NULL); + + /* Don't limit the coredump size */ + (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)); + + /* Just to be sure... 
*/ + (void) chdir("/"); + + /* Raise the signal again */ + pid = raw_getpid(); + (void) kill(pid, sig); /* raise() would kill the parent */ + + assert_not_reached("We shouldn't be here..."); + _exit(EXIT_EXCEPTION); + } else { + siginfo_t status; + int r; + + /* Order things nicely. */ + r = wait_for_terminate(pid, &status); + if (r < 0) + log_emergency_errno(r, "Caught <%s>, waitpid() failed: %m", signal_to_string(sig)); + else if (status.si_code != CLD_DUMPED) { + const char *s = status.si_code == CLD_EXITED + ? exit_status_to_string(status.si_status, EXIT_STATUS_LIBC) + : signal_to_string(status.si_status); + + log_emergency("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).", + signal_to_string(sig), + pid, + sigchld_code_to_string(status.si_code), + status.si_status, strna(s)); + } else + log_emergency("Caught <%s>, dumped core as pid "PID_FMT".", + signal_to_string(sig), pid); + } + } + + if (arg_crash_chvt >= 0) + (void) chvt(arg_crash_chvt); + + sa = (struct sigaction) { + .sa_handler = SIG_IGN, + .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART, + }; + + /* Let the kernel reap children for us */ + (void) sigaction(SIGCHLD, &sa, NULL); + + if (arg_crash_shell) { + log_notice("Executing crash shell in 10s..."); + (void) sleep(10); + + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_emergency_errno(errno, "Failed to fork off crash shell: %m"); + else if (pid == 0) { + (void) setsid(); + (void) make_console_stdio(); + (void) rlimit_nofile_safe(); + (void) execle("/bin/sh", "/bin/sh", NULL, environ); + + log_emergency_errno(errno, "execle() failed: %m"); + _exit(EXIT_EXCEPTION); + } else { + log_info("Spawned crash shell as PID "PID_FMT".", pid); + (void) wait_for_terminate(pid, NULL); + } + } + + freeze_or_exit_or_reboot(); +} + +static void install_crash_handler(void) { + static const struct sigaction sa = { + .sa_handler = crash, + .sa_flags = SA_NODEFER, /* So that we can raise the signal again from the signal handler */ + }; + int r; + 
+ /* We ignore the return value here, since, we don't mind if we + * cannot set up a crash handler */ + r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER, -1); + if (r < 0) + log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m"); +} + +static int console_setup(void) { + _cleanup_close_ int tty_fd = -1; + int r; + + tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (tty_fd < 0) + return log_error_errno(tty_fd, "Failed to open /dev/console: %m"); + + /* We don't want to force text mode. plymouth may be showing + * pictures already from initrd. */ + r = reset_terminal_fd(tty_fd, false); + if (r < 0) + return log_error_errno(r, "Failed to reset /dev/console: %m"); + + return 0; +} + +static int set_machine_id(const char *m) { + sd_id128_t t; + assert(m); + + if (sd_id128_from_string(m, &t) < 0) + return -EINVAL; + + if (sd_id128_is_null(t)) + return -EINVAL; + + arg_machine_id = t; + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + int r; + + assert(key); + + if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value); + else if (in_initrd() == !!startswith(key, "rd.")) + return free_and_strdup_warn(&arg_default_unit, value); + + } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) { + + r = value ? 
parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value); + else + arg_dump_core = r; + + } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (path_is_absolute(value)) + (void) parse_path_argument_and_warn(value, false, &arg_early_core_pattern); + else + log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) { + + if (!value) + arg_crash_chvt = 0; /* turn on */ + else { + r = parse_crash_chvt(value, &arg_crash_chvt); + if (r < 0) + log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value); + } + + } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value); + else + arg_crash_shell = r; + + } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value); + else + arg_crash_reboot = r; + + } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) { + char *s; + + r = parse_confirm_spawn(value, &s); + if (r < 0) + log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value); + else + free_and_replace(arg_confirm_spawn, s); + + } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) { + + r = value ? 
parse_boolean(value) : true; + if (r < 0) + log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value); + else + arg_service_watchdogs = r; + + } else if (proc_cmdline_key_streq(key, "systemd.show_status")) { + + if (value) { + r = parse_show_status(value, &arg_show_status); + if (r < 0) + log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value); + } else + arg_show_status = SHOW_STATUS_YES; + + } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = status_unit_format_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value); + else + arg_status_unit_format = r; + + } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = exec_output_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value); + else + arg_default_std_output = r; + + } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = exec_output_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value); + else + arg_default_std_error = r; + + } else if (streq(key, "systemd.setenv")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (env_assignment_is_valid(value)) { + char **env; + + env = strv_env_set(arg_default_environment, value); + if (!env) + return log_oom(); + + arg_default_environment = env; + } else + log_warning("Environment variable name '%s' is not valid. 
Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = set_machine_id(value); + if (r < 0) + log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value); + + } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &arg_default_timeout_start_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value); + + if (arg_default_timeout_start_usec <= 0) + arg_default_timeout_start_usec = USEC_INFINITY; + + } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_cpu_set(value, &arg_cpu_affinity); + if (r < 0) + log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value); + + } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + (void) parse_path_argument_and_warn(value, false, &arg_watchdog_device); + + } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = safe_atou64(value, &arg_clock_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value); + + } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) { + void *p; + size_t sz; + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = unbase64mem(value, (size_t) -1, &p, &sz); + if (r < 0) + log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value); + + free(arg_random_seed); + arg_random_seed = sz > 0 ? 
p : mfree(p); + arg_random_seed_size = sz; + + } else if (streq(key, "quiet") && !value) { + + if (arg_show_status == _SHOW_STATUS_INVALID) + arg_show_status = SHOW_STATUS_ERROR; + + } else if (streq(key, "debug") && !value) { + + /* Note that log_parse_environment() handles 'debug' + * too, and sets the log level to LOG_DEBUG. */ + + if (detect_container() > 0) + log_set_target(LOG_TARGET_CONSOLE); + + } else if (!value) { + const char *target; + + /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */ + target = runlevel_to_target(key); + if (target) + return free_and_strdup_warn(&arg_default_unit, target); + } + + return 0; +} + +#define DEFINE_SETTER(name, func, descr) \ + static int name(const char *unit, \ + const char *filename, \ + unsigned line, \ + const char *section, \ + unsigned section_line, \ + const char *lvalue, \ + int ltype, \ + const char *rvalue, \ + void *data, \ + void *userdata) { \ + \ + int r; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + \ + r = func(rvalue); \ + if (r < 0) \ + log_syntax(unit, LOG_ERR, filename, line, r, \ + "Invalid " descr "'%s': %m", \ + rvalue); \ + \ + return 0; \ + } + +DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level"); +DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target"); +DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color"); +DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location"); +DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time"); + +static int config_parse_default_timeout_abort( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + int r; + + r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue, + &arg_default_timeout_abort_usec, userdata); + if (r 
>= 0) + arg_default_timeout_abort_set = r; + return 0; +} + +static int parse_config_file(void) { + const ConfigTableItem items[] = { + { "Manager", "LogLevel", config_parse_level2, 0, NULL }, + { "Manager", "LogTarget", config_parse_target, 0, NULL }, + { "Manager", "LogColor", config_parse_color, 0, NULL }, + { "Manager", "LogLocation", config_parse_location, 0, NULL }, + { "Manager", "LogTime", config_parse_time, 0, NULL }, + { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core }, + { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt }, + { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt }, + { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell }, + { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot }, + { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status }, + { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format }, + { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity }, + { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type }, + { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy }, + { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, + { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog }, + { "Manager", "RebootWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, + { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */ + { "Manager", "KExecWatchdogSec", config_parse_sec, 0, &arg_kexec_watchdog }, + { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device }, + { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set }, + { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs }, +#if HAVE_SECCOMP + { "Manager", "SystemCallArchitectures", 
config_parse_syscall_archs, 0, &arg_syscall_archs }, +#endif + { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec }, + { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_default_timer_accuracy_usec }, + { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_default_std_output }, + { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_default_std_error }, + { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec }, + { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec }, + { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL }, + { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec }, + { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */ + { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_default_start_limit_interval }, + { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_default_start_limit_burst }, + { "Manager", "DefaultEnvironment", config_parse_environ, 0, &arg_default_environment }, + { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_default_rlimit }, + { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_default_rlimit }, + { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_default_rlimit }, + { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_default_rlimit }, + { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_default_rlimit }, + { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_default_rlimit }, + { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_default_rlimit }, + { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_default_rlimit }, + { "Manager", "DefaultLimitNPROC", 
config_parse_rlimit, RLIMIT_NPROC, arg_default_rlimit }, + { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_default_rlimit }, + { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_default_rlimit }, + { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_default_rlimit }, + { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_default_rlimit }, + { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit }, + { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit }, + { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit }, + { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting }, + { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting }, + { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting }, + { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting }, + { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting }, + { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting }, + { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max }, + { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action }, + { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy }, + {} + }; + + const char *fn, *conf_dirs_nulstr; + + fn = arg_system ? + PKGSYSCONFDIR "/system.conf" : + PKGSYSCONFDIR "/user.conf"; + + conf_dirs_nulstr = arg_system ? 
+ CONF_PATHS_NULSTR("systemd/system.conf.d") : + CONF_PATHS_NULSTR("systemd/user.conf.d"); + + (void) config_parse_many_nulstr( + fn, conf_dirs_nulstr, + "Manager\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, + NULL, + NULL); + + /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we used USEC_INFINITY + * like everywhere else. */ + if (arg_default_timeout_start_usec <= 0) + arg_default_timeout_start_usec = USEC_INFINITY; + if (arg_default_timeout_stop_usec <= 0) + arg_default_timeout_stop_usec = USEC_INFINITY; + + return 0; +} + +static void set_manager_defaults(Manager *m) { + + assert(m); + + /* Propagates the various default unit property settings into the manager object, i.e. properties that do not + * affect the manager itself, but are just what newly allocated units will have set if they haven't set + * anything else. (Also see set_manager_settings() for the settings that affect the manager's own behaviour) */ + + m->default_timer_accuracy_usec = arg_default_timer_accuracy_usec; + m->default_std_output = arg_default_std_output; + m->default_std_error = arg_default_std_error; + m->default_timeout_start_usec = arg_default_timeout_start_usec; + m->default_timeout_stop_usec = arg_default_timeout_stop_usec; + m->default_timeout_abort_usec = arg_default_timeout_abort_usec; + m->default_timeout_abort_set = arg_default_timeout_abort_set; + m->default_restart_usec = arg_default_restart_usec; + m->default_start_limit_interval = arg_default_start_limit_interval; + m->default_start_limit_burst = arg_default_start_limit_burst; + + /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU + * controller to be enabled, so the default is to enable it unless we got told otherwise. 
*/ + if (arg_default_cpu_accounting >= 0) + m->default_cpu_accounting = arg_default_cpu_accounting; + else + m->default_cpu_accounting = cpu_accounting_is_cheap(); + + m->default_io_accounting = arg_default_io_accounting; + m->default_ip_accounting = arg_default_ip_accounting; + m->default_blockio_accounting = arg_default_blockio_accounting; + m->default_memory_accounting = arg_default_memory_accounting; + m->default_tasks_accounting = arg_default_tasks_accounting; + m->default_tasks_max = arg_default_tasks_max; + m->default_oom_policy = arg_default_oom_policy; + + (void) manager_set_default_rlimits(m, arg_default_rlimit); + + (void) manager_default_environment(m); + (void) manager_transient_environment_add(m, arg_default_environment); +} + +static void set_manager_settings(Manager *m) { + + assert(m); + + /* Propagates the various manager settings into the manager object, i.e. properties that + * effect the manager itself (as opposed to just being inherited into newly allocated + * units, see set_manager_defaults() above). 
*/ + + m->confirm_spawn = arg_confirm_spawn; + m->service_watchdogs = arg_service_watchdogs; + m->cad_burst_action = arg_cad_burst_action; + + manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog); + manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog); + manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog); + + manager_set_show_status(m, arg_show_status, "commandline"); + m->status_unit_format = arg_status_unit_format; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_LOG_LEVEL = 0x100, + ARG_LOG_TARGET, + ARG_LOG_COLOR, + ARG_LOG_LOCATION, + ARG_LOG_TIME, + ARG_UNIT, + ARG_SYSTEM, + ARG_USER, + ARG_TEST, + ARG_NO_PAGER, + ARG_VERSION, + ARG_DUMP_CONFIGURATION_ITEMS, + ARG_DUMP_BUS_PROPERTIES, + ARG_BUS_INTROSPECT, + ARG_DUMP_CORE, + ARG_CRASH_CHVT, + ARG_CRASH_SHELL, + ARG_CRASH_REBOOT, + ARG_CONFIRM_SPAWN, + ARG_SHOW_STATUS, + ARG_DESERIALIZE, + ARG_SWITCHED_ROOT, + ARG_DEFAULT_STD_OUTPUT, + ARG_DEFAULT_STD_ERROR, + ARG_MACHINE_ID, + ARG_SERVICE_WATCHDOGS, + }; + + static const struct option options[] = { + { "log-level", required_argument, NULL, ARG_LOG_LEVEL }, + { "log-target", required_argument, NULL, ARG_LOG_TARGET }, + { "log-color", optional_argument, NULL, ARG_LOG_COLOR }, + { "log-location", optional_argument, NULL, ARG_LOG_LOCATION }, + { "log-time", optional_argument, NULL, ARG_LOG_TIME }, + { "unit", required_argument, NULL, ARG_UNIT }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "user", no_argument, NULL, ARG_USER }, + { "test", no_argument, NULL, ARG_TEST }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS }, + { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES }, + { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT }, + { "dump-core", optional_argument, NULL, ARG_DUMP_CORE }, + { "crash-chvt", 
required_argument, NULL, ARG_CRASH_CHVT }, + { "crash-shell", optional_argument, NULL, ARG_CRASH_SHELL }, + { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT }, + { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN }, + { "show-status", optional_argument, NULL, ARG_SHOW_STATUS }, + { "deserialize", required_argument, NULL, ARG_DESERIALIZE }, + { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT }, + { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, }, + { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, }, + { "machine-id", required_argument, NULL, ARG_MACHINE_ID }, + { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS }, + {} + }; + + int c, r; + + assert(argc >= 1); + assert(argv); + + if (getpid_cached() == 1) + opterr = 0; + + while ((c = getopt_long(argc, argv, "hDbsz:", options, NULL)) >= 0) + + switch (c) { + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg); + + break; + + case ARG_LOG_COLOR: + + if (optarg) { + r = log_show_color_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log color setting \"%s\": %m", + optarg); + } else + log_show_color(true); + + break; + + case ARG_LOG_LOCATION: + if (optarg) { + r = log_show_location_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log location setting \"%s\": %m", + optarg); + } else + log_show_location(true); + + break; + + case ARG_LOG_TIME: + + if (optarg) { + r = log_show_time_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse log time setting \"%s\": %m", + optarg); + } else + log_show_time(true); + + break; + + case ARG_DEFAULT_STD_OUTPUT: + r = 
exec_output_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m", + optarg); + arg_default_std_output = r; + break; + + case ARG_DEFAULT_STD_ERROR: + r = exec_output_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m", + optarg); + arg_default_std_error = r; + break; + + case ARG_UNIT: + r = free_and_strdup(&arg_default_unit, optarg); + if (r < 0) + return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg); + + break; + + case ARG_SYSTEM: + arg_system = true; + break; + + case ARG_USER: + arg_system = false; + break; + + case ARG_TEST: + arg_action = ACTION_TEST; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_VERSION: + arg_action = ACTION_VERSION; + break; + + case ARG_DUMP_CONFIGURATION_ITEMS: + arg_action = ACTION_DUMP_CONFIGURATION_ITEMS; + break; + + case ARG_DUMP_BUS_PROPERTIES: + arg_action = ACTION_DUMP_BUS_PROPERTIES; + break; + + case ARG_BUS_INTROSPECT: + arg_bus_introspect = optarg; + arg_action = ACTION_BUS_INTROSPECT; + break; + + case ARG_DUMP_CORE: + if (!optarg) + arg_dump_core = true; + else { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse dump core boolean: \"%s\": %m", + optarg); + arg_dump_core = r; + } + break; + + case ARG_CRASH_CHVT: + r = parse_crash_chvt(optarg, &arg_crash_chvt); + if (r < 0) + return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m", + optarg); + break; + + case ARG_CRASH_SHELL: + if (!optarg) + arg_crash_shell = true; + else { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m", + optarg); + arg_crash_shell = r; + } + break; + + case ARG_CRASH_REBOOT: + if (!optarg) + arg_crash_reboot = true; + else { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to 
parse crash shell boolean: \"%s\": %m", + optarg); + arg_crash_reboot = r; + } + break; + + case ARG_CONFIRM_SPAWN: + arg_confirm_spawn = mfree(arg_confirm_spawn); + + r = parse_confirm_spawn(optarg, &arg_confirm_spawn); + if (r < 0) + return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m", + optarg); + break; + + case ARG_SERVICE_WATCHDOGS: + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse service watchdogs boolean: \"%s\": %m", + optarg); + arg_service_watchdogs = r; + break; + + case ARG_SHOW_STATUS: + if (optarg) { + r = parse_show_status(optarg, &arg_show_status); + if (r < 0) + return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m", + optarg); + } else + arg_show_status = SHOW_STATUS_YES; + break; + + case ARG_DESERIALIZE: { + int fd; + FILE *f; + + r = safe_atoi(optarg, &fd); + if (r < 0) + log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg); + if (fd < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid deserialize fd: %d", + fd); + + (void) fd_cloexec(fd, true); + + f = fdopen(fd, "r"); + if (!f) + return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd); + + safe_fclose(arg_serialization); + arg_serialization = f; + + break; + } + + case ARG_SWITCHED_ROOT: + arg_switched_root = true; + break; + + case ARG_MACHINE_ID: + r = set_machine_id(optarg); + if (r < 0) + return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg); + break; + + case 'h': + arg_action = ACTION_HELP; + break; + + case 'D': + log_set_max_level(LOG_DEBUG); + break; + + case 'b': + case 's': + case 'z': + /* Just to eat away the sysvinit kernel cmdline args that we'll parse in + * parse_proc_cmdline_item() or ignore, without any getopt() error messages. 
+ */ + case '?': + if (getpid_cached() != 1) + return -EINVAL; + else + return 0; + + default: + assert_not_reached("Unhandled option code."); + } + + if (optind < argc && getpid_cached() != 1) + /* Hmm, when we aren't run as init system + * let's complain about excess arguments */ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Excess arguments."); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "%sStarts and monitors system and user services.%s\n\n" + "This program takes no positional arguments.\n\n" + "%sOptions%s:\n" + " -h --help Show this help\n" + " --version Show version\n" + " --test Determine initial transaction, dump it and exit\n" + " --system In combination with --test: operate as system service manager\n" + " --user In combination with --test: operate as per-user service manager\n" + " --no-pager Do not pipe output into a pager\n" + " --dump-configuration-items Dump understood unit configuration items\n" + " --dump-bus-properties Dump exposed bus properties\n" + " --bus-introspect=PATH Write XML introspection data\n" + " --unit=UNIT Set default unit\n" + " --dump-core[=BOOL] Dump core on crash\n" + " --crash-vt=NR Change to specified VT on crash\n" + " --crash-reboot[=BOOL] Reboot on crash\n" + " --crash-shell[=BOOL] Run shell on crash\n" + " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n" + " --show-status[=BOOL] Show status updates on the console during bootup\n" + " --log-target=TARGET Set log target (console, journal, kmsg, journal-or-kmsg, null)\n" + " --log-level=LEVEL Set log level (debug, info, notice, warning, err, crit, alert, emerg)\n" + " --log-color[=BOOL] Highlight important log messages\n" + " --log-location[=BOOL] Include code location in log messages\n" + " --log-time[=BOOL] Prefix log messages with current time\n" + " --default-standard-output= Set 
default standard output for services\n" + " --default-standard-error= Set default standard error output for services\n" + "\nSee the %s for details.\n" + , program_invocation_short_name + , ansi_highlight(), ansi_normal() + , ansi_underline(), ansi_normal() + , link + ); + + return 0; +} + +static int prepare_reexecute( + Manager *m, + FILE **ret_f, + FDSet **ret_fds, + bool switching_root) { + + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + assert(ret_f); + assert(ret_fds); + + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + + /* Make sure nothing is really destructed when we shut down */ + m->n_reloading++; + bus_manager_send_reloading(m, true); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + r = manager_serialize(m, f, fds, switching_root); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) == (off_t) -1) + return log_error_errno(errno, "Failed to rewind serialization fd: %m"); + + r = fd_cloexec(fileno(f), false); + if (r < 0) + return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m"); + + r = fdset_cloexec(fds, false); + if (r < 0) + return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m"); + + *ret_f = TAKE_PTR(f); + *ret_fds = TAKE_PTR(fds); + + return 0; +} + +static void bump_file_max_and_nr_open(void) { + + /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large numbers of file + * descriptors are no longer a performance problem and their memory is properly tracked by memcg, thus counting + * them and limiting them in another two layers of limits is unnecessary and just complicates things. This + * function hence turns off 2 of the 4 levels of limits on file descriptors, and makes RLIMIT_NOLIMIT (soft + + * hard) the only ones that really matter. 
*/ + +#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN + int r; +#endif + +#if BUMP_PROC_SYS_FS_FILE_MAX + /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously thing where + * different but the operation would fail silently.) */ + r = sysctl_writef("fs/file-max", "%li\n", LONG_MAX); + if (r < 0) + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m"); +#endif + +#if BUMP_PROC_SYS_FS_NR_OPEN + int v = INT_MAX; + + /* Arg! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know what they + * are. The expression by which the maximum is determined is dependent on the architecture, and is something we + * don't really want to copy to userspace, as it is dependent on implementation details of the kernel. Since + * the kernel doesn't expose the maximum value to us, we can only try and hope. Hence, let's start with + * INT_MAX, and then keep halving the value until we find one that works. Ugly? Yes, absolutely, but kernel + * APIs are kernel APIs, so what do can we do... 🤯 */ + + for (;;) { + int k; + + v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */ + if (v < 1024) { + log_warning("Can't bump fs.nr_open, value too small."); + break; + } + + k = read_nr_open(); + if (k < 0) { + log_error_errno(k, "Failed to read fs.nr_open: %m"); + break; + } + if (k >= v) { /* Already larger */ + log_debug("Skipping bump, value is already larger."); + break; + } + + r = sysctl_writef("fs/nr_open", "%i\n", v); + if (r == -EINVAL) { + log_debug("Couldn't write fs.nr_open as %i, halving it.", v); + v /= 2; + continue; + } + if (r < 0) { + log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? 
LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m"); + break; + } + + log_debug("Successfully bumped fs.nr_open to %i", v); + break; + } +#endif +} + +static int bump_rlimit_nofile(struct rlimit *saved_rlimit) { + struct rlimit new_rlimit; + int r, nr; + + /* Get the underlying absolute limit the kernel enforces */ + nr = read_nr_open(); + + /* Calculate the new limits to use for us. Never lower from what we inherited. */ + new_rlimit = (struct rlimit) { + .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur), + .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max), + }; + + /* Shortcut if nothing changes. */ + if (saved_rlimit->rlim_max >= new_rlimit.rlim_max && + saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) { + log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping."); + return 0; + } + + /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for + * both hard and soft. */ + r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m"); + + return 0; +} + +static int bump_rlimit_memlock(struct rlimit *saved_rlimit) { + struct rlimit new_rlimit; + uint64_t mm; + int r; + + /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK which should + * normally disable such checks. We need them to implement IPAddressAllow= and IPAddressDeny=, hence let's bump + * the value high enough for our user. */ + + /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t + * must be unsigned, hence this is a given, but let's make this clear here. */ + assert_cc(RLIM_INFINITY > 0); + + mm = physical_memory() / 8; /* Let's scale how much we allow to be locked by the amount of physical + * RAM. We allow an eighth to be locked by us, just to pick a value. 
*/ + + new_rlimit = (struct rlimit) { + .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm), + .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm), + }; + + if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur && + saved_rlimit->rlim_cur >= new_rlimit.rlim_max) { + log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping."); + return 0; + } + + r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m"); + + return 0; +} + +static void test_usr(void) { + + /* Check that /usr is either on the same file system as / or mounted already. */ + + if (dir_is_empty("/usr") <= 0) + return; + + log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. " + "Some things will probably break (sometimes even silently) in mysterious ways. " + "Consult http://freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information."); +} + +static int enforce_syscall_archs(Set *archs) { +#if HAVE_SECCOMP + int r; + + if (!is_seccomp_available()) + return 0; + + r = seccomp_restrict_archs(arg_syscall_archs); + if (r < 0) + return log_error_errno(r, "Failed to enforce system call architecture restrication: %m"); +#endif + return 0; +} + +static int status_welcome(void) { + _cleanup_free_ char *pretty_name = NULL, *ansi_color = NULL; + int r; + + if (!show_status_on(arg_show_status)) + return 0; + + r = parse_os_release(NULL, + "PRETTY_NAME", &pretty_name, + "ANSI_COLOR", &ansi_color, + NULL); + if (r < 0) + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to read os-release file, ignoring: %m"); + + if (log_get_show_color()) + return status_printf(NULL, 0, + "\nWelcome to \x1B[%sm%s\x1B[0m!\n", + isempty(ansi_color) ? "1" : ansi_color, + isempty(pretty_name) ? "Linux" : pretty_name); + else + return status_printf(NULL, 0, + "\nWelcome to %s!\n", + isempty(pretty_name) ? 
"Linux" : pretty_name); +} + +static int write_container_id(void) { + const char *c; + int r; + + c = getenv("container"); + if (isempty(c)) + return 0; + + RUN_WITH_UMASK(0022) + r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE); + if (r < 0) + return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m"); + + return 1; +} + +static int bump_unix_max_dgram_qlen(void) { + _cleanup_free_ char *qlen = NULL; + unsigned long v; + int r; + + /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set the value + * really really early during boot, so that it is actually applied to all our sockets, including the + * $NOTIFY_SOCKET one. */ + + r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen); + if (r < 0) + return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to read AF_UNIX datagram queue length, ignoring: %m"); + + r = safe_atolu(qlen, &v); + if (r < 0) + return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen); + + if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN) + return 0; + + r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN); + if (r < 0) + return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to bump AF_UNIX datagram queue length, ignoring: %m"); + + return 1; +} + +static int fixup_environment(void) { + _cleanup_free_ char *term = NULL; + const char *t; + int r; + + /* Only fix up the environment when we are started as PID 1 */ + if (getpid_cached() != 1) + return 0; + + /* We expect the environment to be set correctly if run inside a container. */ + if (detect_container() > 0) + return 0; + + /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the backend + * device used by the console. 
We try to make a better guess here since some consoles might not have support + * for color mode for example. + * + * However if TERM was configured through the kernel command line then leave it alone. */ + r = proc_cmdline_get_key("TERM", 0, &term); + if (r < 0) + return r; + + t = term ?: default_term_for_tty("/dev/console"); + + if (setenv("TERM", t, 1) < 0) + return -errno; + + /* The kernels sets HOME=/ for init. Let's undo this. */ + if (path_equal_ptr(getenv("HOME"), "/")) + assert_se(unsetenv("HOME") == 0); + + return 0; +} + +static void redirect_telinit(int argc, char *argv[]) { + + /* This is compatibility support for SysV, where calling init as a user is identical to telinit. */ + +#if HAVE_SYSV_COMPAT + if (getpid_cached() == 1) + return; + + if (!strstr(program_invocation_short_name, "init")) + return; + + execv(SYSTEMCTL_BINARY_PATH, argv); + log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m"); + exit(EXIT_FAILURE); +#endif +} + +static int become_shutdown( + const char *shutdown_verb, + int retval) { + + char log_level[DECIMAL_STR_MAX(int) + 1], + exit_code[DECIMAL_STR_MAX(uint8_t) + 1], + timeout[DECIMAL_STR_MAX(usec_t) + 1]; + + const char* command_line[13] = { + SYSTEMD_SHUTDOWN_BINARY_PATH, + shutdown_verb, + "--timeout", timeout, + "--log-level", log_level, + "--log-target", + }; + + _cleanup_strv_free_ char **env_block = NULL; + size_t pos = 7; + int r; + usec_t watchdog_timer = 0; + + assert(shutdown_verb); + assert(!command_line[pos]); + env_block = strv_copy(environ); + + xsprintf(log_level, "%d", log_get_max_level()); + xsprintf(timeout, "%" PRI_USEC "us", arg_default_timeout_stop_usec); + + switch (log_get_target()) { + + case LOG_TARGET_KMSG: + case LOG_TARGET_JOURNAL_OR_KMSG: + case LOG_TARGET_SYSLOG_OR_KMSG: + command_line[pos++] = "kmsg"; + break; + + case LOG_TARGET_NULL: + command_line[pos++] = "null"; + break; + + case LOG_TARGET_CONSOLE: + default: + command_line[pos++] = "console"; + break; + }; + + if 
(log_get_show_color()) + command_line[pos++] = "--log-color"; + + if (log_get_show_location()) + command_line[pos++] = "--log-location"; + + if (log_get_show_time()) + command_line[pos++] = "--log-time"; + + if (streq(shutdown_verb, "exit")) { + command_line[pos++] = "--exit-code"; + command_line[pos++] = exit_code; + xsprintf(exit_code, "%d", retval); + } + + assert(pos < ELEMENTSOF(command_line)); + + if (streq(shutdown_verb, "reboot")) + watchdog_timer = arg_reboot_watchdog; + else if (streq(shutdown_verb, "kexec")) + watchdog_timer = arg_kexec_watchdog; + + if (watchdog_timer > 0 && watchdog_timer != USEC_INFINITY) { + + char *e; + + /* If we reboot or kexec let's set the shutdown + * watchdog and tell the shutdown binary to + * repeatedly ping it */ + r = watchdog_set_timeout(&watchdog_timer); + watchdog_close(r < 0); + + /* Tell the binary how often to ping, ignore failure */ + if (asprintf(&e, "WATCHDOG_USEC="USEC_FMT, watchdog_timer) > 0) + (void) strv_consume(&env_block, e); + + if (arg_watchdog_device && + asprintf(&e, "WATCHDOG_DEVICE=%s", arg_watchdog_device) > 0) + (void) strv_consume(&env_block, e); + } else + watchdog_close(true); + + /* Avoid the creation of new processes forked by the + * kernel; at this point, we will not listen to the + * signals anyway */ + if (detect_container() <= 0) + (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER); + + execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block); + return -errno; +} + +static void initialize_clock(void) { + int r; + + /* This is called very early on, before we parse the kernel command line or otherwise figure out why + * we are running, but only once. */ + + if (clock_is_localtime(NULL) > 0) { + int min; + + /* + * The very first call of settimeofday() also does a time warp in the kernel. + * + * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to take care + * of maintaining the RTC and do all adjustments. 
This matches the behavior of Windows, which leaves + * the RTC alone if the registry tells that the RTC runs in UTC. + */ + r = clock_set_timezone(&min); + if (r < 0) + log_error_errno(r, "Failed to apply local time delta, ignoring: %m"); + else + log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min); + + } else if (!in_initrd()) + /* + * Do a dummy very first call to seal the kernel's time warp magic. + * + * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with LOCAL, but the + * real system could be set up that way. In such case, we need to delay the time-warp or the sealing + * until we reach the real system. + * + * Do no set the kernel's timezone. The concept of local time cannot be supported reliably, the time + * will jump or be incorrect at every daylight saving time change. All kernel local time concepts will + * be treated as UTC that way. + */ + (void) clock_reset_timewarp(); + + r = clock_apply_epoch(); + if (r < 0) + log_error_errno(r, "Current system time is before build time, but cannot correct: %m"); + else if (r > 0) + log_info("System time before build time, advancing clock."); +} + +static void apply_clock_update(void) { + struct timespec ts; + + /* This is called later than initialize_clock(), i.e. after we parsed configuration files/kernel + * command line and such. 
*/ + + if (arg_clock_usec == 0) + return; + + if (getpid_cached() != 1) + return; + + if (clock_settime(CLOCK_REALTIME, timespec_store(&ts, arg_clock_usec)) < 0) + log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m"); + else { + char buf[FORMAT_TIMESTAMP_MAX]; + + log_info("Set system clock to %s, as specified on the kernel command line.", + format_timestamp(buf, sizeof(buf), arg_clock_usec)); + } +} + +static void cmdline_take_random_seed(void) { + _cleanup_close_ int random_fd = -1; + size_t suggested; + int r; + + if (arg_random_seed_size == 0) + return; + + if (getpid_cached() != 1) + return; + + assert(arg_random_seed); + suggested = random_pool_size(); + + if (arg_random_seed_size < suggested) + log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.", + arg_random_seed_size, suggested); + + random_fd = open("/dev/urandom", O_WRONLY|O_CLOEXEC|O_NOCTTY); + if (random_fd < 0) { + log_warning_errno(errno, "Failed to open /dev/urandom for writing, ignoring: %m"); + return; + } + + r = random_write_entropy(random_fd, arg_random_seed, arg_random_seed_size, true); + if (r < 0) { + log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m"); + return; + } + + log_notice("Successfully credited entropy passed on kernel command line.\n" + "Note that the seed provided this way is accessible to unprivileged programs. This functionality should not be used outside of testing environments."); +} + +static void initialize_coredump(bool skip_setup) { +#if ENABLE_COREDUMP + if (getpid_cached() != 1) + return; + + /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour the limit) + * will process core dumps for system services by default. 
*/ + if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0) + log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m"); + + /* But at the same time, turn off the core_pattern logic by default, so that no + * coredumps are stored until the systemd-coredump tool is enabled via + * sysctl. However it can be changed via the kernel command line later so core + * dumps can still be generated during early startup and in initramfs. */ + if (!skip_setup) + disable_coredumps(); +#endif +} + +static void initialize_core_pattern(bool skip_setup) { + int r; + + if (skip_setup || !arg_early_core_pattern) + return; + + if (getpid_cached() != 1) + return; + + r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern); +} + +static void update_cpu_affinity(bool skip_setup) { + _cleanup_free_ char *mask = NULL; + + if (skip_setup || !arg_cpu_affinity.set) + return; + + assert(arg_cpu_affinity.allocated > 0); + + mask = cpu_set_to_string(&arg_cpu_affinity); + log_debug("Setting CPU affinity to %s.", strnull(mask)); + + if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0) + log_warning_errno(errno, "Failed to set CPU affinity: %m"); +} + +static void update_numa_policy(bool skip_setup) { + int r; + _cleanup_free_ char *nodes = NULL; + const char * policy = NULL; + + if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy))) + return; + + if (DEBUG_LOGGING) { + policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy)); + nodes = cpu_set_to_range_string(&arg_numa_policy.nodes); + log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy), strnull(nodes)); + } + + r = apply_numa_policy(&arg_numa_policy); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "NUMA support not available, ignoring."); + else if (r < 0) + log_warning_errno(r, 
"Failed to set NUMA memory policy: %m"); +} + +static void do_reexecute( + int argc, + char *argv[], + const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock, + FDSet *fds, + const char *switch_root_dir, + const char *switch_root_init, + const char **ret_error_message) { + + unsigned i, j, args_size; + const char **args; + int r; + + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + assert(ret_error_message); + + /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get rebooted while + * we do that */ + watchdog_close(true); + + /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass + * the kernel default to its child processes */ + if (saved_rlimit_nofile->rlim_cur != 0) + (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile); + if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY) + (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock); + + if (switch_root_dir) { + /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the + * SIGCHLD for them after deserializing. */ + broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec); + + /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */ + r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE); + if (r < 0) + log_error_errno(r, "Failed to switch root, trying to continue: %m"); + } + + args_size = MAX(6, argc+1); + args = newa(const char*, args_size); + + if (!switch_root_init) { + char sfd[DECIMAL_STR_MAX(int) + 1]; + + /* First try to spawn ourselves with the right path, and with full serialization. We do this only if + * the user didn't specify an explicit init to spawn. 
*/ + + assert(arg_serialization); + assert(fds); + + xsprintf(sfd, "%i", fileno(arg_serialization)); + + i = 0; + args[i++] = SYSTEMD_BINARY_PATH; + if (switch_root_dir) + args[i++] = "--switched-root"; + args[i++] = arg_system ? "--system" : "--user"; + args[i++] = "--deserialize"; + args[i++] = sfd; + args[i++] = NULL; + + assert(i <= args_size); + + /* + * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do this is on + * its own on exec(), but it will do it on exit(). Hence, to ensure we get a summary here, fork() off + * a child, let it exit() cleanly, so that it prints the summary, and wait() for it in the parent, + * before proceeding into the exec(). + */ + valgrind_summary_hack(); + + (void) execv(args[0], (char* const*) args); + log_debug_errno(errno, "Failed to execute our own binary, trying fallback: %m"); + } + + /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and envp[]. (Well, + * modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[], but let's hope that + * doesn't matter.) */ + + arg_serialization = safe_fclose(arg_serialization); + fds = fdset_free(fds); + + /* Reopen the console */ + (void) make_console_stdio(); + + for (j = 1, i = 1; j < (unsigned) argc; j++) + args[i++] = argv[j]; + args[i++] = NULL; + assert(i <= args_size); + + /* Re-enable any blocked signals, especially important if we switch from initial ramdisk to init=... */ + (void) reset_all_signal_handlers(); + (void) reset_signal_mask(); + (void) rlimit_nofile_safe(); + + if (switch_root_init) { + args[0] = switch_root_init; + (void) execve(args[0], (char* const*) args, saved_env); + log_warning_errno(errno, "Failed to execute configured init, trying fallback: %m"); + } + + args[0] = "/sbin/init"; + (void) execv(args[0], (char* const*) args); + r = -errno; + + manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED " !! 
" ANSI_NORMAL, + "Failed to execute /sbin/init"); + + if (r == -ENOENT) { + log_warning("No /sbin/init, trying fallback"); + + args[0] = "/bin/sh"; + args[1] = NULL; + (void) execve(args[0], (char* const*) args, saved_env); + log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m"); + } else + log_warning_errno(r, "Failed to execute /sbin/init, giving up: %m"); + + *ret_error_message = "Failed to execute fallback shell"; +} + +static int invoke_main_loop( + Manager *m, + const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock, + bool *ret_reexecute, + int *ret_retval, /* Return parameters relevant for shutting down */ + const char **ret_shutdown_verb, /* … */ + FDSet **ret_fds, /* Return parameters for reexecuting */ + char **ret_switch_root_dir, /* … */ + char **ret_switch_root_init, /* … */ + const char **ret_error_message) { + + int r; + + assert(m); + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + assert(ret_reexecute); + assert(ret_retval); + assert(ret_shutdown_verb); + assert(ret_fds); + assert(ret_switch_root_dir); + assert(ret_switch_root_init); + assert(ret_error_message); + + for (;;) { + r = manager_loop(m); + if (r < 0) { + *ret_error_message = "Failed to run main loop"; + return log_emergency_errno(r, "Failed to run main loop: %m"); + } + + switch ((ManagerObjective) r) { + + case MANAGER_RELOAD: { + LogTarget saved_log_target; + int saved_log_level; + + log_info("Reloading."); + + /* First, save any overridden log level/target, then parse the configuration file, which might + * change the log level to new settings. */ + + saved_log_level = m->log_level_overridden ? log_get_max_level() : -1; + saved_log_target = m->log_target_overridden ? 
log_get_target() : _LOG_TARGET_INVALID; + + (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock); + + set_manager_defaults(m); + set_manager_settings(m); + + update_cpu_affinity(false); + update_numa_policy(false); + + if (saved_log_level >= 0) + manager_override_log_level(m, saved_log_level); + if (saved_log_target >= 0) + manager_override_log_target(m, saved_log_target); + + r = manager_reload(m); + if (r < 0) + /* Reloading failed before the point of no return. Let's continue running as if nothing happened. */ + m->objective = MANAGER_OK; + + break; + } + + case MANAGER_REEXECUTE: + + r = prepare_reexecute(m, &arg_serialization, ret_fds, false); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + + log_notice("Reexecuting."); + + *ret_reexecute = true; + *ret_retval = EXIT_SUCCESS; + *ret_shutdown_verb = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return 0; + + case MANAGER_SWITCH_ROOT: + if (!m->switch_root_init) { + r = prepare_reexecute(m, &arg_serialization, ret_fds, true); + if (r < 0) { + *ret_error_message = "Failed to prepare for reexecution"; + return r; + } + } else + *ret_fds = NULL; + + log_notice("Switching root."); + + *ret_reexecute = true; + *ret_retval = EXIT_SUCCESS; + *ret_shutdown_verb = NULL; + + /* Steal the switch root parameters */ + *ret_switch_root_dir = TAKE_PTR(m->switch_root); + *ret_switch_root_init = TAKE_PTR(m->switch_root_init); + + return 0; + + case MANAGER_EXIT: + + if (MANAGER_IS_USER(m)) { + log_debug("Exit."); + + *ret_reexecute = false; + *ret_retval = m->return_value; + *ret_shutdown_verb = NULL; + *ret_fds = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return 0; + } + + _fallthrough_; + case MANAGER_REBOOT: + case MANAGER_POWEROFF: + case MANAGER_HALT: + case MANAGER_KEXEC: { + static const char * const table[_MANAGER_OBJECTIVE_MAX] = { + [MANAGER_EXIT] = "exit", + [MANAGER_REBOOT] = "reboot", + [MANAGER_POWEROFF] = 
"poweroff", + [MANAGER_HALT] = "halt", + [MANAGER_KEXEC] = "kexec", + }; + + log_notice("Shutting down."); + + *ret_reexecute = false; + *ret_retval = m->return_value; + assert_se(*ret_shutdown_verb = table[m->objective]); + *ret_fds = NULL; + *ret_switch_root_dir = *ret_switch_root_init = NULL; + + return 0; + } + + default: + assert_not_reached("Unknown or unexpected manager objective."); + } + } +} + +static void log_execution_mode(bool *ret_first_boot) { + assert(ret_first_boot); + + if (arg_system) { + int v; + + log_info("systemd " GIT_VERSION " running in %ssystem mode. (" SYSTEMD_FEATURES ")", + arg_action == ACTION_TEST ? "test " : "" ); + + v = detect_virtualization(); + if (v > 0) + log_info("Detected virtualization %s.", virtualization_to_string(v)); + + log_info("Detected architecture %s.", architecture_to_string(uname_architecture())); + + if (in_initrd()) { + *ret_first_boot = false; + log_info("Running in initial RAM disk."); + } else { + int r; + _cleanup_free_ char *id_text = NULL; + + /* Let's check whether we are in first boot. We use /etc/machine-id as flag file + * for this: If it is missing or contains the value "uninitialized", this is the + * first boot. In any other case, it is not. This allows container managers and + * installers to provision a couple of files already. If the container manager + * wants to provision the machine ID itself it should pass $container_uuid to PID 1. 
*/ + + r = read_one_line_file("/etc/machine-id", &id_text); + if (r < 0 || streq(id_text, "uninitialized")) { + if (r < 0 && r != -ENOENT) + log_warning_errno(r, "Unexpected error while reading /etc/machine-id, ignoring: %m"); + + *ret_first_boot = true; + log_info("Detected first boot."); + } else { + *ret_first_boot = false; + log_debug("Detected initialized system, this is not the first boot."); + } + } + } else { + if (DEBUG_LOGGING) { + _cleanup_free_ char *t; + + t = uid_to_name(getuid()); + log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (" SYSTEMD_FEATURES ")", + arg_action == ACTION_TEST ? " test" : "", getuid(), strna(t)); + } + + *ret_first_boot = false; + } +} + +static int initialize_runtime( + bool skip_setup, + bool first_boot, + struct rlimit *saved_rlimit_nofile, + struct rlimit *saved_rlimit_memlock, + const char **ret_error_message) { + int r; + + assert(ret_error_message); + + /* Sets up various runtime parameters. Many of these initializations are conditionalized: + * + * - Some only apply to --system instances + * - Some only apply to --user instances + * - Some only apply when we first start up, but not when we reexecute + */ + + if (arg_action != ACTION_RUN) + return 0; + + update_cpu_affinity(skip_setup); + update_numa_policy(skip_setup); + + if (arg_system) { + /* Make sure we leave a core dump without panicking the kernel. */ + install_crash_handler(); + + if (!skip_setup) { + r = mount_cgroup_controllers(); + if (r < 0) { + *ret_error_message = "Failed to mount cgroup hierarchies"; + return r; + } + + status_welcome(); + hostname_setup(); + /* Force transient machine-id on first boot. 
*/ + machine_id_setup(NULL, first_boot, arg_machine_id, NULL); + (void) loopback_setup(); + bump_unix_max_dgram_qlen(); + bump_file_max_and_nr_open(); + test_usr(); + write_container_id(); + } + + if (arg_watchdog_device) { + r = watchdog_set_device(arg_watchdog_device); + if (r < 0) + log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device); + } + } else { + _cleanup_free_ char *p = NULL; + + /* Create the runtime directory and place the inaccessible device nodes there, if we run in + * user mode. In system mode mount_setup() already did that. */ + + r = xdg_user_runtime_dir(&p, "/systemd"); + if (r < 0) { + *ret_error_message = "$XDG_RUNTIME_DIR is not set"; + return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m"); + } + + (void) mkdir_p_label(p, 0755); + (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID); + } + + if (arg_timer_slack_nsec != NSEC_INFINITY) + if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0) + log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m"); + + if (arg_system && !cap_test_all(arg_capability_bounding_set)) { + r = capability_bounding_set_drop_usermode(arg_capability_bounding_set); + if (r < 0) { + *ret_error_message = "Failed to drop capability bounding set of usermode helpers"; + return log_emergency_errno(r, "Failed to drop capability bounding set of usermode helpers: %m"); + } + + r = capability_bounding_set_drop(arg_capability_bounding_set, true); + if (r < 0) { + *ret_error_message = "Failed to drop capability bounding set"; + return log_emergency_errno(r, "Failed to drop capability bounding set: %m"); + } + } + + if (arg_system && arg_no_new_privs) { + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { + *ret_error_message = "Failed to disable new privileges"; + return log_emergency_errno(errno, "Failed to disable new privileges: %m"); + } + } + + if (arg_syscall_archs) { + r = enforce_syscall_archs(arg_syscall_archs); + if (r < 0) { + 
*ret_error_message = "Failed to set syscall architectures"; + return r; + } + } + + if (!arg_system) + /* Become reaper of our children */ + if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) + log_warning_errno(errno, "Failed to make us a subreaper: %m"); + + /* Bump up RLIMIT_NOFILE for systemd itself */ + (void) bump_rlimit_nofile(saved_rlimit_nofile); + (void) bump_rlimit_memlock(saved_rlimit_memlock); + + return 0; +} + +static int do_queue_default_job( + Manager *m, + const char **ret_error_message) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *unit; + Job *job; + Unit *target; + int r; + + if (arg_default_unit) + unit = arg_default_unit; + else if (in_initrd()) + unit = SPECIAL_INITRD_TARGET; + else + unit = SPECIAL_DEFAULT_TARGET; + + log_debug("Activating default unit: %s", unit); + + r = manager_load_startable_unit_or_warn(m, unit, NULL, &target); + if (r < 0 && in_initrd() && !arg_default_unit) { + /* Fall back to default.target, which we used to always use by default. Only do this if no + * explicit configuration was given. */ + + log_info("Falling back to " SPECIAL_DEFAULT_TARGET "."); + + r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target); + } + if (r < 0) { + log_info("Falling back to " SPECIAL_RESCUE_TARGET "."); + + r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target); + if (r < 0) { + *ret_error_message = r == -ERFKILL ? 
SPECIAL_RESCUE_TARGET " masked" + : "Failed to load " SPECIAL_RESCUE_TARGET; + return r; + } + } + + assert(target->load_state == UNIT_LOADED); + + r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job); + if (r == -EPERM) { + log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r)); + + sd_bus_error_free(&error); + + r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job); + if (r < 0) { + *ret_error_message = "Failed to start default target"; + return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r)); + } + + } else if (r < 0) { + *ret_error_message = "Failed to isolate default target"; + return log_emergency_errno(r, "Failed to isolate default target: %s", bus_error_message(&error, r)); + } else + log_info("Queued %s job for default target %s.", + job_type_to_string(job->type), + unit_status_string(job->unit)); + + m->default_unit_job_id = job->id; + + return 0; +} + +static void save_rlimits(struct rlimit *saved_rlimit_nofile, + struct rlimit *saved_rlimit_memlock) { + + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + + if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0) + log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m"); + + if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0) + log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m"); +} + +static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) { + struct rlimit *rl; + + if (arg_default_rlimit[RLIMIT_NOFILE]) + return; + + /* Make sure forked processes get limits based on the original kernel setting */ + + rl = newdup(struct rlimit, saved_rlimit_nofile, 1); + if (!rl) { + log_oom(); + return; + } + + /* Bump the hard limit for system services to a substantially higher value. The default + * hard limit current kernels set is pretty low (4K), mostly for historical + * reasons. 
According to kernel developers, the fd handling in recent kernels has been + * optimized substantially enough, so that we can bump the limit now, without paying too + * high a price in memory or performance. Note however that we only bump the hard limit, + * not the soft limit. That's because select() works the way it works, and chokes on fds + * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to + * unexpecting programs that they get fds higher than what they can process using + * select(). By only bumping the hard limit but leaving the low limit as it is we avoid + * this pitfall: programs that are written by folks aware of the select() problem in mind + * (and thus use poll()/epoll instead of select(), the way everybody should) can + * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit + * we pass. */ + if (arg_system) { + int nr; + + /* Get the underlying absolute limit the kernel enforces */ + nr = read_nr_open(); + + rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE)); + } + + /* If for some reason we were invoked with a soft limit above 1024 (which should never + * happen!, but who knows what we get passed in from pam_limit when invoked as --user + * instance), then lower what we pass on to not confuse our children */ + rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE); + + arg_default_rlimit[RLIMIT_NOFILE] = rl; +} + +static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) { + struct rlimit *rl; + + /* Pass the original value down to invoked processes */ + + if (arg_default_rlimit[RLIMIT_MEMLOCK]) + return; + + rl = newdup(struct rlimit, saved_rlimit_memlock, 1); + if (!rl) { + log_oom(); + return; + } + + arg_default_rlimit[RLIMIT_MEMLOCK] = rl; +} + +static void reset_arguments(void) { + /* Frees/resets arg_* variables, with a few exceptions commented below. 
*/ + + arg_default_unit = mfree(arg_default_unit); + + /* arg_system — ignore */ + + arg_dump_core = true; + arg_crash_chvt = -1; + arg_crash_shell = false; + arg_crash_reboot = false; + arg_confirm_spawn = mfree(arg_confirm_spawn); + arg_show_status = _SHOW_STATUS_INVALID; + arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT; + arg_switched_root = false; + arg_pager_flags = 0; + arg_service_watchdogs = true; + arg_default_std_output = EXEC_OUTPUT_JOURNAL; + arg_default_std_error = EXEC_OUTPUT_INHERIT; + arg_default_restart_usec = DEFAULT_RESTART_USEC; + arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC; + arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC; + arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC; + arg_default_timeout_abort_set = false; + arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL; + arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST; + arg_runtime_watchdog = 0; + arg_reboot_watchdog = 10 * USEC_PER_MINUTE; + arg_kexec_watchdog = 0; + arg_early_core_pattern = NULL; + arg_watchdog_device = NULL; + + arg_default_environment = strv_free(arg_default_environment); + rlimit_free_all(arg_default_rlimit); + + arg_capability_bounding_set = CAP_ALL; + arg_no_new_privs = false; + arg_timer_slack_nsec = NSEC_INFINITY; + arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE; + + arg_syscall_archs = set_free(arg_syscall_archs); + + /* arg_serialization — ignore */ + + arg_default_cpu_accounting = -1; + arg_default_io_accounting = false; + arg_default_ip_accounting = false; + arg_default_blockio_accounting = false; + arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT; + arg_default_tasks_accounting = true; + arg_default_tasks_max = DEFAULT_TASKS_MAX; + arg_machine_id = (sd_id128_t) {}; + arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE; + arg_default_oom_policy = OOM_STOP; + + cpu_set_reset(&arg_cpu_affinity); + numa_policy_reset(&arg_numa_policy); + + arg_random_seed = mfree(arg_random_seed); + arg_random_seed_size = 
0; + arg_clock_usec = 0; +} + +static int parse_configuration(const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock) { + int r; + + assert(saved_rlimit_nofile); + assert(saved_rlimit_memlock); + + /* Assign configuration defaults */ + reset_arguments(); + + r = parse_config_file(); + if (r < 0) + log_warning_errno(r, "Failed to parse config file, ignoring: %m"); + + if (arg_system) { + r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + } + + /* Initialize some default rlimits for services if they haven't been configured */ + fallback_rlimit_nofile(saved_rlimit_nofile); + fallback_rlimit_memlock(saved_rlimit_memlock); + + /* Note that this also parses bits from the kernel command line, including "debug". */ + log_parse_environment(); + + /* Initialize the show status setting if it hasn't been set explicitly yet */ + if (arg_show_status == _SHOW_STATUS_INVALID) + arg_show_status = SHOW_STATUS_YES; + + return 0; +} + +static int safety_checks(void) { + + if (getpid_cached() == 1 && + arg_action != ACTION_RUN) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Unsupported execution mode while PID 1."); + + if (getpid_cached() == 1 && + !arg_system) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run --user mode as PID 1."); + + if (arg_action == ACTION_RUN && + arg_system && + getpid_cached() != 1) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Can't run system mode unless PID 1."); + + if (arg_action == ACTION_TEST && + geteuid() == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Don't run test mode as root."); + + if (!arg_system && + arg_action == ACTION_RUN && + sd_booted() <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Trying to run as user instance, but the system has not been booted with systemd."); + + if (!arg_system && + arg_action == ACTION_RUN && + !getenv("XDG_RUNTIME_DIR")) + return 
log_error_errno(SYNTHETIC_ERRNO(EUNATCH), + "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set."); + + if (arg_system && + arg_action == ACTION_RUN && + running_in_chroot() > 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cannot be run in a chroot() environment."); + + return 0; +} + +static int initialize_security( + bool *loaded_policy, + dual_timestamp *security_start_timestamp, + dual_timestamp *security_finish_timestamp, + const char **ret_error_message) { + + int r; + + assert(loaded_policy); + assert(security_start_timestamp); + assert(security_finish_timestamp); + assert(ret_error_message); + + dual_timestamp_get(security_start_timestamp); + + r = mac_selinux_setup(loaded_policy); + if (r < 0) { + *ret_error_message = "Failed to load SELinux policy"; + return r; + } + + r = mac_smack_setup(loaded_policy); + if (r < 0) { + *ret_error_message = "Failed to load SMACK policy"; + return r; + } + + r = mac_apparmor_setup(); + if (r < 0) { + *ret_error_message = "Failed to load AppArmor policy"; + return r; + } + + r = ima_setup(); + if (r < 0) { + *ret_error_message = "Failed to load IMA policy"; + return r; + } + + dual_timestamp_get(security_finish_timestamp); + return 0; +} + +static void test_summary(Manager *m) { + assert(m); + + printf("-> By units:\n"); + manager_dump_units(m, stdout, "\t"); + + printf("-> By jobs:\n"); + manager_dump_jobs(m, stdout, "\t"); +} + +static int collect_fds(FDSet **ret_fds, const char **ret_error_message) { + int r; + + assert(ret_fds); + assert(ret_error_message); + + r = fdset_new_fill(ret_fds); + if (r < 0) { + *ret_error_message = "Failed to allocate fd set"; + return log_emergency_errno(r, "Failed to allocate fd set: %m"); + } + + fdset_cloexec(*ret_fds, true); + + if (arg_serialization) + assert_se(fdset_remove(*ret_fds, fileno(arg_serialization)) >= 0); + + return 0; +} + +static void setup_console_terminal(bool skip_setup) { + + if (!arg_system) + return; + + /* Become a session leader if we 
aren't one yet. */ + (void) setsid(); + + /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a controlling + * tty. */ + (void) release_terminal(); + + /* Reset the console, but only if this is really init and we are freshly booted */ + if (getpid_cached() == 1 && !skip_setup) + (void) console_setup(); +} + +static bool early_skip_setup_check(int argc, char *argv[]) { + bool found_deserialize = false; + int i; + + /* Determine if this is a reexecution or normal bootup. We do the full command line parsing much later, so + * let's just have a quick peek here. Note that if we have switched root, do all the special setup things + * anyway, even if in that case we also do deserialization. */ + + for (i = 1; i < argc; i++) { + if (streq(argv[i], "--switched-root")) + return false; /* If we switched root, don't skip the setup. */ + else if (streq(argv[i], "--deserialize")) + found_deserialize = true; + } + + return found_deserialize; /* When we are deserializing, then we are reexecuting, hence avoid the extensive setup */ +} + +static int save_env(void) { + char **l; + + l = strv_copy(environ); + if (!l) + return -ENOMEM; + + strv_free_and_replace(saved_env, l); + return 0; +} + +int main(int argc, char *argv[]) { + + dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL, + security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL; + struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0), + saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed + * in. Note we use different values + * for the two that indicate whether + * these fields are initialized! 
*/ + bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false; + char *switch_root_dir = NULL, *switch_root_init = NULL; + usec_t before_startup, after_startup; + static char systemd[] = "systemd"; + char timespan[FORMAT_TIMESPAN_MAX]; + const char *shutdown_verb = NULL, *error_message = NULL; + int r, retval = EXIT_FAILURE; + Manager *m = NULL; + FDSet *fds = NULL; + + /* SysV compatibility: redirect init → telinit */ + redirect_telinit(argc, argv); + + /* Take timestamps early on */ + dual_timestamp_from_monotonic(&kernel_timestamp, 0); + dual_timestamp_get(&userspace_timestamp); + + /* Figure out whether we need to do initialize the system, or if we already did that because we are + * reexecuting */ + skip_setup = early_skip_setup_check(argc, argv); + + /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent reexecution we + * are then called 'systemd'. That is confusing, hence let's call us systemd right-away. */ + program_invocation_short_name = systemd; + (void) prctl(PR_SET_NAME, systemd); + + /* Save the original command line */ + save_argc_argv(argc, argv); + + /* Save the original environment as we might need to restore it if we're requested to execute another + * system manager later. */ + r = save_env(); + if (r < 0) { + error_message = "Failed to copy environment block"; + goto finish; + } + + /* Make sure that if the user says "syslog" we actually log to the journal. */ + log_set_upgrade_syslog_to_journal(true); + + if (getpid_cached() == 1) { + /* When we run as PID 1 force system mode */ + arg_system = true; + + /* Disable the umask logic */ + umask(0); + + /* Make sure that at least initially we do not ever log to journald/syslogd, because it might not be + * activated yet (even though the log socket for it exists). */ + log_set_prohibit_ipc(true); + + /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. 
This is + * important so that we never end up logging to any foreign stderr, for example if we have to log in a + * child process right before execve()'ing the actual binary, at a point in time where socket + * activation stderr/stdout area already set up. */ + log_set_always_reopen_console(true); + + if (detect_container() <= 0) { + + /* Running outside of a container as PID 1 */ + log_set_target(LOG_TARGET_KMSG); + log_open(); + + if (in_initrd()) + initrd_timestamp = userspace_timestamp; + + if (!skip_setup) { + r = mount_setup_early(); + if (r < 0) { + error_message = "Failed to mount early API filesystems"; + goto finish; + } + + /* Let's open the log backend a second time, in case the first time didn't + * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became + * available, and it previously wasn't. */ + log_open(); + + disable_printk_ratelimit(); + + r = initialize_security( + &loaded_policy, + &security_start_timestamp, + &security_finish_timestamp, + &error_message); + if (r < 0) + goto finish; + } + + if (mac_selinux_init() < 0) { + error_message = "Failed to initialize SELinux support"; + goto finish; + } + + if (!skip_setup) + initialize_clock(); + + /* Set the default for later on, but don't actually open the logs like this for now. Note that + * if we are transitioning from the initrd there might still be journal fd open, and we + * shouldn't attempt opening that before we parsed /proc/cmdline which might redirect output + * elsewhere. */ + log_set_target(LOG_TARGET_JOURNAL_OR_KMSG); + + } else { + /* Running inside a container, as PID 1 */ + log_set_target(LOG_TARGET_CONSOLE); + log_open(); + + /* For later on, see above... 
*/ + log_set_target(LOG_TARGET_JOURNAL); + + /* clear the kernel timestamp, because we are in a container */ + kernel_timestamp = DUAL_TIMESTAMP_NULL; + } + + initialize_coredump(skip_setup); + + r = fixup_environment(); + if (r < 0) { + log_emergency_errno(r, "Failed to fix up PID 1 environment: %m"); + error_message = "Failed to fix up PID1 environment"; + goto finish; + } + + /* Try to figure out if we can use colors with the console. No need to do that for user instances since + * they never log into the console. */ + log_show_color(colors_enabled()); + + r = make_null_stdio(); + if (r < 0) + log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m"); + + /* Load the kernel modules early. */ + if (!skip_setup) + kmod_setup(); + + /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */ + r = mount_setup(loaded_policy, skip_setup); + if (r < 0) { + error_message = "Failed to mount API filesystems"; + goto finish; + } + + /* The efivarfs is now mounted, let's read the random seed off it */ + (void) efi_take_random_seed(); + + /* Cache command-line options passed from EFI variables */ + if (!skip_setup) + (void) cache_efi_options_variable(); + } else { + /* Running as user instance */ + arg_system = false; + log_set_target(LOG_TARGET_AUTO); + log_open(); + + /* clear the kernel timestamp, because we are not PID 1 */ + kernel_timestamp = DUAL_TIMESTAMP_NULL; + + if (mac_selinux_init() < 0) { + error_message = "Failed to initialize SELinux support"; + goto finish; + } + } + + /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when + * transitioning from the initrd to the main systemd or suchlike. */ + save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock); + + /* Reset all signal handlers. 
*/ + (void) reset_all_signal_handlers(); + (void) ignore_signals(SIGNALS_IGNORE, -1); + + (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock); + + r = parse_argv(argc, argv); + if (r < 0) { + error_message = "Failed to parse commandline arguments"; + goto finish; + } + + r = safety_checks(); + if (r < 0) + goto finish; + + if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT)) + (void) pager_open(arg_pager_flags); + + if (arg_action != ACTION_RUN) + skip_setup = true; + + if (arg_action == ACTION_HELP) { + retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_VERSION) { + retval = version(); + goto finish; + } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) { + unit_dump_config_items(stdout); + retval = EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) { + dump_bus_properties(stdout); + retval = EXIT_SUCCESS; + goto finish; + } else if (arg_action == ACTION_BUS_INTROSPECT) { + r = bus_manager_introspect_implementations(stdout, arg_bus_introspect); + retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE; + goto finish; + } + + assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST)); + + /* Move out of the way, so that we won't block unmounts */ + assert_se(chdir("/") == 0); + + if (arg_action == ACTION_RUN) { + if (!skip_setup) { + /* Apply the systemd.clock_usec= kernel command line switch */ + apply_clock_update(); + + /* Apply random seed from kernel command line */ + cmdline_take_random_seed(); + } + + /* A core pattern might have been specified via the cmdline. 
*/ + initialize_core_pattern(skip_setup); + + /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */ + log_close(); + + /* Remember open file descriptors for later deserialization */ + r = collect_fds(&fds, &error_message); + if (r < 0) + goto finish; + + /* Give up any control of the console, but make sure its initialized. */ + setup_console_terminal(skip_setup); + + /* Open the logging devices, if possible and necessary */ + log_open(); + } + + log_execution_mode(&first_boot); + + r = initialize_runtime(skip_setup, + first_boot, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + &error_message); + if (r < 0) + goto finish; + + r = manager_new(arg_system ? UNIT_FILE_SYSTEM : UNIT_FILE_USER, + arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0, + &m); + if (r < 0) { + log_emergency_errno(r, "Failed to allocate manager object: %m"); + error_message = "Failed to allocate manager object"; + goto finish; + } + + m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp; + m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp; + m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp; + m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp; + + set_manager_defaults(m); + set_manager_settings(m); + manager_set_first_boot(m, first_boot); + + /* Remember whether we should queue the default job */ + queue_default_job = !arg_serialization || arg_switched_root; + + before_startup = now(CLOCK_MONOTONIC); + + r = manager_startup(m, arg_serialization, fds); + if (r < 0) { + error_message = "Failed to start up manager"; + goto finish; + } + + /* This will close all file descriptors that were opened, but not claimed by any unit. 
*/ + fds = fdset_free(fds); + arg_serialization = safe_fclose(arg_serialization); + + if (queue_default_job) { + r = do_queue_default_job(m, &error_message); + if (r < 0) + goto finish; + } + + after_startup = now(CLOCK_MONOTONIC); + + log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG, + "Loaded units and determined initial transaction in %s.", + format_timespan(timespan, sizeof(timespan), after_startup - before_startup, 100 * USEC_PER_MSEC)); + + if (arg_action == ACTION_TEST) { + test_summary(m); + retval = EXIT_SUCCESS; + goto finish; + } + + (void) invoke_main_loop(m, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + &reexecute, + &retval, + &shutdown_verb, + &fds, + &switch_root_dir, + &switch_root_init, + &error_message); + +finish: + pager_close(); + + if (m) { + arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT); + arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC); + m = manager_free(m); + } + + mac_selinux_finish(); + + if (reexecute) + do_reexecute(argc, argv, + &saved_rlimit_nofile, + &saved_rlimit_memlock, + fds, + switch_root_dir, + switch_root_init, + &error_message); /* This only returns if reexecution failed */ + + arg_serialization = safe_fclose(arg_serialization); + fds = fdset_free(fds); + + saved_env = strv_free(saved_env); + +#if HAVE_VALGRIND_VALGRIND_H + /* If we are PID 1 and running under valgrind, then let's exit + * here explicitly. valgrind will only generate nice output on + * exit(), not on exec(), hence let's do the former not the + * latter here. */ + if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) { + /* Cleanup watchdog_device strings for valgrind. We need them + * in become_shutdown() so normally we cannot free them yet. 
*/ + watchdog_free_device(); + arg_watchdog_device = mfree(arg_watchdog_device); + reset_arguments(); + return retval; + } +#endif + +#if HAS_FEATURE_ADDRESS_SANITIZER + __lsan_do_leak_check(); +#endif + + if (shutdown_verb) { + r = become_shutdown(shutdown_verb, retval); + log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting"); + error_message = "Failed to execute shutdown binary"; + } + + watchdog_free_device(); + arg_watchdog_device = mfree(arg_watchdog_device); + + if (getpid_cached() == 1) { + if (error_message) + manager_status_printf(NULL, STATUS_TYPE_EMERGENCY, + ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL, + "%s.", error_message); + freeze_or_exit_or_reboot(); + } + + reset_arguments(); + return retval; +} diff --git a/src/core/manager.c b/src/core/manager.c new file mode 100644 index 0000000..a1d6f7c --- /dev/null +++ b/src/core/manager.c @@ -0,0 +1,4975 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <linux/kd.h> +#include <sys/epoll.h> +#include <sys/inotify.h> +#include <sys/ioctl.h> +#include <sys/reboot.h> +#include <sys/timerfd.h> +#include <sys/wait.h> +#include <unistd.h> + +#if HAVE_AUDIT +#include <libaudit.h> +#endif + +#include "sd-daemon.h" +#include "sd-messages.h" +#include "sd-path.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "audit-fd.h" +#include "boot-timestamps.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-util.h" +#include "clean-ipc.h" +#include "clock-util.h" +#include "core-varlink.h" +#include "dbus-job.h" +#include "dbus-manager.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "def.h" +#include "dirent-util.h" +#include "env-util.h" +#include "escape.h" +#include "exec-util.h" +#include "execute.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "generator-setup.h" +#include "hashmap.h" 
+#include "install.h" +#include "io-util.h" +#include "label.h" +#include "locale-setup.h" +#include "load-fragment.h" +#include "log.h" +#include "macro.h" +#include "manager.h" +#include "memory-util.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "process-util.h" +#include "ratelimit.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "serialize.h" +#include "signal-util.h" +#include "socket-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "sysctl-util.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "time-util.h" +#include "transaction.h" +#include "umask-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "virt.h" +#include "watchdog.h" + +#define NOTIFY_RCVBUF_SIZE (8*1024*1024) +#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) + +/* Initial delay and the interval for printing status messages about running jobs */ +#define JOBS_IN_PROGRESS_WAIT_USEC (2*USEC_PER_SEC) +#define JOBS_IN_PROGRESS_QUIET_WAIT_USEC (25*USEC_PER_SEC) +#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3) +#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 + +/* If there are more than 1K bus messages queue across our API and direct buses, then let's not add more on top until + * the queue gets more empty. */ +#define MANAGER_BUS_BUSY_THRESHOLD 1024LU + +/* How many units and jobs to process of the bus queue before returning to the event loop. 
*/ +#define MANAGER_BUS_MESSAGE_BUDGET 100U + +static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); +static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); +static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); +static int manager_dispatch_timezone_change(sd_event_source *source, const struct inotify_event *event, void *userdata); +static int manager_run_environment_generators(Manager *m); +static int manager_run_generators(Manager *m); +static void manager_vacuum(Manager *m); + /* Returns the absolute CLOCK_MONOTONIC time for the next jobs-in-progress status: now plus JOBS_IN_PROGRESS_WAIT_USEC when show_status_on(m->show_status) is true, otherwise now plus the longer JOBS_IN_PROGRESS_QUIET_WAIT_USEC. The sum is formed with usec_add(). */ +static usec_t manager_watch_jobs_next_time(Manager *m) { + return usec_add(now(CLOCK_MONOTONIC), + show_status_on(m->show_status) ? JOBS_IN_PROGRESS_WAIT_USEC : + JOBS_IN_PROGRESS_QUIET_WAIT_USEC); +} + +static void manager_watch_jobs_in_progress(Manager *m) { + usec_t next; + int r; + + assert(m); + + /* We do not want to show the cylon animation if the user + * needs to confirm service executions otherwise confirmation + * messages will be screwed by the cylon animation. 
*/ + if (!manager_is_confirm_spawn_disabled(m)) + return; + + if (m->jobs_in_progress_event_source) + return; + + next = manager_watch_jobs_next_time(m); + r = sd_event_add_time( + m->event, + &m->jobs_in_progress_event_source, + CLOCK_MONOTONIC, + next, 0, + manager_dispatch_jobs_in_progress, m); + if (r < 0) + return; + + (void) sd_event_source_set_description(m->jobs_in_progress_event_source, "manager-jobs-in-progress"); +} + +#define CYLON_BUFFER_EXTRA (2*STRLEN(ANSI_RED) + STRLEN(ANSI_HIGHLIGHT_RED) + 2*STRLEN(ANSI_NORMAL)) + +/* Renders one frame of the cylon progress animation into buffer as a NUL-terminated string. The frame is a row of width cells: the cell at 1-based position pos holds the bright light, its direct neighbours hold dim lights, and every other cell is a blank. pos == 0 and pos == width+1 place the bright light just outside the row, so only the single dim neighbour is drawn. ANSI colour sequences are emitted only when log_get_show_color() is true. buflen must be at least CYLON_BUFFER_EXTRA + width + 1 (asserted). */ +static void draw_cylon(char buffer[], size_t buflen, unsigned width, unsigned pos) { + char *p = buffer; + + assert(buflen >= CYLON_BUFFER_EXTRA + width + 1); + assert(pos <= width+1); /* 0 or width+1 mean that the center light is behind the corner */ + + /* Leading blanks plus the dim light left of the center */ + if (pos > 1) { + if (pos > 2) + p = mempset(p, ' ', pos-2); + if (log_get_show_color()) + p = stpcpy(p, ANSI_RED); + *p++ = '*'; + } + + /* The bright center light, if it is inside the row */ + if (pos > 0 && pos <= width) { + if (log_get_show_color()) + p = stpcpy(p, ANSI_HIGHLIGHT_RED); + *p++ = '*'; + } + + if (log_get_show_color()) + p = stpcpy(p, ANSI_NORMAL); + + /* The dim light right of the center plus trailing blanks */ + if (pos < width) { + if (log_get_show_color()) + p = stpcpy(p, ANSI_RED); + *p++ = '*'; + if (pos < width-1) + p = mempset(p, ' ', width-1-pos); + if (log_get_show_color()) + p = stpcpy(p, ANSI_NORMAL); + } + + /* Terminate explicitly: with colors off none of the stpcpy() calls above ran, so nothing has written a NUL yet, and the caller hands us an uninitialized stack array. */ + *p = '\0'; +} + +/* Toggles show_status between SHOW_STATUS_AUTO and SHOW_STATUS_TEMPORARY: enable=true promotes AUTO to TEMPORARY, enable=false demotes TEMPORARY back to AUTO. Any other current setting is left alone. reason is passed through to manager_set_show_status(). */ +static void manager_flip_auto_status(Manager *m, bool enable, const char *reason) { + assert(m); + + if (enable) { + if (m->show_status == SHOW_STATUS_AUTO) + manager_set_show_status(m, SHOW_STATUS_TEMPORARY, reason); + } else { + if (m->show_status == SHOW_STATUS_TEMPORARY) + manager_set_show_status(m, SHOW_STATUS_AUTO, reason); + } +} + +static void manager_print_jobs_in_progress(Manager *m) { + _cleanup_free_ char *job_of_n = NULL; + Job *j; + unsigned counter = 0, print_nr; + char cylon[6 + CYLON_BUFFER_EXTRA + 1]; + unsigned cylon_pos; + char time[FORMAT_TIMESPAN_MAX], limit[FORMAT_TIMESPAN_MAX] = "no limit"; + uint64_t x; + + assert(m); + assert(m->n_running_jobs > 
0); + + manager_flip_auto_status(m, true, "delay"); + + print_nr = (m->jobs_in_progress_iteration / JOBS_IN_PROGRESS_PERIOD_DIVISOR) % m->n_running_jobs; + + HASHMAP_FOREACH(j, m->jobs) + if (j->state == JOB_RUNNING && counter++ == print_nr) + break; + + /* m->n_running_jobs must be consistent with the contents of m->jobs, + * so the above loop must have succeeded in finding j. */ + assert(counter == print_nr + 1); + assert(j); + + cylon_pos = m->jobs_in_progress_iteration % 14; + if (cylon_pos >= 8) + cylon_pos = 14 - cylon_pos; + draw_cylon(cylon, sizeof(cylon), 6, cylon_pos); + + m->jobs_in_progress_iteration++; + + if (m->n_running_jobs > 1) { + if (asprintf(&job_of_n, "(%u of %u) ", counter, m->n_running_jobs) < 0) + job_of_n = NULL; + } + + format_timespan(time, sizeof(time), now(CLOCK_MONOTONIC) - j->begin_usec, 1*USEC_PER_SEC); + if (job_get_timeout(j, &x) > 0) + format_timespan(limit, sizeof(limit), x - j->begin_usec, 1*USEC_PER_SEC); + + manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon, + "%sA %s job is running for %s (%s / %s)", + strempty(job_of_n), + job_type_to_string(j->type), + unit_status_string(j->unit), + time, limit); +} + /* Tri-state check for pending password queries. Returns true if /run/systemd/ask-password has an entry whose name starts with "ask.", false if it has none or the directory does not exist (ENOENT), and -errno if the directory cannot be opened. The "return -errno" handed to FOREACH_DIRENT_ALL is presumably its on-read-error action; confirm against the macro. */ +static int have_ask_password(void) { + _cleanup_closedir_ DIR *dir; + struct dirent *de; + + dir = opendir("/run/systemd/ask-password"); + if (!dir) { + if (errno == ENOENT) + return false; + else + return -errno; + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + if (startswith(de->d_name, "ask.")) + return true; + } + return false; +} + +static int manager_dispatch_ask_password_fd(sd_event_source *source, + int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(m); + + (void) flush_fd(fd); + + m->have_ask_password = have_ask_password(); + if (m->have_ask_password < 0) + /* Log error but continue. Negative have_ask_password + * is treated as unknown status. 
*/ + log_error_errno(m->have_ask_password, "Failed to list /run/systemd/ask-password: %m"); + + return 0; +} + /* Tears down the ask-password watch: drops the event source, closes the inotify fd, and resets have_ask_password to -EINVAL. Negative values of that field are treated as "unknown" by the rest of this file. */ +static void manager_close_ask_password(Manager *m) { + assert(m); + + m->ask_password_event_source = sd_event_source_unref(m->ask_password_event_source); + m->ask_password_inotify_fd = safe_close(m->ask_password_inotify_fd); + m->have_ask_password = -EINVAL; +} + +static int manager_check_ask_password(Manager *m) { + int r; + + assert(m); + + if (!m->ask_password_event_source) { + assert(m->ask_password_inotify_fd < 0); + + (void) mkdir_p_label("/run/systemd/ask-password", 0755); + + m->ask_password_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->ask_password_inotify_fd < 0) + return log_error_errno(errno, "Failed to create inotify object: %m"); + + r = inotify_add_watch_and_warn(m->ask_password_inotify_fd, + "/run/systemd/ask-password", + IN_CREATE|IN_DELETE|IN_MOVE); + if (r < 0) { + manager_close_ask_password(m); + return r; + } + + r = sd_event_add_io(m->event, &m->ask_password_event_source, + m->ask_password_inotify_fd, EPOLLIN, + manager_dispatch_ask_password_fd, m); + if (r < 0) { + log_error_errno(r, "Failed to add event source for /run/systemd/ask-password: %m"); + manager_close_ask_password(m); + return r; + } + + (void) sd_event_source_set_description(m->ask_password_event_source, "manager-ask-password"); + + /* Queries might have been added meanwhile... 
*/ + manager_dispatch_ask_password_fd(m->ask_password_event_source, + m->ask_password_inotify_fd, EPOLLIN, m); + } + + return m->have_ask_password; +} + /* Starts watching idle_pipe[2] for EPOLLIN with manager_dispatch_idle_pipe_fd. Does nothing and returns 0 if an event source already exists or that fd is not open. Returns 0 on success, or a negative error code (already logged) if the event source cannot be added. */ +static int manager_watch_idle_pipe(Manager *m) { + int r; + + assert(m); + + if (m->idle_pipe_event_source) + return 0; + + if (m->idle_pipe[2] < 0) + return 0; + + r = sd_event_add_io(m->event, &m->idle_pipe_event_source, m->idle_pipe[2], EPOLLIN, manager_dispatch_idle_pipe_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to watch idle pipe: %m"); + + (void) sd_event_source_set_description(m->idle_pipe_event_source, "manager-idle-pipe"); + + return 0; +} + /* Drops the idle pipe event source and hands both halves of the four-entry idle_pipe array (entries 0-1 and 2-3) to safe_close_pair(). */ +static void manager_close_idle_pipe(Manager *m) { + assert(m); + + m->idle_pipe_event_source = sd_event_source_unref(m->idle_pipe_event_source); + + safe_close_pair(m->idle_pipe); + safe_close_pair(m->idle_pipe + 2); +} + /* (Re)creates the time-change watch. Any previous event source and fd are released first, then a new fd is obtained from time_change_fd() and watched for EPOLLIN with manager_dispatch_time_change_fd at SD_EVENT_PRIORITY_NORMAL-1. Returns 0 on success and in test runs (where nothing is set up), or a negative error code (already logged) on failure. */ +static int manager_setup_time_change(Manager *m) { + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + m->time_change_event_source = sd_event_source_unref(m->time_change_event_source); + m->time_change_fd = safe_close(m->time_change_fd); + + m->time_change_fd = time_change_fd(); + if (m->time_change_fd < 0) + return log_error_errno(m->time_change_fd, "Failed to create timer change timer fd: %m"); + + r = sd_event_add_io(m->event, &m->time_change_event_source, m->time_change_fd, EPOLLIN, manager_dispatch_time_change_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to create time change event source: %m"); + + /* Schedule this slightly earlier than the .timer event sources */ + r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1); + if (r < 0) + return log_error_errno(r, "Failed to set priority of time change event sources: %m"); + + (void) sd_event_source_set_description(m->time_change_event_source, "manager-time-change"); + + log_debug("Set up TFD_TIMER_CANCEL_ON_SET timerfd."); + + return 0; +} + +static int manager_read_timezone_stat(Manager *m) { + struct stat st; + 
bool changed; + + assert(m); + + /* Read the current stat() data of /etc/localtime so that we detect changes */ + if (lstat("/etc/localtime", &st) < 0) { + log_debug_errno(errno, "Failed to stat /etc/localtime, ignoring: %m"); + changed = m->etc_localtime_accessible; + m->etc_localtime_accessible = false; + } else { + usec_t k; + + k = timespec_load(&st.st_mtim); + changed = !m->etc_localtime_accessible || k != m->etc_localtime_mtime; + + m->etc_localtime_mtime = k; + m->etc_localtime_accessible = true; + } + + return changed; +} + +static int manager_setup_timezone_change(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *new_event = NULL; + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + /* We watch /etc/localtime for three events: change of the link count (which might mean removal from /etc even + * though another link might be kept), renames, and file close operations after writing. Note we don't bother + * with IN_DELETE_SELF, as that would just report when the inode is removed entirely, i.e. after the link count + * went to zero and all fds to it are closed. + * + * Note that we never follow symlinks here. This is a simplification, but should cover almost all cases + * correctly. + * + * Note that we create the new event source first here, before releasing the old one. This should optimize + * behaviour as this way sd-event can reuse the old watch in case the inode didn't change. 
*/ + + r = sd_event_add_inotify(m->event, &new_event, "/etc/localtime", + IN_ATTRIB|IN_MOVE_SELF|IN_CLOSE_WRITE|IN_DONT_FOLLOW, manager_dispatch_timezone_change, m); + if (r == -ENOENT) { + /* If the file doesn't exist yet, subscribe to /etc instead, and wait until it is created either by + * O_CREATE or by rename() */ + + log_debug_errno(r, "/etc/localtime doesn't exist yet, watching /etc instead."); + r = sd_event_add_inotify(m->event, &new_event, "/etc", + IN_CREATE|IN_MOVED_TO|IN_ONLYDIR, manager_dispatch_timezone_change, m); + } + if (r < 0) + return log_error_errno(r, "Failed to create timezone change event source: %m"); + + /* Schedule this slightly earlier than the .timer event sources */ + r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1); + if (r < 0) + return log_error_errno(r, "Failed to set priority of timezone change event sources: %m"); + + sd_event_source_unref(m->timezone_change_event_source); + m->timezone_change_event_source = TAKE_PTR(new_event); + + return 0; +} + +/* Asks the kernel to send us SIGINT on ctrl-alt-del (reboot(RB_DISABLE_CAD)) and SIGWINCH on kbrequest (KDSIGACCEPT on /dev/tty0). This is best effort: failures are only logged as warnings and the function always returns 0. Nothing is done in test runs. */ +static int enable_special_signals(Manager *m) { + _cleanup_close_ int fd = -1; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + /* Enable that we get SIGINT on control-alt-del. In containers + * this will fail with EPERM (older) or EINVAL (newer), so + * ignore that. */ + if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL)) + log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m"); + + fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) { + /* Support systems without virtual console */ + if (fd != -ENOENT) + /* open_terminal() reports failure as a negative errno-style return value (hence the -ENOENT comparison above), so log that value rather than the global errno, which may be stale at this point. */ + log_warning_errno(fd, "Failed to open /dev/tty0: %m"); + } else { + /* Enable that we get SIGWINCH on kbrequest */ + if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0) + log_warning_errno(errno, "Failed to enable kbrequest handling: %m"); + } + + return 0; +} + +#define RTSIG_IF_AVAILABLE(signum) (signum <= SIGRTMAX ? 
signum : -1) + +static int manager_setup_signals(Manager *m) { + struct sigaction sa = { + .sa_handler = SIG_DFL, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + sigset_t mask; + int r; + + assert(m); + + assert_se(sigaction(SIGCHLD, &sa, NULL) == 0); + + /* We make liberal use of realtime signals here. On + * Linux/glibc we have 30 of them (with the exception of Linux + * on hppa, see below), between SIGRTMIN+0 ... SIGRTMIN+30 + * (aka SIGRTMAX). */ + + assert_se(sigemptyset(&mask) == 0); + sigset_add_many(&mask, + SIGCHLD, /* Child died */ + SIGTERM, /* Reexecute daemon */ + SIGHUP, /* Reload configuration */ + SIGUSR1, /* systemd/upstart: reconnect to D-Bus */ + SIGUSR2, /* systemd: dump status */ + SIGINT, /* Kernel sends us this on control-alt-del */ + SIGWINCH, /* Kernel sends us this on kbrequest (alt-arrowup) */ + SIGPWR, /* Some kernel drivers and upsd send us this on power failure */ + + SIGRTMIN+0, /* systemd: start default.target */ + SIGRTMIN+1, /* systemd: isolate rescue.target */ + SIGRTMIN+2, /* systemd: isolate emergency.target */ + SIGRTMIN+3, /* systemd: start halt.target */ + SIGRTMIN+4, /* systemd: start poweroff.target */ + SIGRTMIN+5, /* systemd: start reboot.target */ + SIGRTMIN+6, /* systemd: start kexec.target */ + + /* ... space for more special targets ... */ + + SIGRTMIN+13, /* systemd: Immediate halt */ + SIGRTMIN+14, /* systemd: Immediate poweroff */ + SIGRTMIN+15, /* systemd: Immediate reboot */ + SIGRTMIN+16, /* systemd: Immediate kexec */ + + /* ... space for more immediate system state changes ... */ + + SIGRTMIN+20, /* systemd: enable status messages */ + SIGRTMIN+21, /* systemd: disable status messages */ + SIGRTMIN+22, /* systemd: set log level to LOG_DEBUG */ + SIGRTMIN+23, /* systemd: set log level to LOG_INFO */ + SIGRTMIN+24, /* systemd: Immediate exit (--user only) */ + + /* .. one free signal here ... 
*/ + + /* Apparently Linux on hppa had fewer RT signals until v3.18, + * SIGRTMAX was SIGRTMIN+25, and then SIGRTMIN was lowered, + * see commit v3.17-7614-g1f25df2eff. + * + * We cannot unconditionally make use of those signals here, + * so let's use a runtime check. Since these commands are + * accessible by different means and only really a safety + * net, the missing functionality on hppa shouldn't matter. + */ + + RTSIG_IF_AVAILABLE(SIGRTMIN+26), /* systemd: set log target to journal-or-kmsg */ + RTSIG_IF_AVAILABLE(SIGRTMIN+27), /* systemd: set log target to console */ + RTSIG_IF_AVAILABLE(SIGRTMIN+28), /* systemd: set log target to kmsg */ + RTSIG_IF_AVAILABLE(SIGRTMIN+29), /* systemd: set log target to syslog-or-kmsg (obsolete) */ + + /* ... one free signal here SIGRTMIN+30 ... */ + -1); + assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + + m->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC); + if (m->signal_fd < 0) + return -errno; + + r = sd_event_add_io(m->event, &m->signal_event_source, m->signal_fd, EPOLLIN, manager_dispatch_signal_fd, m); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->signal_event_source, "manager-signal"); + + /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the + * notify processing can still figure out to which process/service a message belongs, before we reap the + * process. Also, process this before handling cgroup notifications, so that we always collect child exit + * status information before detecting that there's no process in a cgroup. 
*/ + r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(m)) + return enable_special_signals(m); + + return 0; +} + /* Strips from the string vector l every variable named in the list below, sorts what remains, and returns l itself. The vector is modified in place; no copy is made. */ +static char** sanitize_environment(char **l) { + + /* Let's remove some environment variables that we need ourselves to communicate with our clients */ + strv_env_unset_many( + l, + "CACHE_DIRECTORY", + "CONFIGURATION_DIRECTORY", + "CREDENTIALS_DIRECTORY", + "EXIT_CODE", + "EXIT_STATUS", + "INVOCATION_ID", + "JOURNAL_STREAM", + "LISTEN_FDNAMES", + "LISTEN_FDS", + "LISTEN_PID", + "LOGS_DIRECTORY", + "MAINPID", + "MANAGERPID", + "NOTIFY_SOCKET", + "PIDFILE", + "REMOTE_ADDR", + "REMOTE_PORT", + "RUNTIME_DIRECTORY", + "SERVICE_RESULT", + "STATE_DIRECTORY", + "WATCHDOG_PID", + "WATCHDOG_USEC", + NULL); + + /* Let's order the environment alphabetically, just to make it pretty */ + strv_sort(l); + + return l; +} + +int manager_default_environment(Manager *m) { + int r; + + assert(m); + + m->transient_environment = strv_free(m->transient_environment); + + if (MANAGER_IS_SYSTEM(m)) { + /* The system manager always starts with a clean + * environment for its children. It does not import + * the kernel's or the parents' exported variables. + * + * The initial passed environment is untouched to keep + * /proc/self/environ valid; it is used for tagging + * the init process inside containers. */ + m->transient_environment = strv_new("PATH=" DEFAULT_PATH); + if (!m->transient_environment) + return log_oom(); + + /* Import locale variables LC_*= from configuration */ + (void) locale_setup(&m->transient_environment); + } else { + _cleanup_free_ char *k = NULL; + + /* The user manager passes its own environment + * along to its children, except for $PATH. 
*/ + m->transient_environment = strv_copy(environ); + if (!m->transient_environment) + return log_oom(); + + k = strdup("PATH=" DEFAULT_USER_PATH); + if (!k) + return log_oom(); + + r = strv_env_replace(&m->transient_environment, k); + if (r < 0) + return log_oom(); + TAKE_PTR(k); + } + + sanitize_environment(m->transient_environment); + + return 0; +} + +static int manager_setup_prefix(Manager *m) { + struct table_entry { + uint64_t type; + const char *suffix; + }; + + static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL }, + [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL }, + }; + + static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_CONFIGURATION, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_CONFIGURATION, "log" }, + [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL }, + }; + + assert(m); + + const struct table_entry *p = MANAGER_IS_SYSTEM(m) ? 
paths_system : paths_user; + int r; + + for (ExecDirectoryType i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) { + r = sd_path_lookup(p[i].type, p[i].suffix, &m->prefix[i]); + if (r < 0) + return r; + } + + return 0; +} + /* Releases the unit_id_map, unit_name_map and unit_path_cache lookup tables and zeroes unit_cache_timestamp_hash. NOTE(review): unlike its siblings, this helper dereferences m without an assert(m). */ +static void manager_free_unit_name_maps(Manager *m) { + m->unit_id_map = hashmap_free(m->unit_id_map); + m->unit_name_map = hashmap_free(m->unit_name_map); + m->unit_path_cache = set_free(m->unit_path_cache); + m->unit_cache_timestamp_hash = 0; +} + /* Creates the deferred event source that invokes manager_dispatch_run_queue, at SD_EVENT_PRIORITY_IDLE and initially disabled (SD_EVENT_OFF). Asserts that no such source exists yet. Returns 0 on success or the negative error code of the first sd-event call that fails. */ +static int manager_setup_run_queue(Manager *m) { + int r; + + assert(m); + assert(!m->run_queue_event_source); + + r = sd_event_add_defer(m->event, &m->run_queue_event_source, manager_dispatch_run_queue, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(m->run_queue_event_source, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->run_queue_event_source, "manager-run-queue"); + + return 0; +} + /* Creates the deferred event source that invokes manager_dispatch_sigchld, at SD_EVENT_PRIORITY_NORMAL-7 and initially disabled (SD_EVENT_OFF). Asserts that no such source exists yet. Returns 0 on success or the negative error code of the first sd-event call that fails. */ +static int manager_setup_sigchld_event_source(Manager *m) { + int r; + + assert(m); + assert(!m->sigchld_event_source); + + r = sd_event_add_defer(m->event, &m->sigchld_event_source, manager_dispatch_sigchld, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->sigchld_event_source, "manager-sigchld"); + + return 0; +} + +int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager **_m) { + _cleanup_(manager_freep) Manager *m = NULL; + const char *e; + int r; + + assert(_m); + assert(IN_SET(scope, UNIT_FILE_SYSTEM, UNIT_FILE_USER)); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .unit_file_scope = scope, + .objective = _MANAGER_OBJECTIVE_INVALID, + + .status_unit_format = 
STATUS_UNIT_FORMAT_DEFAULT, + + .default_timer_accuracy_usec = USEC_PER_MINUTE, + .default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT, + .default_tasks_accounting = true, + .default_tasks_max = TASKS_MAX_UNSET, + .default_timeout_start_usec = DEFAULT_TIMEOUT_USEC, + .default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC, + .default_restart_usec = DEFAULT_RESTART_USEC, + + .original_log_level = -1, + .original_log_target = _LOG_TARGET_INVALID, + + .watchdog_overridden[WATCHDOG_RUNTIME] = USEC_INFINITY, + .watchdog_overridden[WATCHDOG_REBOOT] = USEC_INFINITY, + .watchdog_overridden[WATCHDOG_KEXEC] = USEC_INFINITY, + + .show_status_overridden = _SHOW_STATUS_INVALID, + + .notify_fd = -1, + .cgroups_agent_fd = -1, + .signal_fd = -1, + .time_change_fd = -1, + .user_lookup_fds = { -1, -1 }, + .private_listen_fd = -1, + .dev_autofs_fd = -1, + .cgroup_inotify_fd = -1, + .pin_cgroupfs_fd = -1, + .ask_password_inotify_fd = -1, + .idle_pipe = { -1, -1, -1, -1}, + + /* start as id #1, so that we can leave #0 around as "null-like" value */ + .current_job_id = 1, + + .have_ask_password = -EINVAL, /* we don't know */ + .first_boot = -1, + .test_run_flags = test_run_flags, + + .default_oom_policy = OOM_STOP, + }; + +#if ENABLE_EFI + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) + boot_timestamps(m->timestamps + MANAGER_TIMESTAMP_USERSPACE, + m->timestamps + MANAGER_TIMESTAMP_FIRMWARE, + m->timestamps + MANAGER_TIMESTAMP_LOADER); +#endif + + /* Prepare log fields we can use for structured logging */ + if (MANAGER_IS_SYSTEM(m)) { + m->unit_log_field = "UNIT="; + m->unit_log_format_string = "UNIT=%s"; + + m->invocation_log_field = "INVOCATION_ID="; + m->invocation_log_format_string = "INVOCATION_ID=%s"; + } else { + m->unit_log_field = "USER_UNIT="; + m->unit_log_format_string = "USER_UNIT=%s"; + + m->invocation_log_field = "USER_INVOCATION_ID="; + m->invocation_log_format_string = "USER_INVOCATION_ID=%s"; + } + + /* Reboot immediately if the user hits C-A-D more often than 7x 
per 2s */ + m->ctrl_alt_del_ratelimit = (RateLimit) { .interval = 2 * USEC_PER_SEC, .burst = 7 }; + + r = manager_default_environment(m); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->units, &string_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->cgroup_unit, &path_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&m->watch_bus, &string_hash_ops); + if (r < 0) + return r; + + r = prioq_ensure_allocated(&m->run_queue, compare_job_priority); + if (r < 0) + return r; + + r = manager_setup_prefix(m); + if (r < 0) + return r; + + e = secure_getenv("CREDENTIALS_DIRECTORY"); + if (e) { + m->received_credentials = strdup(e); + if (!m->received_credentials) + return -ENOMEM; + } + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + r = manager_setup_run_queue(m); + if (r < 0) + return r; + + if (test_run_flags == MANAGER_TEST_RUN_MINIMAL) { + m->cgroup_root = strdup(""); + if (!m->cgroup_root) + return -ENOMEM; + } else { + r = manager_setup_signals(m); + if (r < 0) + return r; + + r = manager_setup_cgroup(m); + if (r < 0) + return r; + + r = manager_setup_time_change(m); + if (r < 0) + return r; + + r = manager_read_timezone_stat(m); + if (r < 0) + return r; + + (void) manager_setup_timezone_change(m); + + r = manager_setup_sigchld_event_source(m); + if (r < 0) + return r; + } + + if (test_run_flags == 0) { + if (MANAGER_IS_SYSTEM(m)) + r = mkdir_label("/run/systemd/units", 0755); + else { + _cleanup_free_ char *units_path = NULL; + r = xdg_user_runtime_dir(&units_path, "/systemd/units"); + if (r < 0) + return r; + r = mkdir_p_label(units_path, 0755); + } + + if (r < 0 && r != -EEXIST) + return r; + } + + m->taint_usr = + !in_initrd() && + dir_is_empty("/usr") > 0; + + /* Note that we do not set up the notify fd here. We do that after deserialization, + * since they might have gotten serialized across the reexec. 
*/ + + *_m = TAKE_PTR(m); + + return 0; +} + +static int manager_setup_notify(Manager *m) { + int r; + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + if (m->notify_fd < 0) { + _cleanup_close_ int fd = -1; + union sockaddr_union sa; + socklen_t sa_len; + + /* First free all secondary fields */ + m->notify_socket = mfree(m->notify_socket); + m->notify_event_source = sd_event_source_unref(m->notify_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate notification socket: %m"); + + fd_inc_rcvbuf(fd, NOTIFY_RCVBUF_SIZE); + + m->notify_socket = path_join(m->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/notify"); + if (!m->notify_socket) + return log_oom(); + + r = sockaddr_un_set_path(&sa.un, m->notify_socket); + if (r < 0) + return log_error_errno(r, "Notify socket '%s' not valid for AF_UNIX socket address, refusing.", + m->notify_socket); + sa_len = r; + + (void) mkdir_parents_label(m->notify_socket, 0755); + (void) sockaddr_un_unlink(&sa.un); + + r = mac_selinux_bind(fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(r, "bind(%s) failed: %m", m->notify_socket); + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + m->notify_fd = TAKE_FD(fd); + + log_debug("Using notification socket %s", m->notify_socket); + } + + if (!m->notify_event_source) { + r = sd_event_add_io(m->event, &m->notify_event_source, m->notify_fd, EPOLLIN, manager_dispatch_notify_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + + /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which + * service an exit message belongs. 
*/ + r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) + return log_error_errno(r, "Failed to set priority of notify event source: %m"); + + (void) sd_event_source_set_description(m->notify_event_source, "manager-notify"); + } + + return 0; +} + +static int manager_setup_cgroups_agent(Manager *m) { + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/cgroups-agent", + }; + int r; + + /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering + * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and + * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on + * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number + * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, + * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX + * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and + * we thus won't lose messages. + * + * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen + * to it. The system instance hence listens on this special socket, but the user instances listen on the system + * bus for these messages. 
*/ + + if (MANAGER_IS_TEST_RUN(m)) + return 0; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Failed to determine whether unified cgroups hierarchy is used: %m"); + if (r > 0) /* We don't need this anymore on the unified hierarchy */ + return 0; + + if (m->cgroups_agent_fd < 0) { + _cleanup_close_ int fd = -1; + + /* First free all secondary fields */ + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); + + fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); + + (void) sockaddr_un_unlink(&sa.un); + + /* Only allow root to connect to this socket */ + RUN_WITH_UMASK(0077) + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + m->cgroups_agent_fd = TAKE_FD(fd); + } + + if (!m->cgroups_agent_event_source) { + r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); + + /* Process cgroups notifications early. Note that when the agent notification is received + * we'll just enqueue the unit in the cgroup empty queue, hence pick a high priority than + * that. Also see handling of cgroup inotify for the unified cgroup stuff. 
*/ + r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); + + (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); + } + + return 0; +} + +static int manager_setup_user_lookup_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID + * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation, + * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked + * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects + * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes, + * hence we establish this communication channel so that forked off processes can pass their UID/GID + * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple + * datagram, along with their unit name, so that we can share one communication socket pair among all units for + * this purpose. + * + * You might wonder why we need a communication channel for this that is independent of the usual notification + * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET + * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user + * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available. 
+ * + * Note that this function is called under two circumstances: when we first initialize (in which case we + * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload + * (in which case the socket pair already exists but we still need to allocate the event source for it). */ + + if (m->user_lookup_fds[0] < 0) { + + /* Free all secondary fields */ + safe_close_pair(m->user_lookup_fds); + m->user_lookup_event_source = sd_event_source_unref(m->user_lookup_event_source); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0) + return log_error_errno(errno, "Failed to allocate user lookup socket: %m"); + + (void) fd_inc_rcvbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE); + } + + if (!m->user_lookup_event_source) { + /* The sd-event calls below report failure through their negative return value, not through errno, + * hence log (and propagate) r rather than errno. */ + r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate user lookup event source: %m"); + + /* Process even earlier than the notify event source, so that we always know first about valid UID/GID + * resolutions */ + r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11); + if (r < 0) + return log_error_errno(r, "Failed to set priority of user lookup event source: %m"); + + (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup"); + } + + return 0; +} + +static unsigned manager_dispatch_cleanup_queue(Manager *m) { + Unit *u; + unsigned n = 0; + + assert(m); + + while ((u = m->cleanup_queue)) { + assert(u->in_cleanup_queue); + + unit_free(u); + n++; + } + + return n; +} + +enum { + GC_OFFSET_IN_PATH, /* This one is on the path we were traveling */ + GC_OFFSET_UNSURE, /* No clue */ + GC_OFFSET_GOOD, /* We still need this unit */ + GC_OFFSET_BAD, /* We don't need this unit anymore */ + _GC_OFFSET_MAX +}; + +static void unit_gc_mark_good(Unit *u, unsigned gc_marker) { + Unit *other; + 
void *v; + + u->gc_marker = gc_marker + GC_OFFSET_GOOD; + + /* Recursively mark referenced units as GOOD as well */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REFERENCES]) + if (other->gc_marker == gc_marker + GC_OFFSET_UNSURE) + unit_gc_mark_good(other, gc_marker); +} + +static void unit_gc_sweep(Unit *u, unsigned gc_marker) { + Unit *other; + bool is_bad; + void *v; + + assert(u); + + if (IN_SET(u->gc_marker - gc_marker, + GC_OFFSET_GOOD, GC_OFFSET_BAD, GC_OFFSET_UNSURE, GC_OFFSET_IN_PATH)) + return; + + if (u->in_cleanup_queue) + goto bad; + + if (!unit_may_gc(u)) + goto good; + + u->gc_marker = gc_marker + GC_OFFSET_IN_PATH; + + is_bad = true; + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REFERENCED_BY]) { + unit_gc_sweep(other, gc_marker); + + if (other->gc_marker == gc_marker + GC_OFFSET_GOOD) + goto good; + + if (other->gc_marker != gc_marker + GC_OFFSET_BAD) + is_bad = false; + } + + if (u->refs_by_target) { + const UnitRef *ref; + + LIST_FOREACH(refs_by_target, ref, u->refs_by_target) { + unit_gc_sweep(ref->source, gc_marker); + + if (ref->source->gc_marker == gc_marker + GC_OFFSET_GOOD) + goto good; + + if (ref->source->gc_marker != gc_marker + GC_OFFSET_BAD) + is_bad = false; + } + } + + if (is_bad) + goto bad; + + /* We were unable to find anything out about this entry, so + * let's investigate it later */ + u->gc_marker = gc_marker + GC_OFFSET_UNSURE; + unit_add_to_gc_queue(u); + return; + +bad: + /* We definitely know that this one is not useful anymore, so + * let's mark it for deletion */ + u->gc_marker = gc_marker + GC_OFFSET_BAD; + unit_add_to_cleanup_queue(u); + return; + +good: + unit_gc_mark_good(u, gc_marker); +} + +static unsigned manager_dispatch_gc_unit_queue(Manager *m) { + unsigned n = 0, gc_marker; + Unit *u; + + assert(m); + + /* log_debug("Running GC..."); */ + + m->gc_marker += _GC_OFFSET_MAX; + if (m->gc_marker + _GC_OFFSET_MAX <= _GC_OFFSET_MAX) + m->gc_marker = 1; + + gc_marker = m->gc_marker; + + while ((u = 
m->gc_unit_queue)) { + assert(u->in_gc_queue); + + unit_gc_sweep(u, gc_marker); + + LIST_REMOVE(gc_queue, m->gc_unit_queue, u); + u->in_gc_queue = false; + + n++; + + if (IN_SET(u->gc_marker - gc_marker, + GC_OFFSET_BAD, GC_OFFSET_UNSURE)) { + if (u->id) + log_unit_debug(u, "Collecting."); + u->gc_marker = gc_marker + GC_OFFSET_BAD; + unit_add_to_cleanup_queue(u); + } + } + + return n; +} + +static unsigned manager_dispatch_gc_job_queue(Manager *m) { + unsigned n = 0; + Job *j; + + assert(m); + + while ((j = m->gc_job_queue)) { + assert(j->in_gc_queue); + + LIST_REMOVE(gc_queue, m->gc_job_queue, j); + j->in_gc_queue = false; + + n++; + + if (!job_may_gc(j)) + continue; + + log_unit_debug(j->unit, "Collecting job."); + (void) job_finish_and_invalidate(j, JOB_COLLECTED, false, false); + } + + return n; +} + +static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) { + unsigned n = 0; + Unit *u; + int r; + + assert(m); + + while ((u = m->stop_when_unneeded_queue)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + assert(m->stop_when_unneeded_queue); + + assert(u->in_stop_when_unneeded_queue); + LIST_REMOVE(stop_when_unneeded_queue, m->stop_when_unneeded_queue, u); + u->in_stop_when_unneeded_queue = false; + + n++; + + if (!unit_is_unneeded(u)) + continue; + + log_unit_debug(u, "Unit is not needed anymore."); + + /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the + * service being unnecessary after a while. */ + + if (!ratelimit_below(&u->auto_stop_ratelimit)) { + log_unit_warning(u, "Unit not needed anymore, but not stopping since we tried this too often recently."); + continue; + } + + /* Ok, nobody needs us anymore. Sniff. 
Then let's commit suicide */ + r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r)); + } + + return n; +} + +static void manager_clear_jobs_and_units(Manager *m) { + Unit *u; + + assert(m); + + while ((u = hashmap_first(m->units))) + unit_free(u); + + manager_dispatch_cleanup_queue(m); + + assert(!m->load_queue); + assert(prioq_isempty(m->run_queue)); + assert(!m->dbus_unit_queue); + assert(!m->dbus_job_queue); + assert(!m->cleanup_queue); + assert(!m->gc_unit_queue); + assert(!m->gc_job_queue); + assert(!m->stop_when_unneeded_queue); + + assert(hashmap_isempty(m->jobs)); + assert(hashmap_isempty(m->units)); + + m->n_on_console = 0; + m->n_running_jobs = 0; + m->n_installed_jobs = 0; + m->n_failed_jobs = 0; +} + +Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + manager_clear_jobs_and_units(m); + + for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) + if (unit_vtable[c]->shutdown) + unit_vtable[c]->shutdown(m); + + /* Keep the cgroup hierarchy in place except when we know we are going down for good */ + manager_shutdown_cgroup(m, IN_SET(m->objective, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)); + + lookup_paths_flush_generator(&m->lookup_paths); + + bus_done(m); + manager_varlink_done(m); + + exec_runtime_vacuum(m); + hashmap_free(m->exec_runtime_by_id); + + dynamic_user_vacuum(m, false); + hashmap_free(m->dynamic_users); + + hashmap_free(m->units); + hashmap_free(m->units_by_invocation_id); + hashmap_free(m->jobs); + hashmap_free(m->watch_pids); + hashmap_free(m->watch_bus); + + prioq_free(m->run_queue); + + set_free(m->startup_units); + set_free(m->failed_units); + + sd_event_source_unref(m->signal_event_source); + sd_event_source_unref(m->sigchld_event_source); + sd_event_source_unref(m->notify_event_source); + sd_event_source_unref(m->cgroups_agent_event_source); + 
sd_event_source_unref(m->time_change_event_source); + sd_event_source_unref(m->timezone_change_event_source); + sd_event_source_unref(m->jobs_in_progress_event_source); + sd_event_source_unref(m->run_queue_event_source); + sd_event_source_unref(m->user_lookup_event_source); + + safe_close(m->signal_fd); + safe_close(m->notify_fd); + safe_close(m->cgroups_agent_fd); + safe_close(m->time_change_fd); + safe_close_pair(m->user_lookup_fds); + + manager_close_ask_password(m); + + manager_close_idle_pipe(m); + + sd_event_unref(m->event); + + free(m->notify_socket); + + lookup_paths_free(&m->lookup_paths); + strv_free(m->transient_environment); + strv_free(m->client_environment); + + hashmap_free(m->cgroup_unit); + manager_free_unit_name_maps(m); + + free(m->switch_root); + free(m->switch_root_init); + + rlimit_free_all(m->rlimit); + + assert(hashmap_isempty(m->units_requiring_mounts_for)); + hashmap_free(m->units_requiring_mounts_for); + + hashmap_free(m->uid_refs); + hashmap_free(m->gid_refs); + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) + m->prefix[dt] = mfree(m->prefix[dt]); + free(m->received_credentials); + + return mfree(m); +} + +static void manager_enumerate_perpetual(Manager *m) { + assert(m); + + if (m->test_run_flags == MANAGER_TEST_RUN_MINIMAL) + return; + + /* Let's ask every type to load all units from disk/kernel that it might know */ + for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) { + if (!unit_type_supported(c)) { + log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c)); + continue; + } + + if (unit_vtable[c]->enumerate_perpetual) + unit_vtable[c]->enumerate_perpetual(m); + } +} + +static void manager_enumerate(Manager *m) { + assert(m); + + if (m->test_run_flags == MANAGER_TEST_RUN_MINIMAL) + return; + + /* Let's ask every type to load all units from disk/kernel that it might know */ + for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) { + if (!unit_type_supported(c)) { + log_debug("Unit type .%s is not 
supported on this system.", unit_type_to_string(c)); + continue; + } + + if (unit_vtable[c]->enumerate) + unit_vtable[c]->enumerate(m); + } + + manager_dispatch_load_queue(m); +} + +static void manager_coldplug(Manager *m) { + Unit *u; + char *k; + int r; + + assert(m); + + log_debug("Invoking unit coldplug() handlers…"); + + /* Let's place the units back into their deserialized state */ + HASHMAP_FOREACH_KEY(u, k, m->units) { + + /* ignore aliases */ + if (u->id != k) + continue; + + r = unit_coldplug(u); + if (r < 0) + log_warning_errno(r, "We couldn't coldplug %s, proceeding anyway: %m", u->id); + } +} + +static void manager_catchup(Manager *m) { + Unit *u; + char *k; + + assert(m); + + log_debug("Invoking unit catchup() handlers…"); + + /* Let's catch up on any state changes that happened while we were reloading/reexecing */ + HASHMAP_FOREACH_KEY(u, k, m->units) { + + /* ignore aliases */ + if (u->id != k) + continue; + + unit_catchup(u); + } +} + +static void manager_distribute_fds(Manager *m, FDSet *fds) { + Unit *u; + + assert(m); + + HASHMAP_FOREACH(u, m->units) { + + if (fdset_size(fds) <= 0) + break; + + if (!UNIT_VTABLE(u)->distribute_fds) + continue; + + UNIT_VTABLE(u)->distribute_fds(u, fds); + } +} + +static bool manager_dbus_is_running(Manager *m, bool deserialized) { + Unit *u; + + assert(m); + + /* This checks whether the dbus instance we are supposed to expose our APIs on is up. We check both the socket + * and the service unit. If the 'deserialized' parameter is true we'll check the deserialized state of the unit + * rather than the current one. */ + + if (MANAGER_IS_TEST_RUN(m)) + return false; + + u = manager_get_unit(m, SPECIAL_DBUS_SOCKET); + if (!u) + return false; + if ((deserialized ? SOCKET(u)->deserialized_state : SOCKET(u)->state) != SOCKET_RUNNING) + return false; + + u = manager_get_unit(m, SPECIAL_DBUS_SERVICE); + if (!u) + return false; + if (!IN_SET((deserialized ? 
SERVICE(u)->deserialized_state : SERVICE(u)->state), SERVICE_RUNNING, SERVICE_RELOAD)) + return false; + + return true; +} + +static void manager_setup_bus(Manager *m) { + assert(m); + + /* Let's set up our private bus connection now, unconditionally */ + (void) bus_init_private(m); + + /* If we are in --user mode also connect to the system bus now */ + if (MANAGER_IS_USER(m)) + (void) bus_init_system(m); + + /* Let's connect to the bus now, but only if the unit is supposed to be up */ + if (manager_dbus_is_running(m, MANAGER_IS_RELOADING(m))) { + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } +} + +static void manager_preset_all(Manager *m) { + int r; + + assert(m); + + if (m->first_boot <= 0) + return; + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (MANAGER_IS_TEST_RUN(m)) + return; + + /* If this is the first boot, and we are in the host system, then preset everything */ + r = unit_file_preset_all(UNIT_FILE_SYSTEM, 0, NULL, UNIT_FILE_PRESET_ENABLE_ONLY, NULL, 0); + if (r < 0) + log_full_errno(r == -EEXIST ? 
LOG_NOTICE : LOG_WARNING, r, + "Failed to populate /etc with preset unit settings, ignoring: %m"); + else + log_info("Populated /etc with preset unit settings."); +} + +static void manager_ready(Manager *m) { + assert(m); + + /* After having loaded everything, do the final round of catching up with what might have changed */ + + m->objective = MANAGER_OK; /* Tell everyone we are up now */ + + /* It might be safe to log to the journal now and connect to dbus */ + manager_recheck_journal(m); + manager_recheck_dbus(m); + + /* Let's finally catch up with any changes that took place while we were reloading/reexecing */ + manager_catchup(m); + + m->honor_device_enumeration = true; +} + +static Manager* manager_reloading_start(Manager *m) { + m->n_reloading++; + return m; +} +static void manager_reloading_stopp(Manager **m) { + if (*m) { + assert((*m)->n_reloading > 0); + (*m)->n_reloading--; + } +} + +int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { + int r; + + assert(m); + + /* If we are running in test mode, we still want to run the generators, + * but we should not touch the real generator directories. */ + r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, + MANAGER_IS_TEST_RUN(m) ? 
LOOKUP_PATHS_TEMPORARY_GENERATED : 0, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to initialize path lookup table: %m"); + + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_START)); + r = manager_run_environment_generators(m); + if (r >= 0) + r = manager_run_generators(m); + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_FINISH)); + if (r < 0) + return r; + + manager_preset_all(m); + + lookup_paths_log(&m->lookup_paths); + + { + /* This block is (optionally) done with the reloading counter bumped */ + _cleanup_(manager_reloading_stopp) Manager *reloading = NULL; + + /* If we will deserialize make sure that during enumeration this is already known, so we increase the + * counter here already */ + if (serialization) + reloading = manager_reloading_start(m); + + /* First, enumerate what we can from all config files */ + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_START)); + manager_enumerate_perpetual(m); + manager_enumerate(m); + dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_FINISH)); + + /* Second, deserialize if there is something to deserialize */ + if (serialization) { + r = manager_deserialize(m, serialization, fds); + if (r < 0) + return log_error_errno(r, "Deserialization failed: %m"); + } + + /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass + * some file descriptors to us pre-initialized. This enables socket-based activation of entire + * containers. */ + manager_distribute_fds(m, fds); + + /* We might have deserialized the notify fd, but if we didn't then let's create the bus now */ + r = manager_setup_notify(m); + if (r < 0) + /* No sense to continue without notifications, our children would fail anyway. 
*/ + return r; + + r = manager_setup_cgroups_agent(m); + if (r < 0) + /* Likewise, no sense to continue without empty cgroup notifications. */ + return r; + + r = manager_setup_user_lookup_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + + /* Connect to the bus if we are good for it */ + manager_setup_bus(m); + + /* Now that we are connected to all possible buses, let's deserialize who is tracking us. */ + r = bus_track_coldplug(m, &m->subscribed, false, m->deserialized_subscribed); + if (r < 0) + log_warning_errno(r, "Failed to deserialized tracked clients, ignoring: %m"); + m->deserialized_subscribed = strv_free(m->deserialized_subscribed); + + r = manager_varlink_init(m); + if (r < 0) + log_warning_errno(r, "Failed to set up Varlink server, ignoring: %m"); + + /* Third, fire things up! */ + manager_coldplug(m); + + /* Clean up runtime objects */ + manager_vacuum(m); + + if (serialization) + /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the + * reload is finished */ + m->send_reloading_done = true; + } + + manager_ready(m); + + return 0; +} + +int manager_add_job( + Manager *m, + JobType type, + Unit *unit, + JobMode mode, + Set *affected_jobs, + sd_bus_error *error, + Job **ret) { + + Transaction *tr; + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(unit); + assert(mode < _JOB_MODE_MAX); + + if (mode == JOB_ISOLATE && type != JOB_START) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Isolate is only valid for start."); + + if (mode == JOB_ISOLATE && !unit->allow_isolate) + return sd_bus_error_setf(error, BUS_ERROR_NO_ISOLATION, "Operation refused, unit may not be isolated."); + + if (mode == JOB_TRIGGERING && type != JOB_STOP) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=triggering is only valid for stop."); + + log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), 
job_mode_to_string(mode)); + + type = job_type_collapse(type, unit); + + tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY); + if (!tr) + return -ENOMEM; + + r = transaction_add_job_and_dependencies(tr, type, unit, NULL, true, false, + IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS), + mode == JOB_IGNORE_DEPENDENCIES, error); + if (r < 0) + goto tr_abort; + + if (mode == JOB_ISOLATE) { + r = transaction_add_isolate_jobs(tr, m); + if (r < 0) + goto tr_abort; + } + + if (mode == JOB_TRIGGERING) { + r = transaction_add_triggering_jobs(tr, unit); + if (r < 0) + goto tr_abort; + } + + r = transaction_activate(tr, m, mode, affected_jobs, error); + if (r < 0) + goto tr_abort; + + log_unit_debug(unit, + "Enqueued job %s/%s as %u", unit->id, + job_type_to_string(type), (unsigned) tr->anchor_job->id); + + if (ret) + *ret = tr->anchor_job; + + transaction_free(tr); + return 0; + +tr_abort: + transaction_abort(tr); + transaction_free(tr); + return r; +} + +int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **ret) { + Unit *unit = NULL; /* just to appease gcc, initialization is not really necessary */ + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(name); + assert(mode < _JOB_MODE_MAX); + + r = manager_load_unit(m, name, NULL, NULL, &unit); + if (r < 0) + return r; + assert(unit); + + return manager_add_job(m, type, unit, mode, affected_jobs, e, ret); +} + +int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(m); + assert(type < _JOB_TYPE_MAX); + assert(name); + assert(mode < _JOB_MODE_MAX); + + r = manager_add_job_by_name(m, type, name, mode, affected_jobs, &error, ret); + if (r < 0) + return log_warning_errno(r, "Failed to enqueue %s job for %s: %s", job_mode_to_string(mode), name, 
bus_error_message(&error, r)); + + return r; +} + +int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) { + int r; + Transaction *tr; + + assert(m); + assert(unit); + assert(mode < _JOB_MODE_MAX); + assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */ + + tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY); + if (!tr) + return -ENOMEM; + + /* We need an anchor job */ + r = transaction_add_job_and_dependencies(tr, JOB_NOP, unit, NULL, false, false, true, true, e); + if (r < 0) + goto tr_abort; + + /* Failure in adding individual dependencies is ignored, so this always succeeds. */ + transaction_add_propagate_reload_jobs(tr, unit, tr->anchor_job, mode == JOB_IGNORE_DEPENDENCIES, e); + + r = transaction_activate(tr, m, mode, NULL, e); + if (r < 0) + goto tr_abort; + + transaction_free(tr); + return 0; + +tr_abort: + transaction_abort(tr); + transaction_free(tr); + return r; +} + +Job *manager_get_job(Manager *m, uint32_t id) { + assert(m); + + return hashmap_get(m->jobs, UINT32_TO_PTR(id)); +} + +Unit *manager_get_unit(Manager *m, const char *name) { + assert(m); + assert(name); + + return hashmap_get(m->units, name); +} + +static int manager_dispatch_target_deps_queue(Manager *m) { + Unit *u; + int r = 0; + + static const UnitDependency deps[] = { + UNIT_REQUIRED_BY, + UNIT_REQUISITE_OF, + UNIT_WANTED_BY, + UNIT_BOUND_BY + }; + + assert(m); + + while ((u = m->target_deps_queue)) { + assert(u->in_target_deps_queue); + + LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u); + u->in_target_deps_queue = false; + + for (size_t k = 0; k < ELEMENTSOF(deps); k++) { + Unit *target; + void *v; + + HASHMAP_FOREACH_KEY(v, target, u->dependencies[deps[k]]) { + r = unit_add_default_target_dependency(u, target); + if (r < 0) + return r; + } + } + } + + return r; +} + +unsigned manager_dispatch_load_queue(Manager *m) { + Unit *u; + unsigned n = 0; + + assert(m); + + /* Make sure we are not run recursively */ + if 
(m->dispatching_load_queue) + return 0; + + m->dispatching_load_queue = true; + + /* Dispatches the load queue. Takes a unit from the queue and + * tries to load its data until the queue is empty */ + + while ((u = m->load_queue)) { + assert(u->in_load_queue); + + unit_load(u); + n++; + } + + m->dispatching_load_queue = false; + + /* Dispatch the units waiting for their target dependencies to be added now, as all targets that we know about + * should be loaded and have aliases resolved */ + (void) manager_dispatch_target_deps_queue(m); + + return n; +} + +bool manager_unit_cache_should_retry_load(Unit *u) { + assert(u); + + /* Automatic reloading from disk only applies to units which were not found sometime in the past, and + * the not-found stub is kept pinned in the unit graph by dependencies. For units that were + * previously loaded, we don't do automatic reloading, and daemon-reload is necessary to update. */ + if (u->load_state != UNIT_NOT_FOUND) + return false; + + /* The cache has been updated since the last time we tried to load the unit. There might be new + * fragment paths to read. */ + if (u->manager->unit_cache_timestamp_hash != u->fragment_not_found_timestamp_hash) + return true; + + /* The cache needs to be updated because there are modifications on disk. */ + return !lookup_paths_timestamp_hash_same(&u->manager->lookup_paths, u->manager->unit_cache_timestamp_hash, NULL); +} + +int manager_load_unit_prepare( + Manager *m, + const char *name, + const char *path, + sd_bus_error *e, + Unit **_ret) { + + _cleanup_(unit_freep) Unit *cleanup_ret = NULL; + Unit *ret; + UnitType t; + int r; + + assert(m); + assert(_ret); + + /* This will prepare the unit for loading, but not actually load anything from disk. */ + + if (path && !path_is_absolute(path)) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path); + + if (!name) { + /* 'name' and 'path' must not both be null. 
Check here 'path' using assert_se() to + * workaround a bug in gcc that generates a -Wnonnull warning when calling basename(), + * but this cannot be possible in any code path (See #6119). */ + assert_se(path); + name = basename(path); + } + + t = unit_name_to_type(name); + + if (t == _UNIT_TYPE_INVALID || !unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is missing the instance name.", name); + + return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is not valid.", name); + } + + ret = manager_get_unit(m, name); + if (ret) { + /* The time-based cache allows to start new units without daemon-reload, + * but if they are already referenced (because of dependencies or ordering) + * then we have to force a load of the fragment. As an optimization, check + * first if anything in the usual paths was modified since the last time + * the cache was loaded. Also check if the last time an attempt to load the + * unit was made was before the most recent cache refresh, so that we know + * we need to try again — even if the cache is current, it might have been + * updated in a different context before we had a chance to retry loading + * this particular unit. 
*/
                if (manager_unit_cache_should_retry_load(ret))
                        ret->load_state = UNIT_STUB;
                else {
                        *_ret = ret;
                        return 1;
                }
        } else {
                ret = cleanup_ret = unit_new(m, unit_vtable[t]->object_size);
                if (!ret)
                        return -ENOMEM;
        }

        if (path) {
                r = free_and_strdup(&ret->fragment_path, path);
                if (r < 0)
                        return r;
        }

        r = unit_add_name(ret, name);
        if (r < 0)
                return r;

        unit_add_to_load_queue(ret);
        unit_add_to_dbus_queue(ret);
        unit_add_to_gc_queue(ret);

        *_ret = ret;
        cleanup_ret = NULL;

        return 0;
}

/* Prepares a unit via manager_load_unit_prepare() and, if that returned 0, dispatches the load queue and
 * replaces *_ret with the result of unit_follow_merge(). Any non-zero result of
 * manager_load_unit_prepare() is returned unchanged and the load queue is not dispatched. */
int manager_load_unit(
                Manager *m,
                const char *name,
                const char *path,
                sd_bus_error *e,
                Unit **_ret) {

        int r;

        assert(m);
        assert(_ret);

        /* This will load the service information files, but not actually
         * start any services or anything. */

        r = manager_load_unit_prepare(m, name, path, e, _ret);
        if (r != 0)
                return r;

        manager_dispatch_load_queue(m);

        *_ret = unit_follow_merge(*_ret);
        return 0;
}

/* Calls manager_load_unit() and then bus_unit_validate_load_state(); if either fails the error is logged at
 * error level and the result of log_error_errno() is returned. On success the unit is stored in *ret and 0
 * is returned. */
int manager_load_startable_unit_or_warn(
                Manager *m,
                const char *name,
                const char *path,
                Unit **ret) {

        /* Load a unit, make sure it loaded fully and is not masked. */

        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        Unit *unit;
        int r;

        r = manager_load_unit(m, name, path, &error, &unit);
        if (r < 0)
                /* The message says "unit" when a name was given and "unit file" when only a path was given. */
                return log_error_errno(r, "Failed to load %s %s: %s",
                                       name ? "unit" : "unit file", name ?: path,
                                       bus_error_message(&error, r));

        r = bus_unit_validate_load_state(unit, &error);
        if (r < 0)
                return log_error_errno(r, "%s", bus_error_message(&error, r));

        *ret = unit;
        return 0;
}

/* Calls job_dump() with f and prefix for every job stored in s->jobs. */
void manager_dump_jobs(Manager *s, FILE *f, const char *prefix) {
        Job *j;

        assert(s);
        assert(f);

        HASHMAP_FOREACH(j, s->jobs)
                job_dump(j, f, prefix);
}

/* Calls unit_dump() with f and prefix for the entries of s->units whose hashmap key is the unit's own id. */
void manager_dump_units(Manager *s, FILE *f, const char *prefix) {
        Unit *u;
        const char *t;

        assert(s);
        assert(f);

        HASHMAP_FOREACH_KEY(u, t, s->units)
                /* Pointer comparison: only entries keyed by u->id itself are dumped. NOTE(review): presumably
                 * this avoids dumping a unit once per additional name — confirm against unit_add_name(). */
                if (u->id == t)
                        unit_dump(u, f, prefix);
}

/* Writes one "Timestamp" line to f for each manager timestamp that is set, followed by the unit dump and the
 * job dump. prefix may be NULL, in which case strempty() substitutes it for the timestamp lines. */
void manager_dump(Manager *m, FILE *f, const char *prefix) {
        assert(m);
        assert(f);

        for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
                const dual_timestamp *t = m->timestamps + q;
                /* One buffer sized for whichever of the two formatters below is used. */
                char buf[CONST_MAX(FORMAT_TIMESPAN_MAX, FORMAT_TIMESTAMP_MAX)];

                if (dual_timestamp_is_set(t))
                        /* Use the realtime value when it is set, otherwise fall back to the monotonic value
                         * formatted as a time span. */
                        fprintf(f, "%sTimestamp %s: %s\n",
                                strempty(prefix),
                                manager_timestamp_to_string(q),
                                timestamp_is_set(t->realtime) ? format_timestamp(buf, sizeof buf, t->realtime) :
                                                                format_timespan(buf, sizeof buf, t->monotonic, 1));
        }

        manager_dump_units(m, f, prefix);
        manager_dump_jobs(m, f, prefix);
}

/* Renders manager_dump() into a memory stream. Returns -errno if the stream cannot be opened and the error
 * of fflush_and_check() if flushing fails; on success returns 0 and hands the buffer over to the caller via
 * *ret. */
int manager_get_dump_string(Manager *m, char **ret) {
        _cleanup_free_ char *dump = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        size_t size;
        int r;

        assert(m);
        assert(ret);

        f = open_memstream_unlocked(&dump, &size);
        if (!f)
                return -errno;

        manager_dump(m, f, NULL);

        r = fflush_and_check(f);
        if (r < 0)
                return r;

        /* Close the stream explicitly before the buffer pointer is handed out. */
        f = safe_fclose(f);

        *ret = TAKE_PTR(dump);

        return 0;
}

/* Finishes jobs with result JOB_CANCELED until hashmap_first(m->jobs) returns NULL. */
void manager_clear_jobs(Manager *m) {
        Job *j;

        assert(m);

        while ((j = hashmap_first(m->jobs)))
                /* No need to recurse. We're cancelling all jobs. */
                job_finish_and_invalidate(j, JOB_CANCELED, false, false);
}

/* Removes both m->watch_pids entries that belong to pid: the one keyed by pid and the one keyed by -pid. */
void manager_unwatch_pid(Manager *m, pid_t pid) {
        assert(m);

        /* First let's drop the unit keyed as "pid".
*/
        (void) hashmap_remove(m->watch_pids, PID_TO_PTR(pid));

        /* Then, let's also drop the array keyed by -pid. The removed value is freed here. */
        free(hashmap_remove(m->watch_pids, PID_TO_PTR(-pid)));
}

/* Event source callback: runs jobs for as long as prioq_peek() returns one from m->run_queue, then arms the
 * jobs-in-progress and idle-pipe watches if the respective counters are non-zero. Always returns 1. */
static int manager_dispatch_run_queue(sd_event_source *source, void *userdata) {
        Manager *m = userdata;
        Job *j;

        assert(source);
        assert(m);

        /* The loop only peeks; NOTE(review): it relies on job_run_and_invalidate() taking the job off the run
         * queue — confirm there. */
        while ((j = prioq_peek(m->run_queue))) {
                assert(j->installed);
                assert(j->in_run_queue);

                (void) job_run_and_invalidate(j);
        }

        if (m->n_running_jobs > 0)
                manager_watch_jobs_in_progress(m);

        if (m->n_on_console > 0)
                manager_watch_idle_pipe(m);

        return 1;
}

/* Sends change signals for queued units and jobs, limited by a per-call budget unless a reload is in
 * progress or pending. Returns the number of messages generated in this call (0 if nothing was done). */
static unsigned manager_dispatch_dbus_queue(Manager *m) {
        unsigned n = 0, budget;
        Unit *u;
        Job *j;

        assert(m);

        /* When we are reloading, let's not wait with generating signals, since we need to exit the manager as quickly
         * as we can. There's no point in throttling generation of signals in that case. */
        if (MANAGER_IS_RELOADING(m) || m->send_reloading_done || m->pending_reload_message)
                budget = (unsigned) -1; /* infinite budget in this case */
        else {
                /* Anything to do at all? */
                if (!m->dbus_unit_queue && !m->dbus_job_queue)
                        return 0;

                /* Do we have overly many messages queued at the moment? If so, let's not enqueue more on top, let's
                 * sit this cycle out, and process things in a later cycle when the queues got a bit emptier. */
                if (manager_bus_n_queued_write(m) > MANAGER_BUS_BUSY_THRESHOLD)
                        return 0;

                /* Only process a certain number of units/jobs per event loop iteration. Even if the bus queue wasn't
                 * overly full before this call we shouldn't increase it in size too wildly in one step, and we
                 * shouldn't monopolize CPU time with generating these messages.
Note the difference in counting of
                 * this "budget" and the "threshold" above: the "budget" is decreased only once per generated message,
                 * regardless how many buses/direct connections it is enqueued on, while the "threshold" is applied to
                 * each queued instance of bus message, i.e. if the same message is enqueued to five buses/direct
                 * connections it will be counted five times. This difference in counting ("references"
                 * vs. "instances") is primarily a result of the fact that it's easier to implement it this way,
                 * however it also reflects the thinking that the "threshold" should put a limit on used queue memory,
                 * i.e. space, while the "budget" should put a limit on time. Also note that the "threshold" is
                 * currently chosen much higher than the "budget". */
                budget = MANAGER_BUS_MESSAGE_BUDGET;
        }

        /* The queue head is re-read on every iteration; a budget of (unsigned) -1 is never decremented. */
        while (budget != 0 && (u = m->dbus_unit_queue)) {

                assert(u->in_dbus_queue);

                bus_unit_send_change_signal(u);
                n++;

                if (budget != (unsigned) -1)
                        budget--;
        }

        while (budget != 0 && (j = m->dbus_job_queue)) {
                assert(j->in_dbus_queue);

                bus_job_send_change_signal(j);
                n++;

                if (budget != (unsigned) -1)
                        budget--;
        }

        if (m->send_reloading_done) {
                m->send_reloading_done = false;
                bus_manager_send_reloading(m, false);
                n++;
        }

        if (m->pending_reload_message) {
                bus_send_pending_reload_message(m);
                n++;
        }

        return n;
}

/* Event source callback: receives one message from fd, validates it and passes it as a NUL-terminated
 * string to manager_notify_cgroup_empty() and bus_forward_agent_released(). Invalid messages are logged
 * and ignored (return 0); only a failing recv() is reported via log_error_errno(). */
static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;
        char buf[PATH_MAX];
        ssize_t n;

        n = recv(fd, buf, sizeof(buf), 0);
        if (n < 0)
                return log_error_errno(errno, "Failed to read cgroups agent message: %m");
        if (n == 0) {
                log_error("Got zero-length cgroups agent message, ignoring.");
                return 0;
        }
        /* A message that fills the buffer completely leaves no room for the terminating NUL written below. */
        if ((size_t) n >= sizeof(buf)) {
                log_error("Got overly long cgroups agent message, ignoring.");
                return 0;
        }

        if (memchr(buf, 0, n)) {
                log_error("Got cgroups agent message with embedded NUL byte, ignoring.");
                return 0;
        }
        buf[n] = 0;

        manager_notify_cgroup_empty(m, buf);
        (void) bus_forward_agent_released(m, buf);

        return 0;
}

/* Returns true if the tag list contains "BARRIER=1", in which case the caller drops the message. A warning
 * is logged if other tags accompany it, or if it is the only tag but the fd set does not hold exactly one
 * fd. Returns false if the tag is absent. */
static bool manager_process_barrier_fd(char * const *tags, FDSet *fds) {

        /* nothing else must be sent when using BARRIER=1 */
        if (strv_contains(tags, "BARRIER=1")) {
                if (strv_length(tags) == 1) {
                        if (fdset_size(fds) != 1)
                                log_warning("Got incorrect number of fds with BARRIER=1, closing them.");
                } else
                        log_warning("Extra notification messages sent with BARRIER=1, ignoring everything.");

                /* Drop the message if BARRIER=1 was found */
                return true;
        }

        return false;
}

/* Delivers a notification to one unit, at most once per value of m->notifygen. If the unit type has no
 * notify_message handler, the tags are only logged, and only when debug logging is enabled. */
static void manager_invoke_notify_message(
                Manager *m,
                Unit *u,
                const struct ucred *ucred,
                char * const *tags,
                FDSet *fds) {

        assert(m);
        assert(u);
        assert(ucred);
        assert(tags);

        if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */
                return;
        u->notifygen = m->notifygen;

        if (UNIT_VTABLE(u)->notify_message)
                UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds);

        else if (DEBUG_LOGGING) {
                _cleanup_free_ char *buf = NULL, *x = NULL, *y = NULL;

                /* Each step is skipped if the previous one returned NULL; strnull() covers a NULL result. */
                buf = strv_join(tags, ", ");
                if (buf)
                        x = ellipsize(buf, 20, 90);
                if (x)
                        y = cescape(x);

                log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y));
        }
}

/* Event source callback for m->notify_fd: receives one notification datagram together with its control
 * data, validates it and dispatches it to the units found for the sender's PID. */
static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {

        _cleanup_fdset_free_ FDSet *fds = NULL;
        Manager *m = userdata;
        /* One byte more than is offered to recvmsg, so that buf[n] can always take a terminating NUL. */
        char buf[NOTIFY_BUFFER_MAX+1];
        struct iovec iovec = {
                .iov_base = buf,
                .iov_len = sizeof(buf)-1,
        };
        /* Control buffer with room for one struct ucred and NOTIFY_FD_MAX file descriptors. */
        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
                         CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
        struct msghdr msghdr = {
                .msg_iov = &iovec,
                .msg_iovlen = 1,
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };

        struct cmsghdr *cmsg;
        struct ucred *ucred = NULL;
        _cleanup_free_ Unit **array_copy = NULL;
_cleanup_strv_free_ char **tags = NULL;
        Unit *u1, *u2, **array;
        int r, *fd_array = NULL;
        size_t n_fds = 0;
        bool found = false;
        ssize_t n;

        assert(m);
        assert(m->notify_fd == fd);

        if (revents != EPOLLIN) {
                log_warning("Got unexpected poll event for notify fd.");
                return 0;
        }

        /* The checks below compare n against negative errno values, i.e. recvmsg_safe() reports errors in
         * its return value. */
        n = recvmsg_safe(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
        if (IN_SET(n, -EAGAIN, -EINTR))
                return 0; /* Spurious wakeup, try again */
        if (n == -EXFULL) {
                log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
                return 0;
        }
        if (n < 0)
                /* If this is any other, real error, then let's stop processing this socket. This of course
                 * means we won't take notification messages anymore, but that's still better than busy
                 * looping around this: being woken up over and over again but being unable to actually read
                 * the message off the socket. */
                return log_error_errno(n, "Failed to receive notification message: %m");

        /* Pick the passed file descriptors and the sender credentials out of the control messages. */
        CMSG_FOREACH(cmsg, &msghdr) {
                if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {

                        assert(!fd_array);
                        fd_array = (int*) CMSG_DATA(cmsg);
                        n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);

                } else if (cmsg->cmsg_level == SOL_SOCKET &&
                           cmsg->cmsg_type == SCM_CREDENTIALS &&
                           cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {

                        assert(!ucred);
                        ucred = (struct ucred*) CMSG_DATA(cmsg);
                }
        }

        if (n_fds > 0) {
                assert(fd_array);

                r = fdset_new_array(&fds, fd_array, n_fds);
                if (r < 0) {
                        /* The fds did not make it into an FDSet, hence close them directly. */
                        close_many(fd_array, n_fds);
                        log_oom();
                        return 0;
                }
        }

        if (!ucred || !pid_is_valid(ucred->pid)) {
                log_warning("Received notify message without valid credentials. Ignoring.");
                return 0;
        }

        if ((size_t) n >= sizeof(buf) || (msghdr.msg_flags & MSG_TRUNC)) {
                log_warning("Received notify message exceeded maximum size. Ignoring.");
                return 0;
        }

        /* As extra safety check, let's make sure the string we get doesn't contain embedded NUL bytes.
We permit one
         * trailing NUL byte in the message, but don't expect it. */
        if (n > 1 && memchr(buf, 0, n-1)) {
                log_warning("Received notify message with embedded NUL bytes. Ignoring.");
                return 0;
        }

        /* Make sure it's NUL-terminated, then parse it to obtain the tags list */
        buf[n] = 0;
        tags = strv_split_newlines(buf);
        if (!tags) {
                log_oom();
                return 0;
        }

        /* possibly a barrier fd, let's see */
        if (manager_process_barrier_fd(tags, fds))
                return 0;

        /* Increase the generation counter used for filtering out duplicate unit invocations. */
        m->notifygen++;

        /* Notify every unit that might be interested, which might be multiple. */
        u1 = manager_get_unit_by_pid_cgroup(m, ucred->pid);
        u2 = hashmap_get(m->watch_pids, PID_TO_PTR(ucred->pid));
        array = hashmap_get(m->watch_pids, PID_TO_PTR(-ucred->pid));
        if (array) {
                size_t k = 0;

                /* Count the entries of the NULL-terminated array. */
                while (array[k])
                        k++;

                /* Work on a copy that includes the terminating NULL. If the allocation fails we only log,
                 * and the units from the array are not notified below. */
                array_copy = newdup(Unit*, array, k+1);
                if (!array_copy)
                        log_oom();
        }
        /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle duplicate
         * units and make sure we only invoke each unit's handler once. */
        if (u1) {
                manager_invoke_notify_message(m, u1, ucred, tags, fds);
                found = true;
        }
        if (u2) {
                manager_invoke_notify_message(m, u2, ucred, tags, fds);
                found = true;
        }
        if (array_copy)
                for (size_t i = 0; array_copy[i]; i++) {
                        manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds);
                        found = true;
                }

        if (!found)
                log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);

        if (fdset_size(fds) > 0)
                log_warning("Got extra auxiliary fds with notification message, closing them.");

        return 0;
}

/* Handles a child state change for one unit, at most once per value of m->sigchldgen: logs the child,
 * calls unit_unwatch_pid() for it and invokes the unit type's sigchld_event handler if there is one. */
static void manager_invoke_sigchld_event(
                Manager *m,
                Unit *u,
                const siginfo_t *si) {

        assert(m);
        assert(u);
        assert(si);

        /* Already invoked the handler of this unit in this iteration?
Then don't process this again */
        if (u->sigchldgen == m->sigchldgen)
                return;
        u->sigchldgen = m->sigchldgen;

        log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id);
        unit_unwatch_pid(u, si->si_pid);

        if (UNIT_VTABLE(u)->sigchld_event)
                UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status);
}

/* Event source callback: handles one waitable child per invocation. The child is first peeked at without
 * being reaped, the units found for it are informed, and then it is reaped. When no child is left (or
 * peeking fails) the event source is switched off. */
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
        Manager *m = userdata;
        siginfo_t si = {};
        int r;

        assert(source);
        assert(m);

        /* First we call waitid() for a PID and do not reap the zombie. That way we can still access /proc/$PID for it
         * while it is a zombie. */

        if (waitid(P_ALL, 0, &si, WEXITED|WNOHANG|WNOWAIT) < 0) {

                /* ECHILD is not logged; every failure, logged or not, turns the event source off. */
                if (errno != ECHILD)
                        log_error_errno(errno, "Failed to peek for child with waitid(), ignoring: %m");

                goto turn_off;
        }

        /* si was zero-initialized, so a non-positive si_pid means waitid() reported no child. */
        if (si.si_pid <= 0)
                goto turn_off;

        if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) {
                _cleanup_free_ Unit **array_copy = NULL;
                _cleanup_free_ char *name = NULL;
                Unit *u1, *u2, **array;

                /* Best effort: if this fails, name stays NULL and strna() covers that in the log line. */
                (void) get_process_comm(si.si_pid, &name);

                log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)",
                          si.si_pid, strna(name),
                          sigchld_code_to_string(si.si_code),
                          si.si_status,
                          strna(si.si_code == CLD_EXITED
                                ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
                                : signal_to_string(si.si_status)));

                /* Increase the generation counter used for filtering out duplicate unit invocations */
                m->sigchldgen++;

                /* And now figure out the unit this belongs to, it might be multiple...
*/
                u1 = manager_get_unit_by_pid_cgroup(m, si.si_pid);
                u2 = hashmap_get(m->watch_pids, PID_TO_PTR(si.si_pid));
                array = hashmap_get(m->watch_pids, PID_TO_PTR(-si.si_pid));
                if (array) {
                        size_t n = 0;

                        /* Count how many entries the array has */
                        while (array[n])
                                n++;

                        /* Make a copy of the array so that we don't trip up on the array changing beneath us.
                         * If the allocation fails we only log, and the units from the array are skipped below. */
                        array_copy = newdup(Unit*, array, n+1);
                        if (!array_copy)
                                log_oom();
                }

                /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but
                 * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for
                 * each iteration. */
                if (u1) {
                        /* We check for oom condition, in case we got SIGCHLD before the oom notification.
                         * We only do this for the cgroup the PID belonged to. */
                        (void) unit_check_oom(u1);

                        /* This only logs for now. In the future when the interface for kills/notifications
                         * is more stable we can extend service results table similar to how kernel oom kills
                         * are managed. */
                        (void) unit_check_oomd_kill(u1);

                        manager_invoke_sigchld_event(m, u1, &si);
                }
                if (u2)
                        manager_invoke_sigchld_event(m, u2, &si);
                if (array_copy)
                        for (size_t i = 0; array_copy[i]; i++)
                                manager_invoke_sigchld_event(m, array_copy[i], &si);
        }

        /* And now, we actually reap the zombie.
*/ + if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) { + log_error_errno(errno, "Failed to dequeue child, ignoring: %m"); + return 0; + } + + return 0; + +turn_off: + /* All children processed for now, turn off event source */ + + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to disable SIGCHLD event source: %m"); + + return 0; +} + +static void manager_start_target(Manager *m, const char *name, JobMode mode) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + log_debug("Activating special unit %s", name); + + r = manager_add_job_by_name(m, JOB_START, name, mode, NULL, &error, NULL); + if (r < 0) + log_error("Failed to enqueue %s job: %s", name, bus_error_message(&error, r)); +} + +static void manager_handle_ctrl_alt_del(Manager *m) { + /* If the user presses C-A-D more than + * 7 times within 2s, we reboot/shutdown immediately, + * unless it was disabled in system.conf */ + + if (ratelimit_below(&m->ctrl_alt_del_ratelimit) || m->cad_burst_action == EMERGENCY_ACTION_NONE) + manager_start_target(m, SPECIAL_CTRL_ALT_DEL_TARGET, JOB_REPLACE_IRREVERSIBLY); + else + emergency_action(m, m->cad_burst_action, EMERGENCY_ACTION_WARN, NULL, -1, + "Ctrl-Alt-Del was pressed more than 7 times within 2s"); +} + +static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + ssize_t n; + struct signalfd_siginfo sfsi; + int r; + + assert(m); + assert(m->signal_fd == fd); + + if (revents != EPOLLIN) { + log_warning("Got unexpected events from signal file descriptor."); + return 0; + } + + n = read(m->signal_fd, &sfsi, sizeof(sfsi)); + if (n != sizeof(sfsi)) { + if (n >= 0) { + log_warning("Truncated read from signal fd (%zu bytes), ignoring!", n); + return 0; + } + + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + /* We return an error here, which will kill this handler, + * to avoid a busy loop on read 
error. */ + return log_error_errno(errno, "Reading from signal fd failed: %m"); + } + + log_received_signal(sfsi.ssi_signo == SIGCHLD || + (sfsi.ssi_signo == SIGTERM && MANAGER_IS_USER(m)) + ? LOG_DEBUG : LOG_INFO, + &sfsi); + + switch (sfsi.ssi_signo) { + + case SIGCHLD: + r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to enable SIGCHLD event source, ignoring: %m"); + + break; + + case SIGTERM: + if (MANAGER_IS_SYSTEM(m)) { + /* This is for compatibility with the original sysvinit */ + if (verify_run_space_and_log("Refusing to reexecute") < 0) + break; + + m->objective = MANAGER_REEXECUTE; + break; + } + + _fallthrough_; + case SIGINT: + if (MANAGER_IS_SYSTEM(m)) + manager_handle_ctrl_alt_del(m); + else + manager_start_target(m, SPECIAL_EXIT_TARGET, + JOB_REPLACE_IRREVERSIBLY); + break; + + case SIGWINCH: + /* This is a nop on non-init */ + if (MANAGER_IS_SYSTEM(m)) + manager_start_target(m, SPECIAL_KBREQUEST_TARGET, JOB_REPLACE); + + break; + + case SIGPWR: + /* This is a nop on non-init */ + if (MANAGER_IS_SYSTEM(m)) + manager_start_target(m, SPECIAL_SIGPWR_TARGET, JOB_REPLACE); + + break; + + case SIGUSR1: + if (manager_dbus_is_running(m, false)) { + log_info("Trying to reconnect to bus..."); + + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } else { + log_info("Starting D-Bus service..."); + manager_start_target(m, SPECIAL_DBUS_SERVICE, JOB_REPLACE); + } + + break; + + case SIGUSR2: { + _cleanup_free_ char *dump = NULL; + + r = manager_get_dump_string(m, &dump); + if (r < 0) { + log_warning_errno(errno, "Failed to acquire manager dump: %m"); + break; + } + + log_dump(LOG_INFO, dump); + break; + } + + case SIGHUP: + if (verify_run_space_and_log("Refusing to reload") < 0) + break; + + m->objective = MANAGER_RELOAD; + break; + + default: { + + /* Starting SIGRTMIN+0 */ + static const struct { + const char *target; + JobMode mode; + } target_table[] = { + [0] 
= { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE }, + [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE }, + [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE }, + [3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY }, + [6] = { SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY }, + }; + + /* Starting SIGRTMIN+13, so that target halt and system halt are 10 apart */ + static const ManagerObjective objective_table[] = { + [0] = MANAGER_HALT, + [1] = MANAGER_POWEROFF, + [2] = MANAGER_REBOOT, + [3] = MANAGER_KEXEC, + }; + + if ((int) sfsi.ssi_signo >= SIGRTMIN+0 && + (int) sfsi.ssi_signo < SIGRTMIN+(int) ELEMENTSOF(target_table)) { + int idx = (int) sfsi.ssi_signo - SIGRTMIN; + manager_start_target(m, target_table[idx].target, + target_table[idx].mode); + break; + } + + if ((int) sfsi.ssi_signo >= SIGRTMIN+13 && + (int) sfsi.ssi_signo < SIGRTMIN+13+(int) ELEMENTSOF(objective_table)) { + m->objective = objective_table[sfsi.ssi_signo - SIGRTMIN - 13]; + break; + } + + switch (sfsi.ssi_signo - SIGRTMIN) { + + case 20: + manager_override_show_status(m, SHOW_STATUS_YES, "signal"); + break; + + case 21: + manager_override_show_status(m, SHOW_STATUS_NO, "signal"); + break; + + case 22: + manager_override_log_level(m, LOG_DEBUG); + break; + + case 23: + manager_restore_original_log_level(m); + break; + + case 24: + if (MANAGER_IS_USER(m)) { + m->objective = MANAGER_EXIT; + return 0; + } + + /* This is a nop on init */ + break; + + case 26: + case 29: /* compatibility: used to be mapped to LOG_TARGET_SYSLOG_OR_KMSG */ + manager_restore_original_log_target(m); + break; + + case 27: + manager_override_log_target(m, LOG_TARGET_CONSOLE); + break; + + case 28: + manager_override_log_target(m, LOG_TARGET_KMSG); + break; + + default: + log_warning("Got unhandled signal <%s>.", signal_to_string(sfsi.ssi_signo)); + } + }} + + return 0; +} + +static int 
manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        /* Event source callback for m->time_change_fd: logs the change, sets the watch up again and calls the
         * time_change handler of every unit whose type has one. Always returns 0. */
        Manager *m = userdata;
        Unit *u;

        assert(m);
        assert(m->time_change_fd == fd);

        log_struct(LOG_DEBUG,
                   "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR,
                   LOG_MESSAGE("Time has been changed"));

        /* Restart the watch */
        (void) manager_setup_time_change(m);

        HASHMAP_FOREACH(u, m->units)
                if (UNIT_VTABLE(u)->time_change)
                        UNIT_VTABLE(u)->time_change(u);

        return 0;
}

/* inotify callback for /etc/localtime: if manager_read_timezone_stat() returns a positive value, the watch
 * is set up again, tzset() is called and the timezone_change handler of every unit whose type has one is
 * invoked. A zero or negative result of manager_read_timezone_stat() is returned as is. */
static int manager_dispatch_timezone_change(
                sd_event_source *source,
                const struct inotify_event *e,
                void *userdata) {

        Manager *m = userdata;
        int changed;
        Unit *u;

        assert(m);

        log_debug("inotify event for /etc/localtime");

        changed = manager_read_timezone_stat(m);
        if (changed <= 0)
                return changed;

        /* Something changed, restart the watch, to ensure we watch the new /etc/localtime if it changed */
        (void) manager_setup_timezone_change(m);

        /* Read the new timezone */
        tzset();

        log_debug("Timezone has been changed (now: %s).", tzname[daylight]);

        HASHMAP_FOREACH(u, m->units)
                if (UNIT_VTABLE(u)->timezone_change)
                        UNIT_VTABLE(u)->timezone_change(u);

        return 0;
}

/* Event source callback for m->idle_pipe[2]: records whether console output shall be suppressed from now
 * on and closes the idle pipe. Always returns 0. */
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(m);
        assert(m->idle_pipe[2] == fd);

        /* There's at least one Type=idle child that just gave up on us waiting for the boot process to complete. Let's
         * now turn off any further console output if there's at least one service that needs console access, so that
         * from now on our own output should not spill into that service's output anymore. After all, we support
         * Type=idle only to beautify console output and it generally is set on services that want to own the console
         * exclusively without our interference.
*/
        m->no_console_output = m->n_on_console > 0;

        /* Acknowledge the child's request, and let all other children know too that they shouldn't wait any longer
         * by closing the pipes towards them, which is what they are waiting for. */
        manager_close_idle_pipe(m);

        return 0;
}

/* Timer callback: prints the jobs in progress and re-arms the timer source as a one-shot that fires
 * JOBS_IN_PROGRESS_PERIOD_USEC from now. Returns the error of sd_event_source_set_time_relative() if that
 * fails, otherwise the result of sd_event_source_set_enabled(). */
static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata) {
        Manager *m = userdata;
        int r;

        assert(m);
        assert(source);

        manager_print_jobs_in_progress(m);

        r = sd_event_source_set_time_relative(source, JOBS_IN_PROGRESS_PERIOD_USEC);
        if (r < 0)
                return r;

        return sd_event_source_set_enabled(source, SD_EVENT_ONESHOT);
}

/* Main loop: runs for as long as m->objective is MANAGER_OK and then returns the new objective. Returns
 * the result of log_error_errno() if the SIGCHLD event source cannot be enabled or sd_event_run() fails. */
int manager_loop(Manager *m) {
        /* Local rate limit: at most 50000 iterations per second before the loop is throttled below. */
        RateLimit rl = { .interval = 1*USEC_PER_SEC, .burst = 50000 };
        int r;

        assert(m);
        assert(m->objective == MANAGER_OK); /* Ensure manager_startup() has been called */

        manager_check_finished(m);

        /* There might still be some zombies hanging around from before we were exec()'ed. Let's reap them. */
        r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
        if (r < 0)
                return log_error_errno(r, "Failed to enable SIGCHLD event source: %m");

        while (m->objective == MANAGER_OK) {
                usec_t wait_usec, watchdog_usec;

                /* Ping the watchdog if it is running; if it is not running but a timeout is configured, try
                 * to set it up again. */
                watchdog_usec = manager_get_watchdog(m, WATCHDOG_RUNTIME);
                if (m->runtime_watchdog_running)
                        (void) watchdog_ping();
                else if (timestamp_is_set(watchdog_usec))
                        manager_retry_runtime_watchdog(m);

                if (!ratelimit_below(&rl)) {
                        /* Yay, something is going seriously wrong, pause a little */
                        log_warning("Looping too fast. Throttling execution a little.");
                        sleep(1);
                }

                /* Work through the queues in this fixed order. Whenever one of them reports a positive result
                 * the iteration is restarted from the top, so sd_event_run() is only reached once all of
                 * them report nothing to do. */
                if (manager_dispatch_load_queue(m) > 0)
                        continue;

                if (manager_dispatch_gc_job_queue(m) > 0)
                        continue;

                if (manager_dispatch_gc_unit_queue(m) > 0)
                        continue;

                if (manager_dispatch_cleanup_queue(m) > 0)
                        continue;

                if (manager_dispatch_cgroup_realize_queue(m) > 0)
                        continue;

                if (manager_dispatch_stop_when_unneeded_queue(m) > 0)
                        continue;

                if (manager_dispatch_dbus_queue(m) > 0)
                        continue;

                /* Sleep for watchdog runtime wait time */
                if (timestamp_is_set(watchdog_usec))
                        wait_usec = watchdog_runtime_wait();
                else
                        wait_usec = USEC_INFINITY;

                r = sd_event_run(m->event, wait_usec);
                if (r < 0)
                        return log_error_errno(r, "Failed to run event loop: %m");
        }

        return m->objective;
}

/* Resolves a bus object path to a unit. The path suffix is first tried as an invocation ID and looked up
 * in m->units_by_invocation_id; otherwise it must be a valid plain or instance unit name, which is then
 * loaded with manager_load_unit(). On success the unit is stored in *_u and 0 is returned. */
int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) {
        _cleanup_free_ char *n = NULL;
        sd_id128_t invocation_id;
        Unit *u;
        int r;

        assert(m);
        assert(s);
        assert(_u);

        r = unit_name_from_dbus_path(s, &n);
        if (r < 0)
                return r;

        /* Permit addressing units by invocation ID: if the passed bus path is suffixed by a 128bit ID then we use it
         * as invocation ID.
*/
        r = sd_id128_from_string(n, &invocation_id);
        if (r >= 0) {
                u = hashmap_get(m->units_by_invocation_id, &invocation_id);
                if (u) {
                        *_u = u;
                        return 0;
                }

                return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID,
                                         "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.",
                                         SD_ID128_FORMAT_VAL(invocation_id));
        }

        /* If this didn't work, we check if this is a unit name */
        if (!unit_name_is_valid(n, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
                _cleanup_free_ char *nn = NULL;

                /* Escape the name before it goes into the error message; strnull() covers a NULL result. */
                nn = cescape(n);
                return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS,
                                         "Unit name %s is neither a valid invocation ID nor unit name.", strnull(nn));
        }

        r = manager_load_unit(m, n, NULL, e, &u);
        if (r < 0)
                return r;

        *_u = u;
        return 0;
}

/* Resolves a bus object path of the form "/org/freedesktop/systemd1/job/<id>" to a job. Returns -EINVAL if
 * the prefix does not match, the error of safe_atou() if the id cannot be parsed, and -ENOENT if
 * manager_get_job() finds no job for the id. On success the job is stored in *_j and 0 is returned. */
int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j) {
        const char *p;
        unsigned id;
        Job *j;
        int r;

        assert(m);
        assert(s);
        assert(_j);

        p = startswith(s, "/org/freedesktop/systemd1/job/");
        if (!p)
                return -EINVAL;

        r = safe_atou(p, &id);
        if (r < 0)
                return r;

        j = manager_get_job(m, id);
        if (!j)
                return -ENOENT;

        *_j = j;

        return 0;
}

/* Sends an audit message of the given type for a service unit. The function body is empty unless
 * HAVE_AUDIT is set; with it, nothing is sent for non-system managers, while reloading, when no audit fd
 * is available, or for units that are not services. */
void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) {

#if HAVE_AUDIT
        _cleanup_free_ char *p = NULL;
        const char *msg;
        int audit_fd, r;

        if (!MANAGER_IS_SYSTEM(m))
                return;

        audit_fd = get_audit_fd();
        if (audit_fd < 0)
                return;

        /* Don't generate audit events if the service was already
         * started and we're just deserializing */
        if (MANAGER_IS_RELOADING(m))
                return;

        if (u->type != UNIT_SERVICE)
                return;

        r = unit_name_to_prefix_and_instance(u->id, &p);
        if (r < 0) {
                log_error_errno(r, "Failed to extract prefix and instance of unit name: %m");
                return;
        }

        msg = strjoina("unit=", p);
        if (audit_log_user_comm_message(audit_fd, type, msg, "systemd", NULL, NULL, NULL, success) < 0) {
                if (errno == EPERM)
                        /* We aren't allowed to send audit
messages?
                         * Then let's not retry again. */
                        close_audit_fd();
                else
                        log_warning_errno(errno, "Failed to send audit message: %m");
        }
#endif

}

/* Sends a message carrying the unit id to the plymouth socket. Nothing is sent while reloading, for
 * non-system managers, inside containers, or for units that are not services, mounts or swaps. All
 * failures are at most logged. */
void manager_send_unit_plymouth(Manager *m, Unit *u) {
        static const union sockaddr_union sa = PLYMOUTH_SOCKET;
        _cleanup_free_ char *message = NULL;
        _cleanup_close_ int fd = -1;
        int n = 0;

        /* Don't generate plymouth events if the service was already
         * started and we're just deserializing */
        if (MANAGER_IS_RELOADING(m))
                return;

        if (!MANAGER_IS_SYSTEM(m))
                return;

        if (detect_container() > 0)
                return;

        if (!IN_SET(u->type, UNIT_SERVICE, UNIT_MOUNT, UNIT_SWAP))
                return;

        /* We set SOCK_NONBLOCK here so that we rather drop the
         * message than wait for plymouth */
        fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
        if (fd < 0) {
                log_error_errno(errno, "socket() failed: %m");
                return;
        }

        if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
                /* EAGAIN, ENOENT and disconnect errors are not logged. */
                if (!IN_SET(errno, EAGAIN, ENOENT) && !ERRNO_IS_DISCONNECT(errno))
                        log_error_errno(errno, "connect() failed: %m");
                return;
        }

        /* The message is "U", the byte 0x02, one byte holding strlen(id) + 1, then the id. %n stores the
         * number of bytes produced so far in n. */
        if (asprintf(&message, "U\002%c%s%n", (int) (strlen(u->id) + 1), u->id, &n) < 0) {
                log_oom();
                return;
        }

        /* n + 1 bytes are written, i.e. the terminating NUL byte is sent as well. errno is cleared first so
         * that a short write that did not set errno is not judged by a stale value. */
        errno = 0;
        if (write(fd, message, n + 1) != n + 1)
                if (!IN_SET(errno, EAGAIN, ENOENT) && !ERRNO_IS_DISCONNECT(errno))
                        log_error_errno(errno, "Failed to write Plymouth message: %m");
}

/* Opens a serialization fd named "systemd-state" and wraps it in a FILE* opened with mode "w+". Returns
 * the error of open_serialization_fd(), or -errno if take_fdopen() fails; on success stores the stream in
 * *_f and returns 0. */
int manager_open_serialization(Manager *m, FILE **_f) {
        _cleanup_close_ int fd = -1;
        FILE *f;

        assert(_f);

        fd = open_serialization_fd("systemd-state");
        if (fd < 0)
                return fd;

        f = take_fdopen(&fd, "w+");
        if (!f)
                return -errno;

        *_f = f;
        return 0;
}

/* Tells whether the given manager timestamp is to be serialized: always outside of the initrd, and inside
 * the initrd for all timestamps except the ones listed below. */
static bool manager_timestamp_shall_serialize(ManagerTimestamp t) {

        if (!in_initrd())
                return true;

        /* The following timestamps only apply to the host system, hence only serialize them there */
        return !IN_SET(t,
                       MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH,
MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH,
                       MANAGER_TIMESTAMP_GENERATORS_START, MANAGER_TIMESTAMP_GENERATORS_FINISH,
                       MANAGER_TIMESTAMP_UNITS_LOAD_START, MANAGER_TIMESTAMP_UNITS_LOAD_FINISH);
}

/* Top bit of the 32-bit value stored per UID/GID in the reference tables; tested below when serializing. */
#define DESTROY_IPC_FLAG (UINT32_C(1) << 31)

/* Writes one field_name item per entry of *uid_refs whose value has DESTROY_IPC_FLAG set; all other
 * entries are skipped. */
static void manager_serialize_uid_refs_internal(
                Manager *m,
                FILE *f,
                Hashmap **uid_refs,
                const char *field_name) {

        void *p, *k;

        assert(m);
        assert(f);
        assert(uid_refs);
        assert(field_name);

        /* Serialize the UID reference table. Or actually, just the IPC destruction flag of it, as
         * the actual counter of it is better rebuilt after a reload/reexec. */

        HASHMAP_FOREACH_KEY(p, k, *uid_refs) {
                uint32_t c;
                uid_t uid;

                /* The hashmap key encodes the UID, the value the 32-bit word carrying the flag. */
                uid = PTR_TO_UID(k);
                c = PTR_TO_UINT32(p);

                if (!(c & DESTROY_IPC_FLAG))
                        continue;

                (void) serialize_item_format(f, field_name, UID_FMT, uid);
        }
}

/* Serializes m->uid_refs under the field name "destroy-ipc-uid". */
static void manager_serialize_uid_refs(Manager *m, FILE *f) {
        manager_serialize_uid_refs_internal(m, f, &m->uid_refs, "destroy-ipc-uid");
}

/* Serializes m->gid_refs under the field name "destroy-ipc-gid", using the same helper as for UIDs. */
static void manager_serialize_gid_refs(Manager *m, FILE *f) {
        manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid");
}

/* Writes the manager state to f and adds the file descriptors that need to be passed along to fds.
 * switching_root changes what is written: see the individual items below. */
int manager_serialize(
                Manager *m,
                FILE *f,
                FDSet *fds,
                bool switching_root) {

        const char *t;
        Unit *u;
        int r;

        assert(m);
        assert(f);
        assert(fds);

        /* manager_reloading_start() is paired with manager_reloading_stopp(), which the cleanup attribute
         * runs on every exit path of this function. */
        _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);

        (void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id);
        (void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs);
        (void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs);
        (void) serialize_bool(f, "taint-usr", m->taint_usr);
        (void) serialize_bool(f, "ready-sent", m->ready_sent);
        (void) serialize_bool(f, "taint-logged", m->taint_logged);
        (void) serialize_bool(f, "service-watchdogs", m->service_watchdogs);

        /* After switching root, udevd has not been started yet.
So, enumeration results should not be emitted. */
        (void) serialize_bool(f, "honor-device-enumeration", !switching_root);

        if (m->show_status_overridden != _SHOW_STATUS_INVALID)
                (void) serialize_item(f, "show-status-overridden",
                                      show_status_to_string(m->show_status_overridden));

        /* For overridden log settings the values currently in effect are what gets written. */
        if (m->log_level_overridden)
                (void) serialize_item_format(f, "log-level-override", "%i", log_get_max_level());
        if (m->log_target_overridden)
                (void) serialize_item(f, "log-target-override", log_target_to_string(log_get_target()));

        (void) serialize_usec(f, "runtime-watchdog-overridden", m->watchdog_overridden[WATCHDOG_RUNTIME]);
        (void) serialize_usec(f, "reboot-watchdog-overridden", m->watchdog_overridden[WATCHDOG_REBOOT]);
        (void) serialize_usec(f, "kexec-watchdog-overridden", m->watchdog_overridden[WATCHDOG_KEXEC]);

        for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
                _cleanup_free_ char *joined = NULL;

                if (!manager_timestamp_shall_serialize(q))
                        continue;

                /* The field name is the timestamp's string name with "-timestamp" appended. */
                joined = strjoin(manager_timestamp_to_string(q), "-timestamp");
                if (!joined)
                        return log_oom();

                (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
        }

        /* The client environment is not written when switching root. */
        if (!switching_root)
                (void) serialize_strv(f, "env", m->client_environment);

        if (m->notify_fd >= 0) {
                r = serialize_fd(f, fds, "notify-fd", m->notify_fd);
                if (r < 0)
                        return r;

                (void) serialize_item(f, "notify-socket", m->notify_socket);
        }

        if (m->cgroups_agent_fd >= 0) {
                r = serialize_fd(f, fds, "cgroups-agent-fd", m->cgroups_agent_fd);
                if (r < 0)
                        return r;
        }

        if (m->user_lookup_fds[0] >= 0) {
                int copy0, copy1;

                /* Both ends are duplicated into the fd set and written together as a single item. */
                copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]);
                if (copy0 < 0)
                        return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m");

                copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]);
                if (copy1 < 0)
                        return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m");

                (void) serialize_item_format(f, "user-lookup", "%i %i",
copy0, copy1); + } + + bus_track_serialize(m->subscribed, f, "subscribed"); + + r = dynamic_user_serialize(m, f, fds); + if (r < 0) + return r; + + manager_serialize_uid_refs(m, f); + manager_serialize_gid_refs(m, f); + + r = exec_runtime_serialize(m, f, fds); + if (r < 0) + return r; + + (void) fputc('\n', f); + + HASHMAP_FOREACH_KEY(u, t, m->units) { + if (u->id != t) + continue; + + /* Start marker */ + fputs(u->id, f); + fputc('\n', f); + + r = unit_serialize(u, f, fds, !switching_root); + if (r < 0) + return r; + } + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to flush serialization: %m"); + + r = bus_fdset_add_all(m, fds); + if (r < 0) + return log_error_errno(r, "Failed to add bus sockets to serialization: %m"); + + return 0; +} + +static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) { + Unit *u; + int r; + + r = manager_load_unit(m, name, NULL, NULL, &u); + if (r < 0) { + if (r == -ENOMEM) + return r; + return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name); + } + + r = unit_deserialize(u, f, fds); + if (r < 0) { + if (r == -ENOMEM) + return r; + return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name); + } + + return 0; +} + +static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) { + const char *unit_name; + int r; + + for (;;) { + _cleanup_free_ char *line = NULL; + /* Start marker */ + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + break; + + unit_name = strstrip(line); + + r = manager_deserialize_one_unit(m, unit_name, f, fds); + if (r == -ENOMEM) + return r; + if (r < 0) { + r = unit_deserialize_skip(f); + if (r < 0) + return r; + } + } + + return 0; +} + +usec_t manager_get_watchdog(Manager *m, WatchdogType t) { + assert(m); + + if (MANAGER_IS_USER(m)) + return USEC_INFINITY; + + if 
(timestamp_is_set(m->watchdog_overridden[t])) + return m->watchdog_overridden[t]; + + return m->watchdog[t]; +} + +void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) { + int r = 0; + + assert(m); + + if (MANAGER_IS_USER(m)) + return; + + if (m->watchdog[t] == timeout) + return; + + if (t == WATCHDOG_RUNTIME) + if (!timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME])) { + if (timestamp_is_set(timeout)) { + r = watchdog_set_timeout(&timeout); + + if (r >= 0) + m->runtime_watchdog_running = true; + } else { + watchdog_close(true); + m->runtime_watchdog_running = false; + } + } + + m->watchdog[t] = timeout; +} + +int manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) { + int r = 0; + + assert(m); + + if (MANAGER_IS_USER(m)) + return 0; + + if (m->watchdog_overridden[t] == timeout) + return 0; + + if (t == WATCHDOG_RUNTIME) { + usec_t *p; + + p = timestamp_is_set(timeout) ? &timeout : &m->watchdog[t]; + if (timestamp_is_set(*p)) { + r = watchdog_set_timeout(p); + + if (r >= 0) + m->runtime_watchdog_running = true; + } else { + watchdog_close(true); + m->runtime_watchdog_running = false; + } + } + + m->watchdog_overridden[t] = timeout; + + return 0; +} + +void manager_retry_runtime_watchdog(Manager *m) { + int r = 0; + + assert(m); + + if (timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME])) + r = watchdog_set_timeout(&m->watchdog_overridden[WATCHDOG_RUNTIME]); + else + r = watchdog_set_timeout(&m->watchdog[WATCHDOG_RUNTIME]); + + if (r >= 0) + m->runtime_watchdog_running = true; +} + +static void manager_deserialize_uid_refs_one_internal( + Manager *m, + Hashmap** uid_refs, + const char *value) { + + uid_t uid; + uint32_t c; + int r; + + assert(m); + assert(uid_refs); + assert(value); + + r = parse_uid(value, &uid); + if (r < 0 || uid == 0) { + log_debug("Unable to parse UID reference serialization: " UID_FMT, uid); + return; + } + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) { + log_oom(); 
+ return; + } + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + if (c & DESTROY_IPC_FLAG) + return; + + c |= DESTROY_IPC_FLAG; + + r = hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); + if (r < 0) { + log_debug_errno(r, "Failed to add UID reference entry: %m"); + return; + } +} + +static void manager_deserialize_uid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(m, &m->uid_refs, value); +} + +static void manager_deserialize_gid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(m, &m->gid_refs, value); +} + +int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { + int r = 0; + + assert(m); + assert(f); + + if (DEBUG_LOGGING) { + if (fdset_isempty(fds)) + log_debug("No file descriptors passed"); + else { + int fd; + + FDSET_FOREACH(fd, fds) { + _cleanup_free_ char *fn = NULL; + + r = fd_get_path(fd, &fn); + if (r < 0) + log_debug_errno(r, "Received serialized fd %i → %m", fd); + else + log_debug("Received serialized fd %i → %s", fd, strna(fn)); + } + } + } + + log_debug("Deserializing state..."); + + /* If we are not in reload mode yet, enter it now. Not that this is recursive, a caller might already have + * increased it to non-zero, which is why we just increase it by one here and down again at the end of this + * call. 
*/ + _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m); + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *val, *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + break; + + l = strstrip(line); + if (isempty(l)) /* end marker */ + break; + + if ((val = startswith(l, "current-job-id="))) { + uint32_t id; + + if (safe_atou32(val, &id) < 0) + log_notice("Failed to parse current job id value '%s', ignoring.", val); + else + m->current_job_id = MAX(m->current_job_id, id); + + } else if ((val = startswith(l, "n-installed-jobs="))) { + uint32_t n; + + if (safe_atou32(val, &n) < 0) + log_notice("Failed to parse installed jobs counter '%s', ignoring.", val); + else + m->n_installed_jobs += n; + + } else if ((val = startswith(l, "n-failed-jobs="))) { + uint32_t n; + + if (safe_atou32(val, &n) < 0) + log_notice("Failed to parse failed jobs counter '%s', ignoring.", val); + else + m->n_failed_jobs += n; + + } else if ((val = startswith(l, "taint-usr="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse taint /usr flag '%s', ignoring.", val); + else + m->taint_usr = m->taint_usr || b; + + } else if ((val = startswith(l, "ready-sent="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse ready-sent flag '%s', ignoring.", val); + else + m->ready_sent = m->ready_sent || b; + + } else if ((val = startswith(l, "taint-logged="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse taint-logged flag '%s', ignoring.", val); + else + m->taint_logged = m->taint_logged || b; + + } else if ((val = startswith(l, "service-watchdogs="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val); + else + m->service_watchdogs = b; + + } else if ((val = startswith(l, 
"honor-device-enumeration="))) { + int b; + + b = parse_boolean(val); + if (b < 0) + log_notice("Failed to parse honor-device-enumeration flag '%s', ignoring.", val); + else + m->honor_device_enumeration = b; + + } else if ((val = startswith(l, "show-status-overridden="))) { + ShowStatus s; + + s = show_status_from_string(val); + if (s < 0) + log_notice("Failed to parse show-status-overridden flag '%s', ignoring.", val); + else + manager_override_show_status(m, s, "deserialize"); + + } else if ((val = startswith(l, "log-level-override="))) { + int level; + + level = log_level_from_string(val); + if (level < 0) + log_notice("Failed to parse log-level-override value '%s', ignoring.", val); + else + manager_override_log_level(m, level); + + } else if ((val = startswith(l, "log-target-override="))) { + LogTarget target; + + target = log_target_from_string(val); + if (target < 0) + log_notice("Failed to parse log-target-override value '%s', ignoring.", val); + else + manager_override_log_target(m, target); + + } else if ((val = startswith(l, "runtime-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse runtime-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_RUNTIME, t); + + } else if ((val = startswith(l, "reboot-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse reboot-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_REBOOT, t); + + } else if ((val = startswith(l, "kexec-watchdog-overridden="))) { + usec_t t; + + if (deserialize_usec(val, &t) < 0) + log_notice("Failed to parse kexec-watchdog-overridden value '%s', ignoring.", val); + else + manager_override_watchdog(m, WATCHDOG_KEXEC, t); + + } else if (startswith(l, "env=")) { + r = deserialize_environment(l + 4, &m->client_environment); + if (r < 0) + log_notice_errno(r, "Failed to parse environment entry: 
\"%s\", ignoring: %m", l); + + } else if ((val = startswith(l, "notify-fd="))) { + int fd; + + if (safe_atoi(val, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_notice("Failed to parse notify fd, ignoring: \"%s\"", val); + else { + m->notify_event_source = sd_event_source_unref(m->notify_event_source); + safe_close(m->notify_fd); + m->notify_fd = fdset_remove(fds, fd); + } + + } else if ((val = startswith(l, "notify-socket="))) { + r = free_and_strdup(&m->notify_socket, val); + if (r < 0) + return r; + + } else if ((val = startswith(l, "cgroups-agent-fd="))) { + int fd; + + if (safe_atoi(val, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_notice("Failed to parse cgroups agent fd, ignoring.: %s", val); + else { + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + safe_close(m->cgroups_agent_fd); + m->cgroups_agent_fd = fdset_remove(fds, fd); + } + + } else if ((val = startswith(l, "user-lookup="))) { + int fd0, fd1; + + if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1)) + log_notice("Failed to parse user lookup fd, ignoring: %s", val); + else { + m->user_lookup_event_source = sd_event_source_unref(m->user_lookup_event_source); + safe_close_pair(m->user_lookup_fds); + m->user_lookup_fds[0] = fdset_remove(fds, fd0); + m->user_lookup_fds[1] = fdset_remove(fds, fd1); + } + + } else if ((val = startswith(l, "dynamic-user="))) + dynamic_user_deserialize_one(m, val, fds); + else if ((val = startswith(l, "destroy-ipc-uid="))) + manager_deserialize_uid_refs_one(m, val); + else if ((val = startswith(l, "destroy-ipc-gid="))) + manager_deserialize_gid_refs_one(m, val); + else if ((val = startswith(l, "exec-runtime="))) + (void) exec_runtime_deserialize_one(m, val, fds); + else if ((val = startswith(l, "subscribed="))) { + + if (strv_extend(&m->deserialized_subscribed, val) < 0) + return -ENOMEM; + + } else { + ManagerTimestamp q; + + for (q = 
0; q < _MANAGER_TIMESTAMP_MAX; q++) { + val = startswith(l, manager_timestamp_to_string(q)); + if (!val) + continue; + + val = startswith(val, "-timestamp="); + if (val) + break; + } + + if (q < _MANAGER_TIMESTAMP_MAX) /* found it */ + (void) deserialize_dual_timestamp(val, m->timestamps + q); + else if (!startswith(l, "kdbus-fd=")) /* ignore kdbus */ + log_notice("Unknown serialization item '%s', ignoring.", l); + } + } + + return manager_deserialize_units(m, f, fds); +} + +int manager_reload(Manager *m) { + _cleanup_(manager_reloading_stopp) Manager *reloading = NULL; + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + + r = manager_open_serialization(m, &f); + if (r < 0) + return log_error_errno(r, "Failed to create serialization file: %m"); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + /* We are officially in reload mode from here on. */ + reloading = manager_reloading_start(m); + + r = manager_serialize(m, f, fds, false); + if (r < 0) + return r; + + if (fseeko(f, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to beginning of serialization: %m"); + + /* 💀 This is the point of no return, from here on there is no way back. 💀 */ + reloading = NULL; + + bus_manager_send_reloading(m, true); + + /* Start by flushing out all jobs and units, all generated units, all runtime environments, all dynamic users + * and everything else that is worth flushing out. 
We'll get it all back from the serialization — if we need + * it.*/ + + manager_clear_jobs_and_units(m); + lookup_paths_flush_generator(&m->lookup_paths); + lookup_paths_free(&m->lookup_paths); + exec_runtime_vacuum(m); + dynamic_user_vacuum(m, false); + m->uid_refs = hashmap_free(m->uid_refs); + m->gid_refs = hashmap_free(m->gid_refs); + + r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL); + if (r < 0) + log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m"); + + (void) manager_run_environment_generators(m); + (void) manager_run_generators(m); + + lookup_paths_log(&m->lookup_paths); + + /* We flushed out generated files, for which we don't watch mtime, so we should flush the old map. */ + manager_free_unit_name_maps(m); + + /* First, enumerate what we can from kernel and suchlike */ + manager_enumerate_perpetual(m); + manager_enumerate(m); + + /* Second, deserialize our stored data */ + r = manager_deserialize(m, f, fds); + if (r < 0) + log_warning_errno(r, "Deserialization failed, proceeding anyway: %m"); + + /* We don't need the serialization anymore */ + f = safe_fclose(f); + + /* Re-register notify_fd as event source, and set up other sockets/communication channels we might need */ + (void) manager_setup_notify(m); + (void) manager_setup_cgroups_agent(m); + (void) manager_setup_user_lookup_fd(m); + + /* Third, fire things up! */ + manager_coldplug(m); + + /* Clean up runtime objects no longer referenced */ + manager_vacuum(m); + + /* Consider the reload process complete now. */ + assert(m->n_reloading > 0); + m->n_reloading--; + + /* On manager reloading, device tag data should exists, thus, we should honor the results of device + * enumeration. The flag should be always set correctly by the serialized data, but it may fail. So, + * let's always set the flag here for safety. 
*/ + m->honor_device_enumeration = true; + + manager_ready(m); + + m->send_reloading_done = true; + return 0; +} + +void manager_reset_failed(Manager *m) { + Unit *u; + + assert(m); + + HASHMAP_FOREACH(u, m->units) + unit_reset_failed(u); +} + +bool manager_unit_inactive_or_pending(Manager *m, const char *name) { + Unit *u; + + assert(m); + assert(name); + + /* Returns true if the unit is inactive or going down */ + u = manager_get_unit(m, name); + if (!u) + return true; + + return unit_inactive_or_pending(u); +} + +static void log_taint_string(Manager *m) { + _cleanup_free_ char *taint = NULL; + + assert(m); + + if (MANAGER_IS_USER(m) || m->taint_logged) + return; + + m->taint_logged = true; /* only check for taint once */ + + taint = manager_taint_string(m); + if (isempty(taint)) + return; + + log_struct(LOG_NOTICE, + LOG_MESSAGE("System is tainted: %s", taint), + "TAINT=%s", taint, + "MESSAGE_ID=" SD_MESSAGE_TAINTED_STR); +} + +static void manager_notify_finished(Manager *m) { + char userspace[FORMAT_TIMESPAN_MAX], initrd[FORMAT_TIMESPAN_MAX], kernel[FORMAT_TIMESPAN_MAX], sum[FORMAT_TIMESPAN_MAX]; + usec_t firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec; + + if (MANAGER_IS_TEST_RUN(m)) + return; + + if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) { + char ts[FORMAT_TIMESPAN_MAX]; + char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")] + = {}; + char *p = buf; + size_t size = sizeof buf; + + /* Note that MANAGER_TIMESTAMP_KERNEL's monotonic value is always at 0, and + * MANAGER_TIMESTAMP_FIRMWARE's and MANAGER_TIMESTAMP_LOADER's monotonic value should be considered + * negative values. 
*/ + + firmware_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic - m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic; + loader_usec = m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + total_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic + m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic; + + if (firmware_usec > 0) + size = strpcpyf(&p, size, "%s (firmware) + ", format_timespan(ts, sizeof(ts), firmware_usec, USEC_PER_MSEC)); + if (loader_usec > 0) + size = strpcpyf(&p, size, "%s (loader) + ", format_timespan(ts, sizeof(ts), loader_usec, USEC_PER_MSEC)); + + if (dual_timestamp_is_set(&m->timestamps[MANAGER_TIMESTAMP_INITRD])) { + + /* The initrd case on bare-metal*/ + kernel_usec = m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + initrd_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "KERNEL_USEC="USEC_FMT, kernel_usec, + "INITRD_USEC="USEC_FMT, initrd_usec, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (initrd) + %s (userspace) = %s.", + buf, + format_timespan(kernel, sizeof(kernel), kernel_usec, USEC_PER_MSEC), + format_timespan(initrd, sizeof(initrd), initrd_usec, USEC_PER_MSEC), + format_timespan(userspace, sizeof(userspace), userspace_usec, USEC_PER_MSEC), + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC))); + } else { + /* The initrd-less case on bare-metal*/ + + kernel_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic; + initrd_usec = 0; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "KERNEL_USEC="USEC_FMT, 
kernel_usec, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (userspace) = %s.", + buf, + format_timespan(kernel, sizeof(kernel), kernel_usec, USEC_PER_MSEC), + format_timespan(userspace, sizeof(userspace), userspace_usec, USEC_PER_MSEC), + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC))); + } + } else { + /* The container and --user case */ + firmware_usec = loader_usec = initrd_usec = kernel_usec = 0; + total_usec = userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_USER_STARTUP_FINISHED_STR, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Startup finished in %s.", + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC))); + } + + bus_manager_send_finished(m, firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec); + + sd_notifyf(false, + m->ready_sent ? "STATUS=Startup finished in %s." + : "READY=1\n" + "STATUS=Startup finished in %s.", + format_timespan(sum, sizeof(sum), total_usec, USEC_PER_MSEC)); + m->ready_sent = true; + + log_taint_string(m); +} + +static void manager_send_ready(Manager *m) { + assert(m); + + /* We send READY=1 on reaching basic.target only when running in --user mode. 
*/ + if (!MANAGER_IS_USER(m) || m->ready_sent) + return; + + m->ready_sent = true; + + sd_notifyf(false, + "READY=1\n" + "STATUS=Reached " SPECIAL_BASIC_TARGET "."); +} + +static void manager_check_basic_target(Manager *m) { + Unit *u; + + assert(m); + + /* Small shortcut */ + if (m->ready_sent && m->taint_logged) + return; + + u = manager_get_unit(m, SPECIAL_BASIC_TARGET); + if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + /* For user managers, send out READY=1 as soon as we reach basic.target */ + manager_send_ready(m); + + /* Log the taint string as soon as we reach basic.target */ + log_taint_string(m); +} + +void manager_check_finished(Manager *m) { + assert(m); + + if (MANAGER_IS_RELOADING(m)) + return; + + /* Verify that we have entered the event loop already, and not left it again. */ + if (!MANAGER_IS_RUNNING(m)) + return; + + manager_check_basic_target(m); + + if (hashmap_size(m->jobs) > 0) { + if (m->jobs_in_progress_event_source) + /* Ignore any failure, this is only for feedback */ + (void) sd_event_source_set_time(m->jobs_in_progress_event_source, + manager_watch_jobs_next_time(m)); + return; + } + + /* The jobs hashmap tends to grow a lot during boot, and then it's not reused until shutdown. Let's + kill the hashmap if it is relatively large. 
*/ + if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10) + m->jobs = hashmap_free(m->jobs); + + manager_flip_auto_status(m, false, "boot finished"); + + /* Notify Type=idle units that we are done now */ + manager_close_idle_pipe(m); + + /* Turn off confirm spawn now */ + m->confirm_spawn = NULL; + + /* No need to update ask password status when we're going non-interactive */ + manager_close_ask_password(m); + + /* This is no longer the first boot */ + manager_set_first_boot(m, false); + + if (MANAGER_IS_FINISHED(m)) + return; + + dual_timestamp_get(m->timestamps + MANAGER_TIMESTAMP_FINISH); + + manager_notify_finished(m); + + manager_invalidate_startup_units(m); +} + +static bool generator_path_any(const char* const* paths) { + char **path; + bool found = false; + + /* Optimize by skipping the whole process by not creating output directories + * if no generators are found. */ + STRV_FOREACH(path, (char**) paths) + if (access(*path, F_OK) == 0) + found = true; + else if (errno != ENOENT) + log_warning_errno(errno, "Failed to open generator directory %s: %m", *path); + + return found; +} + +static int manager_run_environment_generators(Manager *m) { + char **tmp = NULL; /* this is only used in the forked process, no cleanup here */ + _cleanup_strv_free_ char **paths = NULL; + void* args[] = { + [STDOUT_GENERATE] = &tmp, + [STDOUT_COLLECT] = &tmp, + [STDOUT_CONSUME] = &m->transient_environment, + }; + int r; + + if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_ENV_GENERATORS)) + return 0; + + paths = env_generator_binary_paths(MANAGER_IS_SYSTEM(m)); + if (!paths) + return log_oom(); + + if (!generator_path_any((const char* const*) paths)) + return 0; + + RUN_WITH_UMASK(0022) + r = execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, gather_environment, + args, NULL, m->transient_environment, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + return r; +} + +static int manager_run_generators(Manager *m) { + _cleanup_strv_free_ 
char **paths = NULL; + const char *argv[5]; + int r; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_GENERATORS)) + return 0; + + paths = generator_binary_paths(m->unit_file_scope); + if (!paths) + return log_oom(); + + if (!generator_path_any((const char* const*) paths)) + return 0; + + r = lookup_paths_mkdir_generator(&m->lookup_paths); + if (r < 0) { + log_error_errno(r, "Failed to create generator directories: %m"); + goto finish; + } + + argv[0] = NULL; /* Leave this empty, execute_directory() will fill something in */ + argv[1] = m->lookup_paths.generator; + argv[2] = m->lookup_paths.generator_early; + argv[3] = m->lookup_paths.generator_late; + argv[4] = NULL; + + RUN_WITH_UMASK(0022) + (void) execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, NULL, NULL, + (char**) argv, m->transient_environment, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + + r = 0; + +finish: + lookup_paths_trim_generator(&m->lookup_paths); + return r; +} + +int manager_transient_environment_add(Manager *m, char **plus) { + char **a; + + assert(m); + + if (strv_isempty(plus)) + return 0; + + a = strv_env_merge(2, m->transient_environment, plus); + if (!a) + return log_oom(); + + sanitize_environment(a); + + return strv_free_and_replace(m->transient_environment, a); +} + +int manager_client_environment_modify( + Manager *m, + char **minus, + char **plus) { + + char **a = NULL, **b = NULL, **l; + + assert(m); + + if (strv_isempty(minus) && strv_isempty(plus)) + return 0; + + l = m->client_environment; + + if (!strv_isempty(minus)) { + a = strv_env_delete(l, 1, minus); + if (!a) + return -ENOMEM; + + l = a; + } + + if (!strv_isempty(plus)) { + b = strv_env_merge(2, l, plus); + if (!b) { + strv_free(a); + return -ENOMEM; + } + + l = b; + } + + if (m->client_environment != l) + strv_free(m->client_environment); + + if (a != l) + strv_free(a); + if (b != l) + strv_free(b); + + m->client_environment = sanitize_environment(l); + return 
0; +} + +int manager_get_effective_environment(Manager *m, char ***ret) { + char **l; + + assert(m); + assert(ret); + + l = strv_env_merge(2, m->transient_environment, m->client_environment); + if (!l) + return -ENOMEM; + + *ret = l; + return 0; +} + +int manager_set_default_rlimits(Manager *m, struct rlimit **default_rlimit) { + assert(m); + + for (unsigned i = 0; i < _RLIMIT_MAX; i++) { + m->rlimit[i] = mfree(m->rlimit[i]); + + if (!default_rlimit[i]) + continue; + + m->rlimit[i] = newdup(struct rlimit, default_rlimit[i], 1); + if (!m->rlimit[i]) + return log_oom(); + } + + return 0; +} + +void manager_recheck_dbus(Manager *m) { + assert(m); + + /* Connects to the bus if the dbus service and socket are running. If we are running in user mode this is all + * it does. In system mode we'll also connect to the system bus (which will most likely just reuse the + * connection of the API bus). That's because the system bus after all runs as service of the system instance, + * while in the user instance we can assume it's already there. 
*/ + + if (MANAGER_IS_RELOADING(m)) + return; /* don't check while we are reloading… */ + + if (manager_dbus_is_running(m, false)) { + (void) bus_init_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_init_system(m); + } else { + (void) bus_done_api(m); + + if (MANAGER_IS_SYSTEM(m)) + (void) bus_done_system(m); + } +} + +static bool manager_journal_is_running(Manager *m) { + Unit *u; + + assert(m); + + if (MANAGER_IS_TEST_RUN(m)) + return false; + + /* If we are the user manager we can safely assume that the journal is up */ + if (!MANAGER_IS_SYSTEM(m)) + return true; + + /* Check that the socket is not only up, but in RUNNING state */ + u = manager_get_unit(m, SPECIAL_JOURNALD_SOCKET); + if (!u) + return false; + if (SOCKET(u)->state != SOCKET_RUNNING) + return false; + + /* Similar, check if the daemon itself is fully up, too */ + u = manager_get_unit(m, SPECIAL_JOURNALD_SERVICE); + if (!u) + return false; + if (!IN_SET(SERVICE(u)->state, SERVICE_RELOAD, SERVICE_RUNNING)) + return false; + + return true; +} + +void disable_printk_ratelimit(void) { + /* Disable kernel's printk ratelimit. + * + * Logging to /dev/kmsg is most useful during early boot and shutdown, where normal logging + * mechanisms are not available. The semantics of this sysctl are such that any kernel command-line + * setting takes precedence. */ + int r; + + r = sysctl_write("kernel/printk_devkmsg", "on"); + if (r < 0) + log_debug_errno(r, "Failed to set sysctl kernel.printk_devkmsg=on: %m"); +} + +void manager_recheck_journal(Manager *m) { + + assert(m); + + /* Don't bother with this unless we are in the special situation of being PID 1 */ + if (getpid_cached() != 1) + return; + + /* Don't check this while we are reloading, things might still change */ + if (MANAGER_IS_RELOADING(m)) + return; + + /* The journal is fully and entirely up? If so, let's permit logging to it, if that's configured. 
If the + * journal is down, don't ever log to it, otherwise we might end up deadlocking ourselves as we might trigger + * an activation ourselves we can't fulfill. */ + log_set_prohibit_ipc(!manager_journal_is_running(m)); + log_open(); +} + +static ShowStatus manager_get_show_status(Manager *m) { + assert(m); + + if (MANAGER_IS_USER(m)) + return _SHOW_STATUS_INVALID; + + if (m->show_status_overridden != _SHOW_STATUS_INVALID) + return m->show_status_overridden; + + return m->show_status; +} + +bool manager_get_show_status_on(Manager *m) { + assert(m); + + return show_status_on(manager_get_show_status(m)); +} + +static void set_show_status_marker(bool b) { + if (b) + (void) touch("/run/systemd/show-status"); + else + (void) unlink("/run/systemd/show-status"); +} + +void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason) { + assert(m); + assert(reason); + assert(mode >= 0 && mode < _SHOW_STATUS_MAX); + + if (MANAGER_IS_USER(m)) + return; + + if (mode == m->show_status) + return; + + if (m->show_status_overridden == _SHOW_STATUS_INVALID) { + bool enabled; + + enabled = show_status_on(mode); + log_debug("%s (%s) showing of status (%s).", + enabled ? "Enabling" : "Disabling", + strna(show_status_to_string(mode)), + reason); + + set_show_status_marker(enabled); + } + + m->show_status = mode; +} + +void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason) { + assert(m); + assert(mode < _SHOW_STATUS_MAX); + + if (MANAGER_IS_USER(m)) + return; + + if (mode == m->show_status_overridden) + return; + + m->show_status_overridden = mode; + + if (mode == _SHOW_STATUS_INVALID) + mode = m->show_status; + + log_debug("%s (%s) showing of status (%s).", + m->show_status_overridden != _SHOW_STATUS_INVALID ? 
"Overriding" : "Restoring", + strna(show_status_to_string(mode)), + reason); + + set_show_status_marker(show_status_on(mode)); +} + +const char *manager_get_confirm_spawn(Manager *m) { + static int last_errno = 0; + struct stat st; + int r; + + assert(m); + + /* Here's the deal: we want to test the validity of the console but don't want + * PID1 to go through the whole console process which might block. But we also + * want to warn the user only once if something is wrong with the console so we + * cannot do the sanity checks after spawning our children. So here we simply do + * really basic tests to hopefully trap common errors. + * + * If the console suddenly disappear at the time our children will really it + * then they will simply fail to acquire it and a positive answer will be + * assumed. New children will fall back to /dev/console though. + * + * Note: TTYs are devices that can come and go any time, and frequently aren't + * available yet during early boot (consider a USB rs232 dongle...). If for any + * reason the configured console is not ready, we fall back to the default + * console. 
*/ + + if (!m->confirm_spawn || path_equal(m->confirm_spawn, "/dev/console")) + return m->confirm_spawn; + + if (stat(m->confirm_spawn, &st) < 0) { + r = -errno; + goto fail; + } + + if (!S_ISCHR(st.st_mode)) { + r = -ENOTTY; + goto fail; + } + + last_errno = 0; + return m->confirm_spawn; + +fail: + if (last_errno != r) + last_errno = log_warning_errno(r, "Failed to open %s, using default console: %m", m->confirm_spawn); + + return "/dev/console"; +} + +void manager_set_first_boot(Manager *m, bool b) { + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return; + + if (m->first_boot != (int) b) { + if (b) + (void) touch("/run/systemd/first-boot"); + else + (void) unlink("/run/systemd/first-boot"); + } + + m->first_boot = b; +} + +void manager_disable_confirm_spawn(void) { + (void) touch("/run/systemd/confirm_spawn_disabled"); +} + +bool manager_is_confirm_spawn_disabled(Manager *m) { + if (!m->confirm_spawn) + return true; + + return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0; +} + +static bool manager_should_show_status(Manager *m, StatusType type) { + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return false; + + if (m->no_console_output) + return false; + + if (!IN_SET(manager_state(m), MANAGER_INITIALIZING, MANAGER_STARTING, MANAGER_STOPPING)) + return false; + + /* If we cannot find out the status properly, just proceed. */ + if (type != STATUS_TYPE_EMERGENCY && manager_check_ask_password(m) > 0) + return false; + + if (type == STATUS_TYPE_NOTICE && m->show_status != SHOW_STATUS_NO) + return true; + + return manager_get_show_status_on(m); +} + +void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) { + va_list ap; + + /* If m is NULL, assume we're after shutdown and let the messages through. */ + + if (m && !manager_should_show_status(m, type)) + return; + + /* XXX We should totally drop the check for ephemeral here + * and thus effectively make 'Type=idle' pointless. 
*/ + if (type == STATUS_TYPE_EPHEMERAL && m && m->n_on_console > 0) + return; + + va_start(ap, format); + status_vprintf(status, SHOW_STATUS_ELLIPSIZE|(type == STATUS_TYPE_EPHEMERAL ? SHOW_STATUS_EPHEMERAL : 0), format, ap); + va_end(ap); +} + +Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path) { + char p[strlen(path)+1]; + + assert(m); + assert(path); + + strcpy(p, path); + path_simplify(p, false); + + return hashmap_get(m->units_requiring_mounts_for, streq(p, "/") ? "" : p); +} + +int manager_update_failed_units(Manager *m, Unit *u, bool failed) { + unsigned size; + int r; + + assert(m); + assert(u->manager == m); + + size = set_size(m->failed_units); + + if (failed) { + r = set_ensure_put(&m->failed_units, NULL, u); + if (r < 0) + return log_oom(); + } else + (void) set_remove(m->failed_units, u); + + if (set_size(m->failed_units) != size) + bus_manager_send_change_signal(m); + + return 0; +} + +ManagerState manager_state(Manager *m) { + Unit *u; + + assert(m); + + /* Is the special shutdown target active or queued? If so, we are in shutdown state */ + u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_STOPPING; + + /* Did we ever finish booting? If not then we are still starting up */ + if (!MANAGER_IS_FINISHED(m)) { + + u = manager_get_unit(m, SPECIAL_BASIC_TARGET); + if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return MANAGER_INITIALIZING; + + return MANAGER_STARTING; + } + + if (MANAGER_IS_SYSTEM(m)) { + /* Are the rescue or emergency targets active or queued? If so we are in maintenance state */ + u = manager_get_unit(m, SPECIAL_RESCUE_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_MAINTENANCE; + + u = manager_get_unit(m, SPECIAL_EMERGENCY_TARGET); + if (u && unit_active_or_pending(u)) + return MANAGER_MAINTENANCE; + } + + /* Are there any failed units? 
If so, we are in degraded mode */ + if (set_size(m->failed_units) > 0) + return MANAGER_DEGRADED; + + return MANAGER_RUNNING; +} + +static void manager_unref_uid_internal( + Manager *m, + Hashmap **uid_refs, + uid_t uid, + bool destroy_now, + int (*_clean_ipc)(uid_t uid)) { + + uint32_t c, n; + + assert(m); + assert(uid_refs); + assert(uid_is_valid(uid)); + assert(_clean_ipc); + + /* A generic implementation, covering both manager_unref_uid() and manager_unref_gid(), under the assumption + * that uid_t and gid_t are actually defined the same way, with the same validity rules. + * + * We store a hashmap where the UID/GID is they key and the value is a 32bit reference counter, whose highest + * bit is used as flag for marking UIDs/GIDs whose IPC objects to remove when the last reference to the UID/GID + * is dropped. The flag is set to on, once at least one reference from a unit where RemoveIPC= is set is added + * on a UID/GID. It is reset when the UID's/GID's reference counter drops to 0 again. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + assert(n > 0); + n--; + + if (destroy_now && n == 0) { + hashmap_remove(*uid_refs, UID_TO_PTR(uid)); + + if (c & DESTROY_IPC_FLAG) { + log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.", + _clean_ipc == clean_ipc_by_uid ? 
"UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + } else { + c = n | (c & DESTROY_IPC_FLAG); + assert_se(hashmap_update(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)) >= 0); + } +} + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) { + manager_unref_uid_internal(m, &m->uid_refs, uid, destroy_now, clean_ipc_by_uid); +} + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) { + manager_unref_uid_internal(m, &m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid); +} + +static int manager_ref_uid_internal( + Manager *m, + Hashmap **uid_refs, + uid_t uid, + bool clean_ipc) { + + uint32_t c, n; + int r; + + assert(m); + assert(uid_refs); + assert(uid_is_valid(uid)); + + /* A generic implementation, covering both manager_ref_uid() and manager_ref_gid(), under the assumption + * that uid_t and gid_t are actually defined the same way, with the same validity rules. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return 0; + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) + return r; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + n++; + + if (n & DESTROY_IPC_FLAG) /* check for overflow */ + return -EOVERFLOW; + + c = n | (c & DESTROY_IPC_FLAG) | (clean_ipc ? 
DESTROY_IPC_FLAG : 0); + + return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); +} + +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) { + return manager_ref_uid_internal(m, &m->uid_refs, uid, clean_ipc); +} + +int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) { + return manager_ref_uid_internal(m, &m->gid_refs, (uid_t) gid, clean_ipc); +} + +static void manager_vacuum_uid_refs_internal( + Manager *m, + Hashmap **uid_refs, + int (*_clean_ipc)(uid_t uid)) { + + void *p, *k; + + assert(m); + assert(uid_refs); + assert(_clean_ipc); + + HASHMAP_FOREACH_KEY(p, k, *uid_refs) { + uint32_t c, n; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + n = c & ~DESTROY_IPC_FLAG; + if (n > 0) + continue; + + if (c & DESTROY_IPC_FLAG) { + log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + + assert_se(hashmap_remove(*uid_refs, k) == p); + } +} + +static void manager_vacuum_uid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m, &m->uid_refs, clean_ipc_by_uid); +} + +static void manager_vacuum_gid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m, &m->gid_refs, clean_ipc_by_gid); +} + +static void manager_vacuum(Manager *m) { + assert(m); + + /* Release any dynamic users no longer referenced */ + dynamic_user_vacuum(m, true); + + /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */ + manager_vacuum_uid_refs(m); + manager_vacuum_gid_refs(m); + + /* Release any runtimes no longer referenced */ + exec_runtime_vacuum(m); +} + +int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + struct buffer { + uid_t uid; + gid_t gid; + char unit_name[UNIT_NAME_MAX+1]; + } _packed_ buffer; + + Manager *m = userdata; + ssize_t l; + size_t n; + Unit *u; + + assert_se(source); + assert_se(m); + + /* Invoked whenever a child 
process succeeded resolving its user/group to use and sent us the resulting UID/GID + * in a datagram. We parse the datagram here and pass it off to the unit, so that it can add a reference to the + * UID/GID so that it can destroy the UID/GID's IPC objects when the reference counter drops to 0. */ + + l = recv(fd, &buffer, sizeof(buffer), MSG_DONTWAIT); + if (l < 0) { + if (IN_SET(errno, EINTR, EAGAIN)) + return 0; + + return log_error_errno(errno, "Failed to read from user lookup fd: %m"); + } + + if ((size_t) l <= offsetof(struct buffer, unit_name)) { + log_warning("Received too short user lookup message, ignoring."); + return 0; + } + + if ((size_t) l > offsetof(struct buffer, unit_name) + UNIT_NAME_MAX) { + log_warning("Received too long user lookup message, ignoring."); + return 0; + } + + if (!uid_is_valid(buffer.uid) && !gid_is_valid(buffer.gid)) { + log_warning("Got user lookup message with invalid UID/GID pair, ignoring."); + return 0; + } + + n = (size_t) l - offsetof(struct buffer, unit_name); + if (memchr(buffer.unit_name, 0, n)) { + log_warning("Received lookup message with embedded NUL character, ignoring."); + return 0; + } + + buffer.unit_name[n] = 0; + u = manager_get_unit(m, buffer.unit_name); + if (!u) { + log_debug("Got user lookup message but unit doesn't exist, ignoring."); + return 0; + } + + log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, buffer.uid, buffer.gid); + + unit_notify_user_lookup(u, buffer.uid, buffer.gid); + return 0; +} + +char *manager_taint_string(Manager *m) { + _cleanup_free_ char *destination = NULL, *overflowuid = NULL, *overflowgid = NULL; + char *buf, *e; + int r; + + /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". + * Only things that are detected at runtime should be tagged + * here. For stuff that is set during compilation, emit a warning + * in the configuration phase. 
*/ + + assert(m); + + buf = new(char, sizeof("split-usr:" + "cgroups-missing:" + "local-hwclock:" + "var-run-bad:" + "overflowuid-not-65534:" + "overflowgid-not-65534:")); + if (!buf) + return NULL; + + e = buf; + buf[0] = 0; + + if (m->taint_usr) + e = stpcpy(e, "split-usr:"); + + if (access("/proc/cgroups", F_OK) < 0) + e = stpcpy(e, "cgroups-missing:"); + + if (clock_is_localtime(NULL) > 0) + e = stpcpy(e, "local-hwclock:"); + + r = readlink_malloc("/var/run", &destination); + if (r < 0 || !PATH_IN_SET(destination, "../run", "/run")) + e = stpcpy(e, "var-run-bad:"); + + r = read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid); + if (r >= 0 && !streq(overflowuid, "65534")) + e = stpcpy(e, "overflowuid-not-65534:"); + + r = read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid); + if (r >= 0 && !streq(overflowgid, "65534")) + e = stpcpy(e, "overflowgid-not-65534:"); + + /* remove the last ':' */ + if (e != buf) + e[-1] = 0; + + return buf; +} + +void manager_ref_console(Manager *m) { + assert(m); + + m->n_on_console++; +} + +void manager_unref_console(Manager *m) { + + assert(m->n_on_console > 0); + m->n_on_console--; + + if (m->n_on_console == 0) + m->no_console_output = false; /* unset no_console_output flag, since the console is definitely free now */ +} + +void manager_override_log_level(Manager *m, int level) { + _cleanup_free_ char *s = NULL; + assert(m); + + if (!m->log_level_overridden) { + m->original_log_level = log_get_max_level(); + m->log_level_overridden = true; + } + + (void) log_level_to_string_alloc(level, &s); + log_info("Setting log level to %s.", strna(s)); + + log_set_max_level(level); +} + +void manager_restore_original_log_level(Manager *m) { + _cleanup_free_ char *s = NULL; + assert(m); + + if (!m->log_level_overridden) + return; + + (void) log_level_to_string_alloc(m->original_log_level, &s); + log_info("Restoring log level to original (%s).", strna(s)); + + log_set_max_level(m->original_log_level); + 
m->log_level_overridden = false; +} + +void manager_override_log_target(Manager *m, LogTarget target) { + assert(m); + + if (!m->log_target_overridden) { + m->original_log_target = log_get_target(); + m->log_target_overridden = true; + } + + log_info("Setting log target to %s.", log_target_to_string(target)); + log_set_target(target); +} + +void manager_restore_original_log_target(Manager *m) { + assert(m); + + if (!m->log_target_overridden) + return; + + log_info("Restoring log target to original %s.", log_target_to_string(m->original_log_target)); + + log_set_target(m->original_log_target); + m->log_target_overridden = false; +} + +ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s) { + if (in_initrd() && + s >= MANAGER_TIMESTAMP_SECURITY_START && + s <= MANAGER_TIMESTAMP_UNITS_LOAD_FINISH) + return s - MANAGER_TIMESTAMP_SECURITY_START + MANAGER_TIMESTAMP_INITRD_SECURITY_START; + return s; +} + +static const char *const manager_state_table[_MANAGER_STATE_MAX] = { + [MANAGER_INITIALIZING] = "initializing", + [MANAGER_STARTING] = "starting", + [MANAGER_RUNNING] = "running", + [MANAGER_DEGRADED] = "degraded", + [MANAGER_MAINTENANCE] = "maintenance", + [MANAGER_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState); + +static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { + [MANAGER_TIMESTAMP_FIRMWARE] = "firmware", + [MANAGER_TIMESTAMP_LOADER] = "loader", + [MANAGER_TIMESTAMP_KERNEL] = "kernel", + [MANAGER_TIMESTAMP_INITRD] = "initrd", + [MANAGER_TIMESTAMP_USERSPACE] = "userspace", + [MANAGER_TIMESTAMP_FINISH] = "finish", + [MANAGER_TIMESTAMP_SECURITY_START] = "security-start", + [MANAGER_TIMESTAMP_SECURITY_FINISH] = "security-finish", + [MANAGER_TIMESTAMP_GENERATORS_START] = "generators-start", + [MANAGER_TIMESTAMP_GENERATORS_FINISH] = "generators-finish", + [MANAGER_TIMESTAMP_UNITS_LOAD_START] = "units-load-start", + [MANAGER_TIMESTAMP_UNITS_LOAD_FINISH] = "units-load-finish", + 
[MANAGER_TIMESTAMP_INITRD_SECURITY_START] = "initrd-security-start", + [MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH] = "initrd-security-finish", + [MANAGER_TIMESTAMP_INITRD_GENERATORS_START] = "initrd-generators-start", + [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish", + [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start", + [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp); + +static const char* const oom_policy_table[_OOM_POLICY_MAX] = { + [OOM_CONTINUE] = "continue", + [OOM_STOP] = "stop", + [OOM_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy); diff --git a/src/core/manager.h b/src/core/manager.h new file mode 100644 index 0000000..19df889 --- /dev/null +++ b/src/core/manager.h @@ -0,0 +1,570 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <stdio.h> + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-event.h" + +#include "cgroup-util.h" +#include "cgroup.h" +#include "fdset.h" +#include "hashmap.h" +#include "ip-address-access.h" +#include "list.h" +#include "prioq.h" +#include "ratelimit.h" +#include "varlink.h" + +struct libmnt_monitor; +typedef struct Unit Unit; + +/* Enforce an upper limit on how many names we allow */ +#define MANAGER_MAX_NAMES 131072 /* 128K */ + +typedef struct Manager Manager; + +/* An externally visible state. 
We don't actually maintain this as a state variable, but derive it from various fields + * when requested (see manager_state()) */ +typedef enum ManagerState { + MANAGER_INITIALIZING, + MANAGER_STARTING, + MANAGER_RUNNING, + MANAGER_DEGRADED, + MANAGER_MAINTENANCE, + MANAGER_STOPPING, + _MANAGER_STATE_MAX, + _MANAGER_STATE_INVALID = -1 +} ManagerState; + +typedef enum ManagerObjective { + MANAGER_OK, + MANAGER_EXIT, + MANAGER_RELOAD, + MANAGER_REEXECUTE, + MANAGER_REBOOT, + MANAGER_POWEROFF, + MANAGER_HALT, + MANAGER_KEXEC, + MANAGER_SWITCH_ROOT, + _MANAGER_OBJECTIVE_MAX, + _MANAGER_OBJECTIVE_INVALID = -1 +} ManagerObjective; + +/* Class of a console status message, as passed to manager_status_printf() */ typedef enum StatusType { + STATUS_TYPE_EPHEMERAL, + STATUS_TYPE_NORMAL, + STATUS_TYPE_NOTICE, + STATUS_TYPE_EMERGENCY, +} StatusType; + +typedef enum OOMPolicy { + OOM_CONTINUE, /* The kernel kills the process it wants to kill, and that's it */ + OOM_STOP, /* The kernel kills the process it wants to kill, and we stop the unit */ + OOM_KILL, /* The kernel kills the process it wants to kill, and all others in the unit, and we stop the unit */ + _OOM_POLICY_MAX, + _OOM_POLICY_INVALID = -1 +} OOMPolicy; + +/* Notes: + * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD, + * TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when + * the manager is the system manager and is not running in a container environment. + * + * 2. The monotonic timestamp of TIMESTAMP_KERNEL is always zero. + * + * 3. The realtime timestamp of TIMESTAMP_KERNEL will be unset if the system does not + * have an RTC. + * + * 4. TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER will be unset if the system does not + * have an RTC, or systemd is built without EFI support. + * + * 5. The monotonic timestamps of TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER are stored as + * the negative of the actual value. + * + * 6. TIMESTAMP_USERSPACE is the timestamp of when the manager was started. + * + * 7. TIMESTAMP_INITRD_* are set only when the system is booted with an initrd. 
+ */ + +/* NB: manager_timestamp_initrd_mangle() maps the SECURITY_START..UNITS_LOAD_FINISH range onto the INITRD_* range by offset, so both groups must keep the same order. */ typedef enum ManagerTimestamp { + MANAGER_TIMESTAMP_FIRMWARE, + MANAGER_TIMESTAMP_LOADER, + MANAGER_TIMESTAMP_KERNEL, + MANAGER_TIMESTAMP_INITRD, + MANAGER_TIMESTAMP_USERSPACE, + MANAGER_TIMESTAMP_FINISH, + + MANAGER_TIMESTAMP_SECURITY_START, + MANAGER_TIMESTAMP_SECURITY_FINISH, + MANAGER_TIMESTAMP_GENERATORS_START, + MANAGER_TIMESTAMP_GENERATORS_FINISH, + MANAGER_TIMESTAMP_UNITS_LOAD_START, + MANAGER_TIMESTAMP_UNITS_LOAD_FINISH, + + MANAGER_TIMESTAMP_INITRD_SECURITY_START, + MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH, + MANAGER_TIMESTAMP_INITRD_GENERATORS_START, + MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH, + MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START, + MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH, + _MANAGER_TIMESTAMP_MAX, + _MANAGER_TIMESTAMP_INVALID = -1, +} ManagerTimestamp; + +typedef enum WatchdogType { + WATCHDOG_RUNTIME, + WATCHDOG_REBOOT, + WATCHDOG_KEXEC, + _WATCHDOG_TYPE_MAX, +} WatchdogType; + +#include "execute.h" +#include "job.h" +#include "path-lookup.h" +#include "show-status.h" +#include "unit-name.h" + +typedef enum ManagerTestRunFlags { + MANAGER_TEST_NORMAL = 0, /* run normally */ + MANAGER_TEST_RUN_MINIMAL = 1 << 0, /* create basic data structures */ + MANAGER_TEST_RUN_BASIC = 1 << 1, /* interact with the environment */ + MANAGER_TEST_RUN_ENV_GENERATORS = 1 << 2, /* also run env generators */ + MANAGER_TEST_RUN_GENERATORS = 1 << 3, /* also run unit generators */ + MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS, +} ManagerTestRunFlags; + +/* The flags are stored in the 8-bit wide test_run_flags bit-field of struct Manager, hence must fit into 8 bits */ assert_cc((MANAGER_TEST_FULL & UINT8_MAX) == MANAGER_TEST_FULL); + +struct Manager { + /* Note that the set of units we know of is allowed to be + * inconsistent. However, the subset of it that is loaded may + * not be, and neither may the list of jobs. 
*/ + + /* Active jobs and units */ + Hashmap *units; /* name string => Unit object n:1 */ + Hashmap *units_by_invocation_id; + Hashmap *jobs; /* job id => Job object 1:1 */ + + /* To make it easy to iterate through the units of a specific + * type we maintain a per type linked list */ + LIST_HEAD(Unit, units_by_type[_UNIT_TYPE_MAX]); + + /* Units that need to be loaded */ + LIST_HEAD(Unit, load_queue); /* this is actually more a stack than a queue, but uh. */ + + /* Jobs that need to be run */ + struct Prioq *run_queue; + + /* Units and jobs that have not yet been announced via + * D-Bus. When something about a job changes it is added here + * if it is not in there yet. This allows easy coalescing of + * D-Bus change signals. */ + LIST_HEAD(Unit, dbus_unit_queue); + LIST_HEAD(Job, dbus_job_queue); + + /* Units to remove */ + LIST_HEAD(Unit, cleanup_queue); + + /* Units and jobs to check when doing GC */ + LIST_HEAD(Unit, gc_unit_queue); + LIST_HEAD(Job, gc_job_queue); + + /* Units that should be realized */ + LIST_HEAD(Unit, cgroup_realize_queue); + + /* Units whose cgroup ran empty */ + LIST_HEAD(Unit, cgroup_empty_queue); + + /* Units whose memory.event fired */ + LIST_HEAD(Unit, cgroup_oom_queue); + + /* Target units whose default target dependencies haven't been set yet */ + LIST_HEAD(Unit, target_deps_queue); + + /* Units that might be subject to StopWhenUnneeded= clean-up */ + LIST_HEAD(Unit, stop_when_unneeded_queue); + + sd_event *event; + + /* This maps PIDs we care about to the units that are interested in them. We allow multiple units to be interested in + * the same PID and multiple PIDs to be relevant to the same unit. Since in most cases only a single unit will + * be interested in the same PID we use a somewhat special encoding here: the first unit interested in a PID is + * stored directly in the hashmap, keyed by the PID unmodified. If there are other units interested too they'll + * be stored in a NULL-terminated array, and keyed by the negative PID. 
This is safe as pid_t is signed and + * negative PIDs are not used for regular processes but process groups, which we don't care about in this + * context, but this allows us to use the negative range for our own purposes. */ + Hashmap *watch_pids; /* pid => unit as well as -pid => array of units */ + + /* A set containing all units whose cgroup should be refreshed after startup */ + Set *startup_units; + + /* A set which contains all currently failed units */ + Set *failed_units; + + sd_event_source *run_queue_event_source; + + char *notify_socket; + int notify_fd; + sd_event_source *notify_event_source; + + int cgroups_agent_fd; + sd_event_source *cgroups_agent_event_source; + + int signal_fd; + sd_event_source *signal_event_source; + + sd_event_source *sigchld_event_source; + + int time_change_fd; + sd_event_source *time_change_event_source; + + sd_event_source *timezone_change_event_source; + + sd_event_source *jobs_in_progress_event_source; + + int user_lookup_fds[2]; + sd_event_source *user_lookup_event_source; + + UnitFileScope unit_file_scope; + LookupPaths lookup_paths; + Hashmap *unit_id_map; + Hashmap *unit_name_map; + Set *unit_path_cache; + uint64_t unit_cache_timestamp_hash; + + char **transient_environment; /* The environment, as determined from config files, kernel cmdline and environment generators */ + char **client_environment; /* Environment variables created by clients through the bus API */ + + usec_t watchdog[_WATCHDOG_TYPE_MAX]; + usec_t watchdog_overridden[_WATCHDOG_TYPE_MAX]; + + bool runtime_watchdog_running; /* Whether the runtime HW watchdog was started, so we know if we still need to get the real timeout from the hardware */ + + dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX]; + + /* Data specific to the device subsystem */ + sd_device_monitor *device_monitor; + Hashmap *devices_by_sysfs; + + /* Data specific to the mount subsystem */ + struct libmnt_monitor *mount_monitor; + sd_event_source *mount_event_source; + + /* Data specific to 
the swap filesystem */ + FILE *proc_swaps; + sd_event_source *swap_event_source; + Hashmap *swaps_by_devnode; + + /* Data specific to the D-Bus subsystem */ + sd_bus *api_bus, *system_bus; + Set *private_buses; + int private_listen_fd; + sd_event_source *private_listen_event_source; + + /* Contains all the clients that are subscribed to signals via + the API bus. Note that private bus connections are always + considered subscribed, since they only last for a very short time, + and it is much simpler that way. */ + sd_bus_track *subscribed; + char **deserialized_subscribed; + + /* This is used during reloading: before the reload we queue + * the reply message here, and afterwards we send it */ + sd_bus_message *pending_reload_message; + + Hashmap *watch_bus; /* D-Bus names => Unit object n:1 */ + + bool send_reloading_done; + + uint32_t current_job_id; + uint32_t default_unit_job_id; + + /* Data specific to the Automount subsystem */ + int dev_autofs_fd; + + /* Data specific to the cgroup subsystem */ + Hashmap *cgroup_unit; + CGroupMask cgroup_supported; + char *cgroup_root; + + /* Notifications from cgroups, when the unified hierarchy is used, are delivered via inotify. */ + int cgroup_inotify_fd; + sd_event_source *cgroup_inotify_event_source; + + /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and + * memory.events cgroupv2 attributes. */ + Hashmap *cgroup_control_inotify_wd_unit; + Hashmap *cgroup_memory_inotify_wd_unit; + + /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. 
*/ + sd_event_source *cgroup_empty_event_source; + sd_event_source *cgroup_oom_event_source; + + /* Make sure the user cannot accidentally unmount our cgroup + * file system */ + int pin_cgroupfs_fd; + + unsigned gc_marker; + + /* The stat() data the last time we saw /etc/localtime */ + usec_t etc_localtime_mtime; + bool etc_localtime_accessible:1; + + ManagerObjective objective:5; + + /* Flags */ + bool dispatching_load_queue:1; + + bool taint_usr:1; + + /* Have we already sent out the READY=1 notification? */ + bool ready_sent:1; + + /* Have we already printed the taint line if necessary? */ + bool taint_logged:1; + + /* Have we ever changed the "kernel.pid_max" sysctl? */ + bool sysctl_pid_max_changed:1; + + ManagerTestRunFlags test_run_flags:8; + + /* If non-zero, exit with the following value when the systemd + * process terminates. Useful for containers: systemd-nspawn could get + * the return value. */ + uint8_t return_value; + + ShowStatus show_status; + ShowStatus show_status_overridden; + StatusUnitFormat status_unit_format; + char *confirm_spawn; + bool no_console_output; + bool service_watchdogs; + + ExecOutput default_std_output, default_std_error; + + usec_t default_restart_usec, default_timeout_start_usec, default_timeout_stop_usec; + usec_t default_timeout_abort_usec; + bool default_timeout_abort_set; + + usec_t default_start_limit_interval; + unsigned default_start_limit_burst; + + bool default_cpu_accounting; + bool default_memory_accounting; + bool default_io_accounting; + bool default_blockio_accounting; + bool default_tasks_accounting; + bool default_ip_accounting; + + TasksMax default_tasks_max; + usec_t default_timer_accuracy_usec; + + OOMPolicy default_oom_policy; + + int original_log_level; + LogTarget original_log_target; + bool log_level_overridden:1; + bool log_target_overridden:1; + + struct rlimit *rlimit[_RLIMIT_MAX]; + + /* Non-zero if we are reloading or reexecuting */ + int n_reloading; + + unsigned n_installed_jobs; + unsigned 
n_failed_jobs; + + /* Jobs in progress watching */ + unsigned n_running_jobs; + unsigned n_on_console; /* reference counter, see manager_ref_console()/manager_unref_console() */ + unsigned jobs_in_progress_iteration; + + /* Do we have any outstanding password prompts? */ + int have_ask_password; + int ask_password_inotify_fd; + sd_event_source *ask_password_event_source; + + /* Type=idle pipes */ + int idle_pipe[4]; + sd_event_source *idle_pipe_event_source; + + char *switch_root; + char *switch_root_init; + + /* This maps all possible path prefixes to the units needing + * them. It's a hashmap with a path string as key and a Set as + * value where Unit objects are contained. */ + Hashmap *units_requiring_mounts_for; + + /* Used for processing polkit authorization responses */ + Hashmap *polkit_registry; + + /* Dynamic users/groups, indexed by their name */ + Hashmap *dynamic_users; + + /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */ + Hashmap *uid_refs; + Hashmap *gid_refs; + + /* ExecRuntime, indexed by their owner unit id */ + Hashmap *exec_runtime_by_id; + + /* When the user hits C-A-D more than 7 times per 2s, do something immediately... */ + RateLimit ctrl_alt_del_ratelimit; + EmergencyAction cad_burst_action; + + const char *unit_log_field; + const char *unit_log_format_string; + + const char *invocation_log_field; + const char *invocation_log_format_string; + + int first_boot; /* tri-state */ + + /* Prefixes of e.g. RuntimeDirectory= */ + char *prefix[_EXEC_DIRECTORY_TYPE_MAX]; + char *received_credentials; + + /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid dispatching the same event + * multiple times on the same unit. 
*/ + unsigned sigchldgen; + unsigned notifygen; + + bool honor_device_enumeration; + + VarlinkServer *varlink_server; + /* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */ + Varlink *managed_oom_varlink_request; +}; + +/* Returns the abort timeout if one was set explicitly, and the stop timeout otherwise */ static inline usec_t manager_default_timeout_abort_usec(Manager *m) { + assert(m); + return m->default_timeout_abort_set ? m->default_timeout_abort_usec : m->default_timeout_stop_usec; +} + +#define MANAGER_IS_SYSTEM(m) ((m)->unit_file_scope == UNIT_FILE_SYSTEM) +#define MANAGER_IS_USER(m) ((m)->unit_file_scope != UNIT_FILE_SYSTEM) + +#define MANAGER_IS_RELOADING(m) ((m)->n_reloading > 0) + +#define MANAGER_IS_FINISHED(m) (dual_timestamp_is_set((m)->timestamps + MANAGER_TIMESTAMP_FINISH)) + +/* The objective is set to OK as soon as we enter the main loop, and set otherwise as soon as we are done with it */ +#define MANAGER_IS_RUNNING(m) ((m)->objective == MANAGER_OK) + +#define MANAGER_IS_TEST_RUN(m) ((m)->test_run_flags != 0) + +int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager **m); +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_startup(Manager *m, FILE *serialization, FDSet *fds); + +Job *manager_get_job(Manager *m, uint32_t id); +Unit *manager_get_unit(Manager *m, const char *name); + +int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j); + +bool manager_unit_cache_should_retry_load(Unit *u); +int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **_ret); +int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **_ret); +int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret); +int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u); + +int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, Set *affected_jobs, sd_bus_error 
*e, Job **_ret); +int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret); +int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret); +int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e); + +void manager_dump_units(Manager *s, FILE *f, const char *prefix); +void manager_dump_jobs(Manager *s, FILE *f, const char *prefix); +void manager_dump(Manager *s, FILE *f, const char *prefix); +int manager_get_dump_string(Manager *m, char **ret); + +void manager_clear_jobs(Manager *m); + +void manager_unwatch_pid(Manager *m, pid_t pid); + +unsigned manager_dispatch_load_queue(Manager *m); + +int manager_default_environment(Manager *m); +int manager_transient_environment_add(Manager *m, char **plus); +int manager_client_environment_modify(Manager *m, char **minus, char **plus); +int manager_get_effective_environment(Manager *m, char ***ret); + +int manager_set_default_rlimits(Manager *m, struct rlimit **default_rlimit); + +int manager_loop(Manager *m); + +int manager_open_serialization(Manager *m, FILE **_f); + +int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root); +int manager_deserialize(Manager *m, FILE *f, FDSet *fds); + +int manager_reload(Manager *m); + +void manager_reset_failed(Manager *m); + +void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success); +void manager_send_unit_plymouth(Manager *m, Unit *u); + +bool manager_unit_inactive_or_pending(Manager *m, const char *name); + +void manager_check_finished(Manager *m); + +void disable_printk_ratelimit(void); +void manager_recheck_dbus(Manager *m); +void manager_recheck_journal(Manager *m); + +bool manager_get_show_status_on(Manager *m); +void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason); +void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason); + +void 
manager_set_first_boot(Manager *m, bool b); + +void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5); + +Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path); + +ManagerState manager_state(Manager *m); + +int manager_update_failed_units(Manager *m, Unit *u, bool failed); + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now); +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc); + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now); +int manager_ref_gid(Manager *m, gid_t gid, bool destroy_now); + +char *manager_taint_string(Manager *m); + +void manager_ref_console(Manager *m); +void manager_unref_console(Manager *m); + +void manager_override_log_level(Manager *m, int level); +void manager_restore_original_log_level(Manager *m); + +void manager_override_log_target(Manager *m, LogTarget target); +void manager_restore_original_log_target(Manager *m); + +const char *manager_state_to_string(ManagerState m) _const_; +ManagerState manager_state_from_string(const char *s) _pure_; + +const char *manager_get_confirm_spawn(Manager *m); +bool manager_is_confirm_spawn_disabled(Manager *m); +void manager_disable_confirm_spawn(void); + +const char *manager_timestamp_to_string(ManagerTimestamp m) _const_; +ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_; +ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s); + +usec_t manager_get_watchdog(Manager *m, WatchdogType t); +void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout); +int manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout); +void manager_retry_runtime_watchdog(Manager *m); + +const char* oom_policy_to_string(OOMPolicy i) _const_; +OOMPolicy oom_policy_from_string(const char *s) _pure_; diff --git a/src/core/meson.build b/src/core/meson.build new file mode 100644 index 0000000..77767eb --- /dev/null +++ b/src/core/meson.build @@ -0,0 +1,226 @@ 
+# SPDX-License-Identifier: LGPL-2.1-or-later + +libcore_shared_sources = ''' + killall.c + killall.h + loopback-setup.c + loopback-setup.h + machine-id-setup.c + machine-id-setup.h + mount-setup.c + mount-setup.h +'''.split() + +libcore_sources = ''' + apparmor-setup.c + apparmor-setup.h + audit-fd.c + audit-fd.h + automount.c + automount.h + bpf-devices.c + bpf-devices.h + bpf-firewall.c + bpf-firewall.h + cgroup.c + cgroup.h + core-varlink.c + core-varlink.h + dbus-automount.c + dbus-automount.h + dbus-cgroup.c + dbus-cgroup.h + dbus-device.c + dbus-device.h + dbus-execute.c + dbus-execute.h + dbus-job.c + dbus-job.h + dbus-kill.c + dbus-kill.h + dbus-manager.c + dbus-manager.h + dbus-mount.c + dbus-mount.h + dbus-path.c + dbus-path.h + dbus-scope.c + dbus-scope.h + dbus-service.c + dbus-service.h + dbus-slice.c + dbus-slice.h + dbus-socket.c + dbus-socket.h + dbus-swap.c + dbus-swap.h + dbus-target.c + dbus-target.h + dbus-timer.c + dbus-timer.h + dbus-unit.c + dbus-unit.h + dbus-util.c + dbus-util.h + dbus.c + dbus.h + device.c + device.h + dynamic-user.c + dynamic-user.h + efi-random.c + efi-random.h + emergency-action.c + emergency-action.h + execute.c + execute.h + generator-setup.c + generator-setup.h + hostname-setup.c + hostname-setup.h + ima-setup.c + ima-setup.h + ip-address-access.c + ip-address-access.h + job.c + job.h + kill.c + kill.h + kmod-setup.c + kmod-setup.h + load-dropin.c + load-dropin.h + load-fragment.c + load-fragment.h + locale-setup.c + locale-setup.h + manager.c + manager.h + mount.c + mount.h + namespace.c + namespace.h + path.c + path.h + scope.c + scope.h + selinux-access.c + selinux-access.h + selinux-setup.c + selinux-setup.h + service.c + service.h + show-status.c + show-status.h + slice.c + slice.h + smack-setup.c + smack-setup.h + socket.c + socket.h + swap.c + swap.h + target.c + target.h + timer.c + timer.h + transaction.c + transaction.h + unit-printf.c + unit-printf.h + unit.c + unit.h +'''.split() + 
+load_fragment_gperf_gperf = custom_target( + 'load-fragment-gperf.gperf', + input : 'load-fragment-gperf.gperf.m4', + output: 'load-fragment-gperf.gperf', + command : [meson_apply_m4, config_h, '@INPUT@'], + capture : true) + +load_fragment_gperf_c = custom_target( + 'load-fragment-gperf.c', + input : load_fragment_gperf_gperf, + output : 'load-fragment-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +awkscript = 'load-fragment-gperf-nulstr.awk' +load_fragment_gperf_nulstr_c = custom_target( + 'load-fragment-gperf-nulstr.c', + input : [awkscript, load_fragment_gperf_gperf], + output : 'load-fragment-gperf-nulstr.c', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +# A convenience library to share code with other binaries: +# systemd-shutdown, systemd-remount-fs, systemd-machine-id-setup, … +libcore_shared = static_library( + 'core-shared', + libcore_shared_sources, + include_directories : includes, + dependencies : [versiondep, + libmount]) + +libcore = static_library( + 'core', + libcore_sources, + load_fragment_gperf_c, + load_fragment_gperf_nulstr_c, + include_directories : includes, + link_whole : libcore_shared, + dependencies : [versiondep, + threads, + librt, + libseccomp, + libpam, + libaudit, + libkmod, + libapparmor, + libselinux, + libmount, + libacl]) + +systemd_sources = files('main.c') + +in_files = [['macros.systemd', rpmmacrosdir], + ['system.conf', pkgsysconfdir], + ['user.conf', pkgsysconfdir], + ['systemd.pc', pkgconfigdatadir], + ['triggers.systemd', '']] + +foreach item : in_files + file = item[0] + dir = item[1] + if install_sysconfdir or dir != pkgsysconfdir + configure_file( + input : file + '.in', + output : file, + configuration : substs, + install_dir : dir == 'no' ? 
'' : dir) + endif + +endforeach + +install_data('org.freedesktop.systemd1.conf', + install_dir : dbuspolicydir) +install_data('org.freedesktop.systemd1.service', + install_dir : dbussystemservicedir) + +policy = configure_file( + input : 'org.freedesktop.systemd1.policy.in', + output : 'org.freedesktop.systemd1.policy', + configuration : substs) +install_data(policy, + install_dir : polkitpolicydir) + +meson.add_install_script('sh', '-c', mkdir_p.format(systemshutdowndir)) +meson.add_install_script('sh', '-c', mkdir_p.format(systemsleepdir)) +meson.add_install_script('sh', '-c', mkdir_p.format(systemgeneratordir)) +meson.add_install_script('sh', '-c', mkdir_p.format(usergeneratordir)) + +if install_sysconfdir + meson.add_install_script('sh', '-c', mkdir_p.format(join_paths(pkgsysconfdir, 'system'))) + meson.add_install_script('sh', '-c', mkdir_p.format(join_paths(pkgsysconfdir, 'user'))) + meson.add_install_script('sh', '-c', mkdir_p.format(join_paths(sysconfdir, 'xdg/systemd'))) +endif diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c new file mode 100644 index 0000000..915b101 --- /dev/null +++ b/src/core/mount-setup.c @@ -0,0 +1,561 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <ftw.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/statvfs.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "conf-files.h" +#include "cgroup-setup.h" +#include "dev-setup.h" +#include "dirent-util.h" +#include "efi-loader.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "label.h" +#include "log.h" +#include "macro.h" +#include "mkdir.h" +#include "mount-setup.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "set.h" +#include "smack-util.h" +#include "strv.h" +#include "user-util.h" +#include "virt.h" + +typedef enum MountMode { + MNT_NONE = 0, + MNT_FATAL = 1 << 0, + 
MNT_IN_CONTAINER = 1 << 1, + MNT_CHECK_WRITABLE = 1 << 2, + MNT_FOLLOW_SYMLINK = 1 << 3, +} MountMode; + +typedef struct MountPoint { + const char *what; + const char *where; + const char *type; + const char *options; + unsigned long flags; + bool (*condition_fn)(void); + MountMode mode; +} MountPoint; + +/* The first three entries we might need before SELinux is up. The + * fourth (securityfs) is needed by IMA to load a custom policy. The + * other ones we can delay until SELinux and IMA are loaded. When + * SMACK is enabled we need smackfs, too, so it's a fifth one. */ +#if ENABLE_SMACK +#define N_EARLY_MOUNT 5 +#else +#define N_EARLY_MOUNT 4 +#endif + +static const MountPoint mount_table[] = { + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK }, + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "devtmpfs", "/dev", "devtmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_NOEXEC|MS_STRICTATIME, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_NONE }, +#if ENABLE_SMACK + { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, + mac_smack_use, MNT_FATAL }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + mac_smack_use, MNT_FATAL }, +#endif + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, + NULL, MNT_IN_CONTAINER }, +#if ENABLE_SMACK + { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, + mac_smack_use, MNT_FATAL }, +#endif + { "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, + NULL, MNT_FATAL|MNT_IN_CONTAINER }, + { 
"cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, + cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, + { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, + { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_legacy_wanted, MNT_IN_CONTAINER }, + { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV, + cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, + { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_NONE }, +#if ENABLE_EFI + { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + is_efi_boot, MNT_NONE }, +#endif + { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV, + NULL, MNT_NONE, }, +}; + +bool mount_point_is_api(const char *path) { + unsigned i; + + /* Checks if this mount point is considered "API", and hence + * should be ignored */ + + for (i = 0; i < ELEMENTSOF(mount_table); i ++) + if (path_equal(path, mount_table[i].where)) + return true; + + return path_startswith(path, "/sys/fs/cgroup/"); +} + +bool mount_point_ignore(const char *path) { + + const char *i; + + /* These are API 
file systems that might be mounted by other software, we just list them here so that + * we know that we should ignore them. */ + FOREACH_STRING(i, + /* SELinux file systems */ + "/sys/fs/selinux", + /* Container bind mounts */ + "/dev/console", + "/proc/kmsg", + "/proc/sys", + "/proc/sys/kernel/random/boot_id") + if (path_equal(path, i)) + return true; + + if (path_startswith(path, "/run/host")) /* All mounts passed in from the container manager are + * something we better ignore. */ + return true; + + return false; +} + +static int mount_one(const MountPoint *p, bool relabel) { + int r, priority; + + assert(p); + + priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG; + + if (p->condition_fn && !p->condition_fn()) + return 0; + + /* Relabel first, just in case */ + if (relabel) + (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS); + + r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW); + if (r < 0 && r != -ENOENT) { + log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where); + return (p->mode & MNT_FATAL) ? r : 0; + } + if (r > 0) + return 0; + + /* Skip securityfs in a container */ + if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0) + return 0; + + /* The access mode here doesn't really matter too much, since + * the mounted file system will take precedence anyway. */ + if (relabel) + (void) mkdir_p_label(p->where, 0755); + else + (void) mkdir_p(p->where, 0755); + + log_debug("Mounting %s to %s of type %s with options %s.", + p->what, + p->where, + p->type, + strna(p->options)); + + if (FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK)) + r = mount(p->what, p->where, p->type, p->flags, p->options) < 0 ? -errno : 0; + else + r = mount_nofollow(p->what, p->where, p->type, p->flags, p->options); + if (r < 0) { + log_full_errno(priority, r, "Failed to mount %s at %s: %m", p->type, p->where); + return (p->mode & MNT_FATAL) ? 
r : 0; + } + + /* Relabel again, since we now mounted something fresh here */ + if (relabel) + (void) label_fix(p->where, 0); + + if (p->mode & MNT_CHECK_WRITABLE) { + if (access(p->where, W_OK) < 0) { + r = -errno; + + (void) umount2(p->where, UMOUNT_NOFOLLOW); + (void) rmdir(p->where); + + log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where); + return (p->mode & MNT_FATAL) ? r : 0; + } + } + + return 1; +} + +static int mount_points_setup(unsigned n, bool loaded_policy) { + unsigned i; + int r = 0; + + for (i = 0; i < n; i ++) { + int j; + + j = mount_one(mount_table + i, loaded_policy); + if (j != 0 && r >= 0) + r = j; + } + + return r; +} + +int mount_setup_early(void) { + assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table)); + + /* Do a minimal mount of /proc and friends to enable the most + * basic stuff, such as SELinux */ + return mount_points_setup(N_EARLY_MOUNT, false); +} + +static const char *join_with(const char *controller) { + + static const char* const pairs[] = { + "cpu", "cpuacct", + "net_cls", "net_prio", + NULL + }; + + const char *const *x, *const *y; + + assert(controller); + + /* This will lookup which controller to mount another controller with. Input is a controller name, and output + * is the other controller name. The function works both ways: you can input one and get the other, and input + * the other to get the one. 
*/ + + STRV_FOREACH_PAIR(x, y, pairs) { + if (streq(controller, *x)) + return *y; + if (streq(controller, *y)) + return *x; + } + + return NULL; +} + +static int symlink_controller(const char *target, const char *alias) { + const char *a; + int r; + + assert(target); + assert(alias); + + a = strjoina("/sys/fs/cgroup/", alias); + + r = symlink_idempotent(target, a, false); + if (r < 0) + return log_error_errno(r, "Failed to create symlink %s: %m", a); + +#ifdef SMACK_RUN_LABEL + const char *p; + + p = strjoina("/sys/fs/cgroup/", target); + + r = mac_smack_copy(a, p); + if (r < 0 && r != -EOPNOTSUPP) + return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", p, a); +#endif + + return 0; +} + +int mount_cgroup_controllers(void) { + _cleanup_set_free_free_ Set *controllers = NULL; + int r; + + if (!cg_is_legacy_wanted()) + return 0; + + /* Mount all available cgroup controllers that are built into the kernel. */ + r = cg_kernel_controllers(&controllers); + if (r < 0) + return log_error_errno(r, "Failed to enumerate cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL; + const char *other_controller; + MountPoint p = { + .what = "cgroup", + .type = "cgroup", + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, + .mode = MNT_IN_CONTAINER, + }; + + controller = set_steal_first(controllers); + if (!controller) + break; + + /* Check if we shall mount this together with another controller */ + other_controller = join_with(controller); + if (other_controller) { + _cleanup_free_ char *c = NULL; + + /* Check if the other controller is actually available in the kernel too */ + c = set_remove(controllers, other_controller); + if (c) { + + /* Join the two controllers into one string, and maintain a stable ordering */ + if (strcmp(controller, other_controller) < 0) + options = strjoin(controller, ",", other_controller); + else + options = strjoin(other_controller, ",", controller); + if (!options) + return log_oom(); 
+ } + } + + /* The simple case, where there's only one controller to mount together */ + if (!options) + options = TAKE_PTR(controller); + + where = path_join("/sys/fs/cgroup", options); + if (!where) + return log_oom(); + + p.where = where; + p.options = options; + + r = mount_one(&p, true); + if (r < 0) + return r; + + /* Create symlinks from the individual controller names, in case we have a joined mount */ + if (controller) + (void) symlink_controller(options, controller); + if (other_controller) + (void) symlink_controller(options, other_controller); + } + + /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */ + (void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP); + + return 0; +} + +#if HAVE_SELINUX || ENABLE_SMACK +static int nftw_cb( + const char *fpath, + const struct stat *sb, + int tflag, + struct FTW *ftwbuf) { + + /* No need to label /dev twice in a row... */ + if (_unlikely_(ftwbuf->level == 0)) + return FTW_CONTINUE; + + (void) label_fix(fpath, 0); + + /* /run/initramfs is static data and big, no need to + * dynamically relabel its contents at boot... */ + if (_unlikely_(ftwbuf->level == 1 && + tflag == FTW_D && + streq(fpath, "/run/initramfs"))) + return FTW_SKIP_SUBTREE; + + return FTW_CONTINUE; +}; + +static int relabel_cgroup_filesystems(void) { + int r; + struct statfs st; + + r = cg_all_unified(); + if (r == 0) { + /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this + only when the filesystem has been already populated by a previous instance of systemd + running from initrd. Otherwise don't remount anything and leave the filesystem read-write + for the cgroup filesystems to be mounted inside. 
*/ + if (statfs("/sys/fs/cgroup", &st) < 0) + return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m"); + + if (st.f_flags & ST_RDONLY) + (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL); + + (void) label_fix("/sys/fs/cgroup", 0); + (void) nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL); + + if (st.f_flags & ST_RDONLY) + (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL); + + } else if (r < 0) + return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m"); + + return 0; +} + +static int relabel_extra(void) { + _cleanup_strv_free_ char **files = NULL; + char **file; + int r, c = 0; + + /* Support for relabelling additional files or directories after loading the policy. For this, code in the + * initrd simply has to drop in *.relabel files into /run/systemd/relabel-extra.d/. We'll read all such files + * expecting one absolute path by line and will relabel each (and everyone below that in case the path refers + * to a directory). These drop-in files are supposed to be absolutely minimal, and do not understand comments + * and such. After the operation succeeded the files are removed, and the drop-in directory as well, if + * possible. 
+ */ + + r = conf_files_list(&files, ".relabel", NULL, + CONF_FILES_FILTER_MASKED | CONF_FILES_REGULAR, + "/run/systemd/relabel-extra.d/"); + if (r < 0) + return log_error_errno(r, "Failed to enumerate /run/systemd/relabel-extra.d/, ignoring: %m"); + + STRV_FOREACH(file, files) { + _cleanup_fclose_ FILE *f = NULL; + + f = fopen(*file, "re"); + if (!f) { + log_warning_errno(errno, "Failed to open %s, ignoring: %m", *file); + continue; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) { + log_warning_errno(r, "Failed to read %s, ignoring: %m", *file); + break; + } + if (r == 0) /* EOF */ + break; + + path_simplify(line, true); + + if (!path_is_normalized(line)) { + log_warning("Path to relabel is not normalized, ignoring: %s", line); + continue; + } + + if (!path_is_absolute(line)) { + log_warning("Path to relabel is not absolute, ignoring: %s", line); + continue; + } + + log_debug("Relabelling additional file/directory '%s'.", line); + (void) label_fix(line, 0); + (void) nftw(line, nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL); + c++; + } + + if (unlink(*file) < 0) + log_warning_errno(errno, "Failed to remove %s, ignoring: %m", *file); + } + + /* Remove when we complete things. */ + if (rmdir("/run/systemd/relabel-extra.d") < 0 && + errno != ENOENT) + log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/ directory: %m"); + + return c; +} +#endif + +int mount_setup(bool loaded_policy, bool leave_propagation) { + int r; + + r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy); + if (r < 0) + return r; + +#if HAVE_SELINUX || ENABLE_SMACK + /* Nodes in devtmpfs and /run need to be manually updated for + * the appropriate labels, after mounting. The other virtual + * API file systems like /sys and /proc do not need that, they + * use the same label for all their files. 
*/ + if (loaded_policy) { + usec_t before_relabel, after_relabel; + char timespan[FORMAT_TIMESPAN_MAX]; + const char *i; + int n_extra; + + before_relabel = now(CLOCK_MONOTONIC); + + FOREACH_STRING(i, "/dev", "/dev/shm", "/run") + (void) nftw(i, nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL); + + (void) relabel_cgroup_filesystems(); + + n_extra = relabel_extra(); + + after_relabel = now(CLOCK_MONOTONIC); + + log_info("Relabelled /dev, /dev/shm, /run, /sys/fs/cgroup%s in %s.", + n_extra > 0 ? ", additional files" : "", + format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0)); + } +#endif + + /* Create a few default symlinks, which are normally created + * by udevd, but some scripts might need them before we start + * udevd. */ + dev_setup(NULL, UID_INVALID, GID_INVALID); + + /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we + * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of + * the box. If specific setups need other settings they can reset the propagation mode to private if + * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a + * container manager we assume the container manager knows what it is doing (for example, because it set up + * some directories with different propagation modes). */ + if (detect_container() <= 0 && !leave_propagation) + if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0) + log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m"); + + /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so + * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will + * misdetect systemd. 
*/ + (void) mkdir_label("/run/systemd", 0755); + (void) mkdir_label("/run/systemd/system", 0755); + + /* Make sure we have a mount point to hide in sandboxes */ + (void) mkdir_label("/run/credentials", 0755); + + /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount + * inaccessible nodes from. If we run in a container the host might have created these for us already + * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char + * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a + * userns outside the container and thus nicely read-only and not remountable. */ + if (access("/run/host/inaccessible/", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m"); + + (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID); + } else + (void) symlink("../host/inaccessible", "/run/systemd/inaccessible"); + + return 0; +} diff --git a/src/core/mount-setup.h b/src/core/mount-setup.h new file mode 100644 index 0000000..29bd62f --- /dev/null +++ b/src/core/mount-setup.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +int mount_setup_early(void); +int mount_setup(bool loaded_policy, bool leave_propagation); + +int mount_cgroup_controllers(void); + +bool mount_point_is_api(const char *path); +bool mount_point_ignore(const char *path); diff --git a/src/core/mount.c b/src/core/mount.c new file mode 100644 index 0000000..8e83de0 --- /dev/null +++ b/src/core/mount.c @@ -0,0 +1,2204 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <sys/epoll.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "dbus-mount.h" +#include "dbus-unit.h" +#include "device.h" +#include "exit-status.h" +#include "format-util.h" +#include "fstab-util.h" 
+#include "libmount-util.h" +#include "log.h" +#include "manager.h" +#include "mkdir.h" +#include "mount-setup.h" +#include "mount.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +#define RETRY_UMOUNT_MAX 32 + +static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = { + [MOUNT_DEAD] = UNIT_INACTIVE, + [MOUNT_MOUNTING] = UNIT_ACTIVATING, + [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING, + [MOUNT_MOUNTED] = UNIT_ACTIVE, + [MOUNT_REMOUNTING] = UNIT_RELOADING, + [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING, + [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING, + [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING, + [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING, + [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING, + [MOUNT_FAILED] = UNIT_FAILED, + [MOUNT_CLEANING] = UNIT_MAINTENANCE, +}; + +static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int mount_process_proc_self_mountinfo(Manager *m); + +static bool MOUNT_STATE_WITH_PROCESS(MountState state) { + return IN_SET(state, + MOUNT_MOUNTING, + MOUNT_MOUNTING_DONE, + MOUNT_REMOUNTING, + MOUNT_REMOUNTING_SIGTERM, + MOUNT_REMOUNTING_SIGKILL, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL, + MOUNT_CLEANING); +} + +static bool mount_is_automount(const MountParameters *p) { + assert(p); + + return fstab_test_option(p->options, + "comment=systemd.automount\0" + "x-systemd.automount\0"); +} + +static bool mount_is_network(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "_netdev\0")) + return true; + + if (p->fstype && fstype_is_network(p->fstype)) + return true; + + return false; +} + +static bool mount_is_nofail(const Mount 
*m) { + assert(m); + + if (!m->from_fragment) + return false; + + return fstab_test_yes_no_option(m->parameters_fragment.options, "nofail\0" "fail\0"); +} + +static bool mount_is_loop(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "loop\0")) + return true; + + return false; +} + +static bool mount_is_bind(const MountParameters *p) { + assert(p); + + if (fstab_test_option(p->options, "bind\0" "rbind\0")) + return true; + + if (p->fstype && STR_IN_SET(p->fstype, "bind", "rbind")) + return true; + + return false; +} + +static bool mount_is_bound_to_device(const Mount *m) { + const MountParameters *p; + + if (m->from_fragment) + return true; + + p = &m->parameters_proc_self_mountinfo; + return fstab_test_option(p->options, "x-systemd.device-bound\0"); +} + +static bool mount_needs_quota(const MountParameters *p) { + assert(p); + + /* Quotas are not enabled on network filesystems, but we want them, for example, on storage connected via + * iscsi. We hence don't use mount_is_network() here, as that would also return true for _netdev devices. */ + if (p->fstype && fstype_is_network(p->fstype)) + return false; + + if (mount_is_bind(p)) + return false; + + return fstab_test_option(p->options, + "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0"); +} + +static void mount_init(Unit *u) { + Mount *m = MOUNT(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + m->timeout_usec = u->manager->default_timeout_start_usec; + + m->exec_context.std_output = u->manager->default_std_output; + m->exec_context.std_error = u->manager->default_std_error; + + m->directory_mode = 0755; + + /* We need to make sure that /usr/bin/mount is always called + * in the same process group as us, so that the autofs kernel + * side doesn't send us another mount request while we are + * already trying to comply its last one. 
*/ + m->exec_context.same_pgrp = true; + + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + + u->ignore_on_isolate = true; +} + +static int mount_arm_timer(Mount *m, usec_t usec) { + int r; + + assert(m); + + if (m->timer_event_source) { + r = sd_event_source_set_time(m->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(m->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(m)->manager->event, + &m->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + mount_dispatch_timer, m); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->timer_event_source, "mount-timer"); + + return 0; +} + +static void mount_unwatch_control_pid(Mount *m) { + assert(m); + + if (m->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(m), m->control_pid); + m->control_pid = 0; +} + +static void mount_parameters_done(MountParameters *p) { + assert(p); + + p->what = mfree(p->what); + p->options = mfree(p->options); + p->fstype = mfree(p->fstype); +} + +static void mount_done(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + m->where = mfree(m->where); + + mount_parameters_done(&m->parameters_proc_self_mountinfo); + mount_parameters_done(&m->parameters_fragment); + + m->exec_runtime = exec_runtime_unref(m->exec_runtime, false); + exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); + m->control_command = NULL; + + dynamic_creds_unref(&m->dynamic_creds); + + mount_unwatch_control_pid(m); + + m->timer_event_source = sd_event_source_unref(m->timer_event_source); +} + +static MountParameters* get_mount_parameters_fragment(Mount *m) { + assert(m); + + if (m->from_fragment) + return &m->parameters_fragment; + + return NULL; +} + +static MountParameters* get_mount_parameters(Mount *m) { + assert(m); + + if (m->from_proc_self_mountinfo) + return &m->parameters_proc_self_mountinfo; + + return get_mount_parameters_fragment(m); +} + +static int 
update_parameters_proc_self_mountinfo( + Mount *m, + const char *what, + const char *options, + const char *fstype) { + + MountParameters *p; + int r, q, w; + + p = &m->parameters_proc_self_mountinfo; + + r = free_and_strdup(&p->what, what); + if (r < 0) + return r; + + q = free_and_strdup(&p->options, options); + if (q < 0) + return q; + + w = free_and_strdup(&p->fstype, fstype); + if (w < 0) + return w; + + return r > 0 || q > 0 || w > 0; +} + +static int mount_add_mount_dependencies(Mount *m) { + MountParameters *pm; + Unit *other; + Set *s; + int r; + + assert(m); + + if (!path_equal(m->where, "/")) { + _cleanup_free_ char *parent = NULL; + + /* Adds in links to other mount points that might lie further up in the hierarchy */ + + parent = dirname_malloc(m->where); + if (!parent) + return -ENOMEM; + + r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + } + + /* Adds in dependencies to other mount points that might be needed for the source path (if this is a bind mount + * or a loop mount) to be available. 
*/ + pm = get_mount_parameters_fragment(m); + if (pm && pm->what && + path_is_absolute(pm->what) && + (mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) { + + r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + /* Adds in dependencies to other units that use this path or paths further down in the hierarchy */ + s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where); + SET_FOREACH(other, s) { + + if (other->load_state != UNIT_LOADED) + continue; + + if (other == UNIT(m)) + continue; + + r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + + if (UNIT(m)->fragment_path) { + /* If we have fragment configuration, then make this dependency required */ + r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + } + } + + return 0; +} + +static int mount_add_device_dependencies(Mount *m) { + UnitDependencyMask mask; + MountParameters *p; + UnitDependency dep; + int r; + + assert(m); + + p = get_mount_parameters(m); + if (!p) + return 0; + + if (!p->what) + return 0; + + if (mount_is_bind(p)) + return 0; + + if (!is_device_path(p->what)) + return 0; + + /* /dev/root is a really weird thing, it's not a real device, but just a path the kernel exports for + * the root file system specified on the kernel command line. Ignore it here. */ + if (PATH_IN_SET(p->what, "/dev/root", "/dev/nfs")) + return 0; + + if (path_equal(m->where, "/")) + return 0; + + /* Mount units from /proc/self/mountinfo are not bound to devices by default since they're subject to + * races when devices are unplugged. But the user can still force this dep with an appropriate option + * (or udev property) so the mount units are automatically stopped when the device disappears + * suddenly. */ + dep = mount_is_bound_to_device(m) ? 
UNIT_BINDS_TO : UNIT_REQUIRES; + + /* We always use 'what' from /proc/self/mountinfo if mounted */ + mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT : UNIT_DEPENDENCY_FILE; + + r = unit_add_node_dependency(UNIT(m), p->what, dep, mask); + if (r < 0) + return r; + + return unit_add_blockdev_dependency(UNIT(m), p->what, mask); +} + +static int mount_add_quota_dependencies(Mount *m) { + UnitDependencyMask mask; + MountParameters *p; + int r; + + assert(m); + + if (!MANAGER_IS_SYSTEM(UNIT(m)->manager)) + return 0; + + p = get_mount_parameters_fragment(m); + if (!p) + return 0; + + if (!mount_needs_quota(p)) + return 0; + + mask = m->from_fragment ? UNIT_DEPENDENCY_FILE : UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE, true, mask); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE, true, mask); + if (r < 0) + return r; + + return 0; +} + +static bool mount_is_extrinsic(Unit *u) { + MountParameters *p; + Mount *m = MOUNT(u); + assert(m); + + /* Returns true for all units that are "magic" and should be excluded from the usual + * start-up and shutdown dependencies. We call them "extrinsic" here, as they are generally + * mounted outside of the systemd dependency logic. We shouldn't attempt to manage them + * ourselves but it's fine if the user operates on them with us. 
*/ + + /* We only automatically manage mounts if we are in system mode */ + if (MANAGER_IS_USER(u->manager)) + return true; + + p = get_mount_parameters(m); + if (p && fstab_is_extrinsic(m->where, p->options)) + return true; + + return false; +} + +static int mount_add_default_ordering_dependencies( + Mount *m, + MountParameters *p, + UnitDependencyMask mask) { + + const char *after, *before, *e; + int r; + + assert(m); + + e = path_startswith(m->where, "/sysroot"); + if (e && in_initrd()) { + /* All mounts under /sysroot need to happen later, at initrd-fs.target time. IOW, + * it's not technically part of the basic initrd filesystem itself, and so + * shouldn't inherit the default Before=local-fs.target dependency. */ + + after = NULL; + before = isempty(e) ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_INITRD_FS_TARGET; + + } else if (mount_is_network(p)) { + after = SPECIAL_REMOTE_FS_PRE_TARGET; + before = SPECIAL_REMOTE_FS_TARGET; + + } else { + after = SPECIAL_LOCAL_FS_PRE_TARGET; + before = SPECIAL_LOCAL_FS_TARGET; + } + + if (!mount_is_nofail(m) && !mount_is_automount(p)) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, true, mask); + if (r < 0) + return r; + } + + if (after) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, true, mask); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_UMOUNT_TARGET, true, mask); +} + +static int mount_add_default_dependencies(Mount *m) { + UnitDependencyMask mask; + MountParameters *p; + int r; + + assert(m); + + if (!UNIT(m)->default_dependencies) + return 0; + + /* We do not add any default dependencies to /, /usr or /run/initramfs/, since they are + * guaranteed to stay mounted the whole time, since our system is on it. Also, don't + * bother with anything mounted below virtual file systems, it's also going to be virtual, + * and hence not worth the effort. 
*/ + if (mount_is_extrinsic(UNIT(m))) + return 0; + + p = get_mount_parameters(m); + if (!p) + return 0; + + mask = m->from_fragment ? UNIT_DEPENDENCY_FILE : UNIT_DEPENDENCY_MOUNTINFO_DEFAULT; + + r = mount_add_default_ordering_dependencies(m, p, mask); + if (r < 0) + return r; + + if (mount_is_network(p)) { + /* We order ourselves after network.target. This is primarily useful at shutdown: + * services that take down the network should order themselves before + * network.target, so that they are shut down only after this mount unit is + * stopped. */ + + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET, true, mask); + if (r < 0) + return r; + + /* We pull in network-online.target, and order ourselves after it. This is useful + * at start-up to actively pull in tools that want to be started before we start + * mounting network file systems, and whose purpose it is to delay this until the + * network is "up". */ + + r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET, true, mask); + if (r < 0) + return r; + } + + /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */ + if (streq_ptr(p->fstype, "tmpfs")) { + r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET, true, mask); + if (r < 0) + return r; + } + + return 0; +}

/* Sanity-checks a mount unit that has reached UNIT_LOADED.
 *
 * Returns 0 if the unit is usable. Returns -ENOENT (without logging) if the unit is backed neither by a
 * fragment nor by /proc/self/mountinfo and is not perpetual. Every other refusal is logged against the
 * unit and reported as -ENOEXEC, except a failure of unit_name_from_path(), whose own error code is
 * passed through. */
static int mount_verify(Mount *m) {
        _cleanup_free_ char *e = NULL;
        MountParameters *p;
        int r;

        assert(m);
        assert(UNIT(m)->load_state == UNIT_LOADED);

        if (!m->from_fragment && !m->from_proc_self_mountinfo && !UNIT(m)->perpetual)
                return -ENOENT;

        /* The unit name must be the escaped form of Where=, otherwise the unit cannot be found by path. */
        r = unit_name_from_path(m->where, ".mount", &e);
        if (r < 0)
                return log_unit_error_errno(UNIT(m), r, "Failed to generate unit name from mount path: %m");

        if (!unit_has_name(UNIT(m), e))
                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC),
                                            "Where= setting doesn't match unit name. Refusing.");

        if (mount_point_is_api(m->where) || mount_point_ignore(m->where))
                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC),
                                            "Cannot create mount unit for API file system %s. Refusing.", m->where);

        /* Only parameters coming from the fragment are checked here; perpetual units may lack What=. */
        p = get_mount_parameters_fragment(m);
        if (p && !p->what && !UNIT(m)->perpetual)
                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC),
                                            "What= setting is missing. Refusing.");

        if (m->exec_context.pam_name && m->kill_context.kill_mode != KILL_CONTROL_GROUP)
                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC),
                                            "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing.");

        return 0;
}

/* Adds in all dependencies directly responsible for ordering the mount, as opposed to dependencies
 * resulting from the ExecContext and such. The four helpers run in a fixed order (device, mount, quota,
 * default) and the first failure is returned unchanged; returns 0 once all of them succeeded. */
static int mount_add_non_exec_dependencies(Mount *m) {
        int r;

        assert(m);

        r = mount_add_device_dependencies(m);
        if (r < 0)
                return r;

        r = mount_add_mount_dependencies(m);
        if (r < 0)
                return r;

        r = mount_add_quota_dependencies(m);
        if (r < 0)
                return r;

        r = mount_add_default_dependencies(m);
        if (r < 0)
                return r;

        return 0;
}

+static int mount_add_extras(Mount *m) { + Unit *u = UNIT(m); + int r; + + assert(m); + + /* Note: this call might be called after we already have been loaded once (and even when it has already been + * activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready + * to run with an already set up unit. 
*/ + + if (u->fragment_path) + m->from_fragment = true; + + if (!m->where) { + r = unit_name_to_path(u->id, &m->where); + if (r < 0) + return r; + } + + path_simplify(m->where, false); + + if (!u->description) { + r = unit_set_description(u, m->where); + if (r < 0) + return r; + } + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(u, &m->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = mount_add_non_exec_dependencies(m); + if (r < 0) + return r; + + return 0; +} + +static void mount_load_root_mount(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_ROOT_MOUNT)) + return; + + u->perpetual = true; + u->default_dependencies = false; + + /* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */ + MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL; + MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL; + + if (!u->description) + u->description = strdup("Root Mount"); +} + +static int mount_load(Unit *u) { + Mount *m = MOUNT(u); + int r, q = 0; + + assert(u); + assert(u->load_state == UNIT_STUB); + + mount_load_root_mount(u); + + bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual; + r = unit_load_fragment_and_dropin(u, !fragment_optional); + + /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the + * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus + * we need to update the state in our unit to track it. After all, consider that we don't allow changing the + * 'slice' field for a unit once it is active. 
*/ + if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual) + q = mount_add_extras(m); + + if (r < 0) + return r; + if (q < 0) + return q; + if (u->load_state != UNIT_LOADED) + return 0; + + return mount_verify(m); +} + +static void mount_set_state(Mount *m, MountState state) { + MountState old_state; + assert(m); + + if (m->state != state) + bus_unit_send_pending_change_signal(UNIT(m), false); + + old_state = m->state; + m->state = state; + + if (!MOUNT_STATE_WITH_PROCESS(state)) { + m->timer_event_source = sd_event_source_unref(m->timer_event_source); + mount_unwatch_control_pid(m); + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + } + + if (state != old_state) + log_unit_debug(UNIT(m), "Changed %s -> %s", mount_state_to_string(old_state), mount_state_to_string(state)); + + unit_notify(UNIT(m), state_translation_table[old_state], state_translation_table[state], + m->reload_result == MOUNT_SUCCESS ? 0 : UNIT_NOTIFY_RELOAD_FAILURE); +} + +static int mount_coldplug(Unit *u) { + Mount *m = MOUNT(u); + MountState new_state = MOUNT_DEAD; + int r; + + assert(m); + assert(m->state == MOUNT_DEAD); + + if (m->deserialized_state != m->state) + new_state = m->deserialized_state; + else if (m->from_proc_self_mountinfo) + new_state = MOUNT_MOUNTED; + + if (new_state == m->state) + return 0; + + if (m->control_pid > 0 && + pid_is_unwaited(m->control_pid) && + MOUNT_STATE_WITH_PROCESS(new_state)) { + + r = unit_watch_pid(UNIT(m), m->control_pid, false); + if (r < 0) + return r; + + r = mount_arm_timer(m, usec_add(u->state_change_timestamp.monotonic, m->timeout_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(new_state, MOUNT_DEAD, MOUNT_FAILED)) { + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + mount_set_state(m, new_state); + return 0; +} + +static void mount_dump(Unit *u, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESPAN_MAX]; + Mount *m = MOUNT(u); + MountParameters 
*p; + + assert(m); + assert(f); + + p = get_mount_parameters(m); + + fprintf(f, + "%sMount State: %s\n" + "%sResult: %s\n" + "%sClean Result: %s\n" + "%sWhere: %s\n" + "%sWhat: %s\n" + "%sFile System Type: %s\n" + "%sOptions: %s\n" + "%sFrom /proc/self/mountinfo: %s\n" + "%sFrom fragment: %s\n" + "%sExtrinsic: %s\n" + "%sDirectoryMode: %04o\n" + "%sSloppyOptions: %s\n" + "%sLazyUnmount: %s\n" + "%sForceUnmount: %s\n" + "%sReadWriteOnly: %s\n" + "%sTimeoutSec: %s\n", + prefix, mount_state_to_string(m->state), + prefix, mount_result_to_string(m->result), + prefix, mount_result_to_string(m->clean_result), + prefix, m->where, + prefix, p ? strna(p->what) : "n/a", + prefix, p ? strna(p->fstype) : "n/a", + prefix, p ? strna(p->options) : "n/a", + prefix, yes_no(m->from_proc_self_mountinfo), + prefix, yes_no(m->from_fragment), + prefix, yes_no(mount_is_extrinsic(u)), + prefix, m->directory_mode, + prefix, yes_no(m->sloppy_options), + prefix, yes_no(m->lazy_unmount), + prefix, yes_no(m->force_unmount), + prefix, yes_no(m->read_write_only), + prefix, format_timespan(buf, sizeof(buf), m->timeout_usec, USEC_PER_SEC)); + + if (m->control_pid > 0) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, m->control_pid); + + exec_context_dump(&m->exec_context, f, prefix); + kill_context_dump(&m->kill_context, f, prefix); + cgroup_context_dump(UNIT(m), f, prefix); +} + +static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + pid_t pid; + int r; + + assert(m); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(m)); + if (r < 0) + return r; + + r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec)); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(m), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(m), + c, + 
&m->exec_context, + &exec_params, + m->exec_runtime, + &m->dynamic_creds, + &pid); + if (r < 0) + return r; + + r = unit_watch_pid(UNIT(m), pid, true); + if (r < 0) + return r; + + *_pid = pid; + + return 0; +} + +static void mount_enter_dead(Mount *m, MountResult f) { + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + unit_log_result(UNIT(m), m->result == MOUNT_SUCCESS, mount_result_to_string(m->result)); + unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_stop); + + mount_set_state(m, m->result != MOUNT_SUCCESS ? MOUNT_FAILED : MOUNT_DEAD); + + m->exec_runtime = exec_runtime_unref(m->exec_runtime, true); + + unit_destroy_runtime_data(UNIT(m), &m->exec_context); + + unit_unref_uid_gid(UNIT(m), true); + + dynamic_creds_destroy(&m->dynamic_creds); + + /* Any dependencies based on /proc/self/mountinfo are now stale */ + unit_remove_dependencies(UNIT(m), UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT); +} + +static void mount_enter_mounted(Mount *m, MountResult f) { + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + mount_set_state(m, MOUNT_MOUNTED); +} + +static void mount_enter_dead_or_mounted(Mount *m, MountResult f) { + assert(m); + + /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. We use this + * whenever we executed an operation, so that our internal state reflects what the kernel says again, after all + * ultimately we just mirror the kernel's internal state on this. 
*/ + + if (m->from_proc_self_mountinfo) + mount_enter_mounted(m, f); + else + mount_enter_dead(m, f); +} + +static int state_to_kill_operation(MountState state) { + switch (state) { + + case MOUNT_REMOUNTING_SIGTERM: + return KILL_RESTART; + + case MOUNT_UNMOUNTING_SIGTERM: + return KILL_TERMINATE; + + case MOUNT_REMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGKILL: + return KILL_KILL; + + default: + return _KILL_OPERATION_INVALID; + } +} + +static void mount_enter_signal(Mount *m, MountState state, MountResult f) { + int r; + + assert(m); + + if (m->result == MOUNT_SUCCESS) + m->result = f; + + r = unit_kill_context( + UNIT(m), + &m->kill_context, + state_to_kill_operation(state), + -1, + m->control_pid, + false); + if (r < 0) + goto fail; + + if (r > 0) { + r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec)); + if (r < 0) + goto fail; + + mount_set_state(m, state); + } else if (state == MOUNT_REMOUNTING_SIGTERM && m->kill_context.send_sigkill) + mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); + else if (IN_SET(state, MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL)) + mount_enter_mounted(m, MOUNT_SUCCESS); + else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill) + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS); + else + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m"); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_enter_unmounting(Mount *m) { + int r; + + assert(m); + + /* Start counting our attempts */ + if (!IN_SET(m->state, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL)) + m->n_retry_umount = 0; + + m->control_command_id = MOUNT_EXEC_UNMOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_UNMOUNT; + + r = exec_command_set(m->control_command, UMOUNT_PATH, m->where, "-c", NULL); + if (r >= 0 && m->lazy_unmount) + r = 
exec_command_append(m->control_command, "-l", NULL); + if (r >= 0 && m->force_unmount) + r = exec_command_append(m->control_command, "-f", NULL); + if (r < 0) + goto fail; + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_UNMOUNTING); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to run 'umount' task: %m"); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_enter_mounting(Mount *m) { + int r; + MountParameters *p; + + assert(m); + + r = unit_fail_if_noncanonical(UNIT(m), m->where); + if (r < 0) + goto fail; + + (void) mkdir_p_label(m->where, m->directory_mode); + + unit_warn_if_dir_nonempty(UNIT(m), m->where); + unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start); + + m->control_command_id = MOUNT_EXEC_MOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_MOUNT; + + /* Create the source directory for bind-mounts if needed */ + p = get_mount_parameters_fragment(m); + if (p && mount_is_bind(p)) { + r = mkdir_p_label(p->what, m->directory_mode); + if (r < 0) + log_unit_error_errno(UNIT(m), r, "Failed to make bind mount source '%s': %m", p->what); + } + + if (p) { + _cleanup_free_ char *opts = NULL; + + r = fstab_filter_options(p->options, "nofail\0" "noauto\0" "auto\0", NULL, NULL, &opts); + if (r < 0) + goto fail; + + r = exec_command_set(m->control_command, MOUNT_PATH, p->what, m->where, NULL); + if (r >= 0 && m->sloppy_options) + r = exec_command_append(m->control_command, "-s", NULL); + if (r >= 0 && m->read_write_only) + r = exec_command_append(m->control_command, "-w", NULL); + if (r >= 0 && p->fstype) + r = exec_command_append(m->control_command, "-t", p->fstype, NULL); + if (r >= 0 && !isempty(opts)) + r = exec_command_append(m->control_command, "-o", opts, NULL); + } else + r = -ENOENT; + if (r < 0) + goto fail; + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, 
m->control_command, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_MOUNTING); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to run 'mount' task: %m"); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES); +} + +static void mount_set_reload_result(Mount *m, MountResult result) { + assert(m); + + /* Only store the first error we encounter */ + if (m->reload_result != MOUNT_SUCCESS) + return; + + m->reload_result = result; +} + +static void mount_enter_remounting(Mount *m) { + int r; + MountParameters *p; + + assert(m); + + /* Reset reload result when we are about to start a new remount operation */ + m->reload_result = MOUNT_SUCCESS; + + m->control_command_id = MOUNT_EXEC_REMOUNT; + m->control_command = m->exec_command + MOUNT_EXEC_REMOUNT; + + p = get_mount_parameters_fragment(m); + if (p) { + const char *o; + + if (p->options) + o = strjoina("remount,", p->options); + else + o = "remount"; + + r = exec_command_set(m->control_command, MOUNT_PATH, + p->what, m->where, + "-o", o, NULL); + if (r >= 0 && m->sloppy_options) + r = exec_command_append(m->control_command, "-s", NULL); + if (r >= 0 && m->read_write_only) + r = exec_command_append(m->control_command, "-w", NULL); + if (r >= 0 && p->fstype) + r = exec_command_append(m->control_command, "-t", p->fstype, NULL); + } else + r = -ENOENT; + if (r < 0) + goto fail; + + mount_unwatch_control_pid(m); + + r = mount_spawn(m, m->control_command, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_REMOUNTING); + + return; + +fail: + log_unit_warning_errno(UNIT(m), r, "Failed to run 'remount' task: %m"); + mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); +} + +static void mount_cycle_clear(Mount *m) { + assert(m); + + /* Clear all state we shall forget for this new cycle */ + + m->result = MOUNT_SUCCESS; + m->reload_result = MOUNT_SUCCESS; + exec_command_reset_status_array(m->exec_command, 
_MOUNT_EXEC_COMMAND_MAX); + UNIT(m)->reset_accounting = true; +} + +static int mount_start(Unit *u) { + Mount *m = MOUNT(u); + int r; + + assert(m); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(m->state, + MOUNT_UNMOUNTING, + MOUNT_UNMOUNTING_SIGTERM, + MOUNT_UNMOUNTING_SIGKILL, + MOUNT_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(m->state, MOUNT_MOUNTING, MOUNT_MOUNTING_DONE)) + return 0; + + assert(IN_SET(m->state, MOUNT_DEAD, MOUNT_FAILED)); + + r = unit_test_start_limit(u); + if (r < 0) { + mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + mount_cycle_clear(m); + mount_enter_mounting(m); + + return 1; +} + +static int mount_stop(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + switch (m->state) { + + case MOUNT_UNMOUNTING: + case MOUNT_UNMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGTERM: + /* Already on it */ + return 0; + + case MOUNT_MOUNTING: + case MOUNT_MOUNTING_DONE: + case MOUNT_REMOUNTING: + /* If we are still waiting for /bin/mount, we go directly into kill mode. */ + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_SUCCESS); + return 0; + + case MOUNT_REMOUNTING_SIGTERM: + /* If we are already waiting for a hung remount, convert this to the matching unmounting state */ + mount_set_state(m, MOUNT_UNMOUNTING_SIGTERM); + return 0; + + case MOUNT_REMOUNTING_SIGKILL: + /* as above */ + mount_set_state(m, MOUNT_UNMOUNTING_SIGKILL); + return 0; + + case MOUNT_MOUNTED: + mount_enter_unmounting(m); + return 1; + + case MOUNT_CLEANING: + /* If we are currently cleaning, then abort it, brutally. 
*/ + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS); + return 0; + + default: + assert_not_reached("Unexpected state."); + } +} + +static int mount_reload(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + assert(m->state == MOUNT_MOUNTED); + + mount_enter_remounting(m); + + return 1; +} + +static int mount_serialize(Unit *u, FILE *f, FDSet *fds) { + Mount *m = MOUNT(u); + + assert(m); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", mount_state_to_string(m->state)); + (void) serialize_item(f, "result", mount_result_to_string(m->result)); + (void) serialize_item(f, "reload-result", mount_result_to_string(m->reload_result)); + (void) serialize_item_format(f, "n-retry-umount", "%u", m->n_retry_umount); + + if (m->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, m->control_pid); + + if (m->control_command_id >= 0) + (void) serialize_item(f, "control-command", mount_exec_command_to_string(m->control_command_id)); + + return 0; +} + +static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Mount *m = MOUNT(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + MountState state; + + if ((state = mount_state_from_string(value)) < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + m->deserialized_state = state; + + } else if (streq(key, "result")) { + MountResult f; + + f = mount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != MOUNT_SUCCESS) + m->result = f; + + } else if (streq(key, "reload-result")) { + MountResult f; + + f = mount_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse reload result value: %s", value); + else if (f != MOUNT_SUCCESS) + m->reload_result = f; + + } else if (streq(key, "n-retry-umount")) { + + r = safe_atou(value, &m->n_retry_umount); + if (r < 0) + log_unit_debug(u, "Failed to 
parse n-retry-umount value: %s", value); + + } else if (streq(key, "control-pid")) { + + if (parse_pid(value, &m->control_pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + + } else if (streq(key, "control-command")) { + MountExecCommand id; + + id = mount_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + m->control_command_id = id; + m->control_command = m->exec_command + id; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState mount_active_state(Unit *u) { + assert(u); + + return state_translation_table[MOUNT(u)->state]; +} + +_pure_ static const char *mount_sub_state_to_string(Unit *u) { + assert(u); + + return mount_state_to_string(MOUNT(u)->state); +} + +_pure_ static bool mount_may_gc(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + if (m->from_proc_self_mountinfo) + return false; + + return true; +} + +static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Mount *m = MOUNT(u); + MountResult f; + + assert(m); + assert(pid >= 0); + + if (pid != m->control_pid) + return; + + /* So here's the thing, we really want to know before /usr/bin/mount or /usr/bin/umount exit whether + * they established/remove a mount. This is important when mounting, but even more so when unmounting + * since we need to deal with nested mounts and otherwise cannot safely determine whether to repeat + * the unmounts. In theory, the kernel fires /proc/self/mountinfo changes off before returning from + * the mount() or umount() syscalls, and thus we should see the changes to the proc file before we + * process the waitid() for the /usr/bin/(u)mount processes. 
However, this is unfortunately racy: we + * have to waitid() for processes using P_ALL (since we need to reap unexpected children that got + * reparented to PID 1), but when using P_ALL we might end up reaping processes that terminated just + * instants ago, i.e. already after our last event loop iteration (i.e. after the last point we might + * have noticed /proc/self/mountinfo events via epoll). This means event loop priorities for + * processing SIGCHLD vs. /proc/self/mountinfo IO events are not as relevant as we want. To fix that + * race, let's explicitly scan /proc/self/mountinfo before we start processing /usr/bin/(u)mount + * dying. It's ugly, but it makes our ordering systematic again, and makes sure we always see + * /proc/self/mountinfo changes before our mount/umount exits. */ + (void) mount_process_proc_self_mountinfo(u->manager); + + m->control_pid = 0; + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = MOUNT_SUCCESS; + else if (code == CLD_EXITED) + f = MOUNT_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = MOUNT_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = MOUNT_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown code"); + + if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM)) + mount_set_reload_result(m, f); + else if (m->result == MOUNT_SUCCESS) + m->result = f; + + if (m->control_command) { + exec_status_exit(&m->control_command->exec_status, &m->exec_context, pid, code, status); + + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + } + + unit_log_process_exit( + u, + "Mount process", + mount_exec_command_to_string(m->control_command_id), + f == MOUNT_SUCCESS, + code, status); + + /* Note that due to the io event priority logic, we can be sure the new mountinfo is loaded + * before we process the SIGCHLD for the mount command. */ + + switch (m->state) { + + case MOUNT_MOUNTING: + /* Our mount point has not appeared in mountinfo. 
Something went wrong. */ + + if (f == MOUNT_SUCCESS) { + /* Either /bin/mount has an unexpected definition of success, + * or someone raced us and we lost. */ + log_unit_warning(UNIT(m), "Mount process finished, but there is no mount."); + f = MOUNT_FAILURE_PROTOCOL; + } + mount_enter_dead(m, f); + break; + + case MOUNT_MOUNTING_DONE: + mount_enter_mounted(m, f); + break; + + case MOUNT_REMOUNTING: + case MOUNT_REMOUNTING_SIGTERM: + case MOUNT_REMOUNTING_SIGKILL: + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + break; + + case MOUNT_UNMOUNTING: + + if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) { + + /* Still a mount point? If so, let's try again. Most likely there were multiple mount points + * stacked on top of each other. We might exceed the timeout specified by the user overall, + * but we will stop as soon as any one umount times out. */ + + if (m->n_retry_umount < RETRY_UMOUNT_MAX) { + log_unit_debug(u, "Mount still present, trying again."); + m->n_retry_umount++; + mount_enter_unmounting(m); + } else { + log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount); + mount_enter_mounted(m, f); + } + } else + mount_enter_dead_or_mounted(m, f); + + break; + + case MOUNT_UNMOUNTING_SIGKILL: + case MOUNT_UNMOUNTING_SIGTERM: + mount_enter_dead_or_mounted(m, f); + break; + + case MOUNT_CLEANING: + if (m->clean_result == MOUNT_SUCCESS) + m->clean_result = f; + + mount_enter_dead(m, MOUNT_SUCCESS); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Mount *m = MOUNT(userdata); + + assert(m); + assert(m->timer_event_source == source); + + switch (m->state) { + + case MOUNT_MOUNTING: + case MOUNT_MOUNTING_DONE: + log_unit_warning(UNIT(m), "Mounting timed out. 
Terminating."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_REMOUNTING: + log_unit_warning(UNIT(m), "Remounting timed out. Terminating remount process."); + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + mount_enter_signal(m, MOUNT_REMOUNTING_SIGTERM, MOUNT_SUCCESS); + break; + + case MOUNT_REMOUNTING_SIGTERM: + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + + if (m->kill_context.send_sigkill) { + log_unit_warning(UNIT(m), "Remounting timed out. Killing."); + mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS); + } else { + log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + } + break; + + case MOUNT_REMOUNTING_SIGKILL: + mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT); + + log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_SUCCESS); + break; + + case MOUNT_UNMOUNTING: + log_unit_warning(UNIT(m), "Unmounting timed out. Terminating."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_UNMOUNTING_SIGTERM: + if (m->kill_context.send_sigkill) { + log_unit_warning(UNIT(m), "Mount process timed out. Killing."); + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + } + break; + + case MOUNT_UNMOUNTING_SIGKILL: + log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring."); + mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT); + break; + + case MOUNT_CLEANING: + log_unit_warning(UNIT(m), "Cleaning timed out. 
killing."); + + if (m->clean_result == MOUNT_SUCCESS) + m->clean_result = MOUNT_FAILURE_TIMEOUT; + + mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, 0); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +static int mount_setup_new_unit( + Manager *m, + const char *name, + const char *what, + const char *where, + const char *options, + const char *fstype, + MountProcFlags *ret_flags, + Unit **ret) { + + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + assert(m); + assert(name); + assert(ret_flags); + assert(ret); + + r = unit_new_for_name(m, sizeof(Mount), name, &u); + if (r < 0) + return r; + + r = free_and_strdup(&u->source_path, "/proc/self/mountinfo"); + if (r < 0) + return r; + + r = free_and_strdup(&MOUNT(u)->where, where); + if (r < 0) + return r; + + r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype); + if (r < 0) + return r; + + /* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the time we load + * the unit file for it (and thus add in extra deps right after) we know what source to attributes the deps + * to.*/ + MOUNT(u)->from_proc_self_mountinfo = true; + + /* We have only allocated the stub now, let's enqueue this unit for loading now, so that everything else is + * loaded in now. */ + unit_add_to_load_queue(u); + + *ret_flags = MOUNT_PROC_IS_MOUNTED | MOUNT_PROC_JUST_MOUNTED | MOUNT_PROC_JUST_CHANGED; + *ret = TAKE_PTR(u); + return 0; +} + +static int mount_setup_existing_unit( + Unit *u, + const char *what, + const char *where, + const char *options, + const char *fstype, + MountProcFlags *ret_flags) { + + int r; + + assert(u); + assert(ret_flags); + + if (!MOUNT(u)->where) { + MOUNT(u)->where = strdup(where); + if (!MOUNT(u)->where) + return -ENOMEM; + } + + /* In case we have multiple mounts established on the same mount point, let's merge flags set already + * for the current unit. 
Note that the flags field is reset on each iteration of reading + * /proc/self/mountinfo, hence we know for sure anything already set here is from the current + * iteration and thus worthy of taking into account. */ + MountProcFlags flags = + MOUNT(u)->proc_flags | MOUNT_PROC_IS_MOUNTED; + + r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype); + if (r < 0) + return r; + if (r > 0) + flags |= MOUNT_PROC_JUST_CHANGED; + + /* There are two conditions when we consider a mount point just mounted: when we haven't seen it in + * /proc/self/mountinfo before or when MOUNT_MOUNTING is our current state. Why bother with the + * latter? Shouldn't that be covered by the former? No, during reload it is not because we might then + * encounter a new /proc/self/mountinfo in combination with an old mount unit state (since it stems + * from the serialized state), and need to catch up. Since we know that the MOUNT_MOUNTING state is + * reached when we wait for the mount to appear we hence can assume that if we are in it, we are + * actually seeing it established for the first time. */ + if (!MOUNT(u)->from_proc_self_mountinfo || MOUNT(u)->state == MOUNT_MOUNTING) + flags |= MOUNT_PROC_JUST_MOUNTED; + + MOUNT(u)->from_proc_self_mountinfo = true; + + if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + /* The unit was previously not found or otherwise not loaded. Now that the unit shows up in + * /proc/self/mountinfo we should reconsider it this, hence set it to UNIT_LOADED. */ + u->load_state = UNIT_LOADED; + u->load_error = 0; + + flags |= MOUNT_PROC_JUST_CHANGED; + } + + if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) { + /* If things changed, then make sure that all deps are regenerated. Let's + * first remove all automatic deps, and then add in the new ones. 
*/ + + unit_remove_dependencies(u, UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT); + + r = mount_add_non_exec_dependencies(MOUNT(u)); + if (r < 0) + return r; + } + + *ret_flags = flags; + return 0; +} + +static int mount_setup_unit( + Manager *m, + const char *what, + const char *where, + const char *options, + const char *fstype, + bool set_flags) { + + _cleanup_free_ char *e = NULL; + MountProcFlags flags; + Unit *u; + int r; + + assert(m); + assert(what); + assert(where); + assert(options); + assert(fstype); + + /* Ignore API mount points. They should never be referenced in + * dependencies ever. */ + if (mount_point_is_api(where) || mount_point_ignore(where)) + return 0; + + if (streq(fstype, "autofs")) + return 0; + + /* probably some kind of swap, ignore */ + if (!is_path(where)) + return 0; + + /* Mount unit names have to be (like all other unit names) short enough to fit into file names. This + * means there's a good chance that overly long mount point paths after mangling them to look like a + * unit name would result in unit names we don't actually consider valid. This should be OK however + * as such long mount point paths should not happen on regular systems — and if they appear + * nonetheless they are generally synthesized by software, and thus managed by that other + * software. Having such long names just means you cannot use systemd to manage those specific mount + * points, which should be an OK restriction to make. After all we don't have to be able to manage + * all mount points in the world — as long as we don't choke on them when we encounter them. */ + r = unit_name_from_path(where, ".mount", &e); + if (r < 0) { + static RateLimit rate_limit = { /* Let's log about this at warning level at most once every + * 5s. Given that we generate this whenever we read the file + * otherwise we probably shouldn't flood the logs with + * this */ + .interval = 5 * USEC_PER_SEC, + .burst = 1, + }; + + return log_struct_errno( + ratelimit_below(&rate_limit) ? 
LOG_WARNING : LOG_DEBUG, r, + "MESSAGE_ID=" SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE_STR, + "MOUNT_POINT=%s", where, + LOG_MESSAGE("Failed to generate valid unit name from path '%s', ignoring mount point: %m", where)); + } + + u = manager_get_unit(m, e); + if (u) + r = mount_setup_existing_unit(u, what, where, options, fstype, &flags); + else + /* First time we see this mount point meaning that it's not been initiated by a mount unit but rather + * by the sysadmin having called mount(8) directly. */ + r = mount_setup_new_unit(m, e, what, where, options, fstype, &flags, &u); + if (r < 0) + return log_warning_errno(r, "Failed to set up mount unit for '%s': %m", where); + + /* If the mount changed properties or state, let's notify our clients */ + if (flags & (MOUNT_PROC_JUST_CHANGED|MOUNT_PROC_JUST_MOUNTED)) + unit_add_to_dbus_queue(u); + + if (set_flags) + MOUNT(u)->proc_flags = flags; + + return 0; +} + +static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + int r; + + assert(m); + + r = libmount_parse(NULL, NULL, &table, &iter); + if (r < 0) + return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + struct libmnt_fs *fs; + const char *device, *path, *options, *fstype; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + + device = mnt_fs_get_source(fs); + path = mnt_fs_get_target(fs); + options = mnt_fs_get_options(fs); + fstype = mnt_fs_get_fstype(fs); + + if (!device || !path) + continue; + + device_found_node(m, device, DEVICE_FOUND_MOUNT, DEVICE_FOUND_MOUNT); + + (void) mount_setup_unit(m, device, path, options, fstype, set_flags); + } + + return 0; +} + +static void mount_shutdown(Manager *m) { + assert(m); + + m->mount_event_source = 
sd_event_source_unref(m->mount_event_source); + + mnt_unref_monitor(m->mount_monitor); + m->mount_monitor = NULL; +} + +static int mount_get_timeout(Unit *u, usec_t *timeout) { + Mount *m = MOUNT(u); + usec_t t; + int r; + + if (!m->timer_event_source) + return 0; + + r = sd_event_source_get_time(m->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static void mount_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Whatever happens, we know for sure that the root directory is around, and cannot go away. Let's + * unconditionally synthesize it here and mark it as perpetual. */ + + u = manager_get_unit(m, SPECIAL_ROOT_MOUNT); + if (!u) { + r = unit_new_for_name(m, sizeof(Mount), SPECIAL_ROOT_MOUNT, &u); + if (r < 0) { + log_error_errno(r, "Failed to allocate the special " SPECIAL_ROOT_MOUNT " unit: %m"); + return; + } + } + + u->perpetual = true; + MOUNT(u)->deserialized_state = MOUNT_MOUNTED; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); +} + +static bool mount_is_mounted(Mount *m) { + assert(m); + + return UNIT(m)->perpetual || FLAGS_SET(m->proc_flags, MOUNT_PROC_IS_MOUNTED); +} + +static void mount_enumerate(Manager *m) { + int r; + + assert(m); + + mnt_init_debug(0); + + if (!m->mount_monitor) { + int fd; + + m->mount_monitor = mnt_new_monitor(); + if (!m->mount_monitor) { + log_oom(); + goto fail; + } + + r = mnt_monitor_enable_kernel(m->mount_monitor, 1); + if (r < 0) { + log_error_errno(r, "Failed to enable watching of kernel mount events: %m"); + goto fail; + } + + r = mnt_monitor_enable_userspace(m->mount_monitor, 1, NULL); + if (r < 0) { + log_error_errno(r, "Failed to enable watching of userspace mount events: %m"); + goto fail; + } + + /* mnt_unref_monitor() will close the fd */ + fd = r = mnt_monitor_get_fd(m->mount_monitor); + if (r < 0) { + log_error_errno(r, "Failed to acquire watch file descriptor: %m"); + goto fail; + } + + r = 
sd_event_add_io(m->event, &m->mount_event_source, fd, EPOLLIN, mount_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to watch mount file descriptor: %m"); + goto fail; + } + + r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) { + log_error_errno(r, "Failed to adjust mount watch priority: %m"); + goto fail; + } + + (void) sd_event_source_set_description(m->mount_event_source, "mount-monitor-dispatch"); + } + + r = mount_load_proc_self_mountinfo(m, false); + if (r < 0) + goto fail; + + return; + +fail: + mount_shutdown(m); +} + +static int drain_libmount(Manager *m) { + bool rescan = false; + int r; + + assert(m); + + /* Drain all events and verify that the event is valid. + * + * Note that libmount also monitors /run/mount mkdir if the directory does not exist yet. The mkdir + * may generate event which is irrelevant for us. + * + * error: r < 0; valid: r == 0, false positive: r == 1 */ + do { + r = mnt_monitor_next_change(m->mount_monitor, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to drain libmount events: %m"); + if (r == 0) + rescan = true; + } while (r == 0); + + return rescan; +} + +static int mount_process_proc_self_mountinfo(Manager *m) { + _cleanup_set_free_free_ Set *around = NULL, *gone = NULL; + const char *what; + Unit *u; + int r; + + assert(m); + + r = drain_libmount(m); + if (r <= 0) + return r; + + r = mount_load_proc_self_mountinfo(m, true); + if (r < 0) { + /* Reset flags, just in case, for later calls */ + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) + MOUNT(u)->proc_flags = 0; + + return 0; + } + + manager_dispatch_load_queue(m); + + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) { + Mount *mount = MOUNT(u); + + if (!mount_is_mounted(mount)) { + + /* A mount point is not around right now. It + * might be gone, or might never have + * existed. 
*/ + + if (mount->from_proc_self_mountinfo && + mount->parameters_proc_self_mountinfo.what) { + + /* Remember that this device might just have disappeared */ + if (set_ensure_allocated(&gone, &path_hash_ops) < 0 || + set_put_strdup(&gone, mount->parameters_proc_self_mountinfo.what) < 0) + log_oom(); /* we don't care too much about OOM here... */ + } + + mount->from_proc_self_mountinfo = false; + assert_se(update_parameters_proc_self_mountinfo(mount, NULL, NULL, NULL) >= 0); + + switch (mount->state) { + + case MOUNT_MOUNTED: + /* This has just been unmounted by somebody else, follow the state change. */ + mount_enter_dead(mount, MOUNT_SUCCESS); + break; + + case MOUNT_MOUNTING_DONE: + /* The mount command may add the corresponding proc mountinfo entry and + * then remove it because of an internal error. E.g., fuse.sshfs seems + * to do that when the connection fails. See #17617. To handle such the + * case, let's once set the state back to mounting. Then, the unit can + * correctly enter the failed state later in mount_sigchld(). */ + mount_set_state(mount, MOUNT_MOUNTING); + break; + + default: + break; + } + + } else if (mount->proc_flags & (MOUNT_PROC_JUST_MOUNTED|MOUNT_PROC_JUST_CHANGED)) { + + /* A mount point was added or changed */ + + switch (mount->state) { + + case MOUNT_DEAD: + case MOUNT_FAILED: + + /* This has just been mounted by somebody else, follow the state change, but let's + * generate a new invocation ID for this implicitly and automatically. */ + (void) unit_acquire_invocation_id(u); + mount_cycle_clear(mount); + mount_enter_mounted(mount, MOUNT_SUCCESS); + break; + + case MOUNT_MOUNTING: + mount_set_state(mount, MOUNT_MOUNTING_DONE); + break; + + default: + /* Nothing really changed, but let's + * issue an notification call + * nonetheless, in case somebody is + * waiting for this. (e.g. file system + * ro/rw remounts.) 
*/ + mount_set_state(mount, mount->state); + break; + } + } + + if (mount_is_mounted(mount) && + mount->from_proc_self_mountinfo && + mount->parameters_proc_self_mountinfo.what) { + /* Track devices currently used */ + + if (set_ensure_allocated(&around, &path_hash_ops) < 0 || + set_put_strdup(&around, mount->parameters_proc_self_mountinfo.what) < 0) + log_oom(); + } + + /* Reset the flags for later calls */ + mount->proc_flags = 0; + } + + SET_FOREACH(what, gone) { + if (set_contains(around, what)) + continue; + + /* Let the device units know that the device is no longer mounted */ + device_found_node(m, what, 0, DEVICE_FOUND_MOUNT); + } + + return 0; +} + +static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(m); + assert(revents & EPOLLIN); + + return mount_process_proc_self_mountinfo(m); +} + +static void mount_reset_failed(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + if (m->state == MOUNT_FAILED) + mount_set_state(m, MOUNT_DEAD); + + m->result = MOUNT_SUCCESS; + m->reload_result = MOUNT_SUCCESS; + m->clean_result = MOUNT_SUCCESS; +} + +static int mount_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + Mount *m = MOUNT(u); + + assert(m); + + return unit_kill_common(u, who, signo, -1, m->control_pid, error); +} + +static int mount_control_pid(Unit *u) { + Mount *m = MOUNT(u); + + assert(m); + + return m->control_pid; +} + +static int mount_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Mount *m = MOUNT(u); + int r; + + assert(m); + assert(mask != 0); + + if (m->state != MOUNT_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&m->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + mount_unwatch_control_pid(m); + m->clean_result = MOUNT_SUCCESS; + m->control_command = NULL; + m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID; + + r = mount_arm_timer(m, 
usec_add(now(CLOCK_MONOTONIC), m->exec_context.timeout_clean_usec)); + if (r < 0) + goto fail; + + r = unit_fork_and_watch_rm_rf(u, l, &m->control_pid); + if (r < 0) + goto fail; + + mount_set_state(m, MOUNT_CLEANING); + + return 0; + +fail: + log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m"); + m->clean_result = MOUNT_FAILURE_RESOURCES; + m->timer_event_source = sd_event_source_unref(m->timer_event_source); + return r; +} + +static int mount_can_clean(Unit *u, ExecCleanMask *ret) { + Mount *m = MOUNT(u); + + assert(m); + + return exec_context_get_clean_mask(&m->exec_context, ret); +} + +static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = { + [MOUNT_EXEC_MOUNT] = "ExecMount", + [MOUNT_EXEC_UNMOUNT] = "ExecUnmount", + [MOUNT_EXEC_REMOUNT] = "ExecRemount", +}; + +DEFINE_STRING_TABLE_LOOKUP(mount_exec_command, MountExecCommand); + +static const char* const mount_result_table[_MOUNT_RESULT_MAX] = { + [MOUNT_SUCCESS] = "success", + [MOUNT_FAILURE_RESOURCES] = "resources", + [MOUNT_FAILURE_TIMEOUT] = "timeout", + [MOUNT_FAILURE_EXIT_CODE] = "exit-code", + [MOUNT_FAILURE_SIGNAL] = "signal", + [MOUNT_FAILURE_CORE_DUMP] = "core-dump", + [MOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [MOUNT_FAILURE_PROTOCOL] = "protocol", +}; + +DEFINE_STRING_TABLE_LOOKUP(mount_result, MountResult); + +const UnitVTable mount_vtable = { + .object_size = sizeof(Mount), + .exec_context_offset = offsetof(Mount, exec_context), + .cgroup_context_offset = offsetof(Mount, cgroup_context), + .kill_context_offset = offsetof(Mount, kill_context), + .exec_runtime_offset = offsetof(Mount, exec_runtime), + .dynamic_creds_offset = offsetof(Mount, dynamic_creds), + + .sections = + "Unit\0" + "Mount\0" + "Install\0", + .private_section = "Mount", + + .can_transient = true, + .can_fail = true, + + .init = mount_init, + .load = mount_load, + .done = mount_done, + + .coldplug = mount_coldplug, + + .dump = mount_dump, + + .start = mount_start, + .stop = mount_stop, 
+ .reload = mount_reload, + + .kill = mount_kill, + .clean = mount_clean, + .can_clean = mount_can_clean, + + .serialize = mount_serialize, + .deserialize_item = mount_deserialize_item, + + .active_state = mount_active_state, + .sub_state_to_string = mount_sub_state_to_string, + + .will_restart = unit_will_restart_default, + + .may_gc = mount_may_gc, + .is_extrinsic = mount_is_extrinsic, + + .sigchld_event = mount_sigchld_event, + + .reset_failed = mount_reset_failed, + + .control_pid = mount_control_pid, + + .bus_set_property = bus_mount_set_property, + .bus_commit_properties = bus_mount_commit_properties, + + .get_timeout = mount_get_timeout, + + .enumerate_perpetual = mount_enumerate_perpetual, + .enumerate = mount_enumerate, + .shutdown = mount_shutdown, + + .status_message_formats = { + .starting_stopping = { + [0] = "Mounting %s...", + [1] = "Unmounting %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Mounted %s.", + [JOB_FAILED] = "Failed to mount %s.", + [JOB_TIMEOUT] = "Timed out mounting %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Unmounted %s.", + [JOB_FAILED] = "Failed unmounting %s.", + [JOB_TIMEOUT] = "Timed out unmounting %s.", + }, + }, +}; diff --git a/src/core/mount.h b/src/core/mount.h new file mode 100644 index 0000000..ad0e016 --- /dev/null +++ b/src/core/mount.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Mount Mount; + +#include "kill.h" +#include "dynamic-user.h" +#include "unit.h" + +typedef enum MountExecCommand { + MOUNT_EXEC_MOUNT, + MOUNT_EXEC_UNMOUNT, + MOUNT_EXEC_REMOUNT, + _MOUNT_EXEC_COMMAND_MAX, + _MOUNT_EXEC_COMMAND_INVALID = -1 +} MountExecCommand; + +typedef enum MountResult { + MOUNT_SUCCESS, + MOUNT_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */ + MOUNT_FAILURE_TIMEOUT, + MOUNT_FAILURE_EXIT_CODE, + MOUNT_FAILURE_SIGNAL, + MOUNT_FAILURE_CORE_DUMP, + MOUNT_FAILURE_START_LIMIT_HIT, + MOUNT_FAILURE_PROTOCOL, + 
_MOUNT_RESULT_MAX, + _MOUNT_RESULT_INVALID = -1 +} MountResult; + +typedef struct MountParameters { + char *what; + char *options; + char *fstype; +} MountParameters; + +/* Used while looking for mount points that vanished or got added from/to /proc/self/mountinfo */ +typedef enum MountProcFlags { + MOUNT_PROC_IS_MOUNTED = 1 << 0, + MOUNT_PROC_JUST_MOUNTED = 1 << 1, + MOUNT_PROC_JUST_CHANGED = 1 << 2, +} MountProcFlags; + +struct Mount { + Unit meta; + + char *where; + + MountParameters parameters_proc_self_mountinfo; + MountParameters parameters_fragment; + + bool from_proc_self_mountinfo:1; + bool from_fragment:1; + + MountProcFlags proc_flags; + + bool sloppy_options; + + bool lazy_unmount; + bool force_unmount; + + bool read_write_only; + + MountResult result; + MountResult reload_result; + MountResult clean_result; + + mode_t directory_mode; + + usec_t timeout_usec; + + ExecCommand exec_command[_MOUNT_EXEC_COMMAND_MAX]; + + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + MountState state, deserialized_state; + + ExecCommand* control_command; + MountExecCommand control_command_id; + pid_t control_pid; + + sd_event_source *timer_event_source; + + unsigned n_retry_umount; +}; + +extern const UnitVTable mount_vtable; + +void mount_fd_event(Manager *m, int events); + +const char* mount_exec_command_to_string(MountExecCommand i) _const_; +MountExecCommand mount_exec_command_from_string(const char *s) _pure_; + +const char* mount_result_to_string(MountResult i) _const_; +MountResult mount_result_from_string(const char *s) _pure_; + +DEFINE_CAST(MOUNT, Mount); diff --git a/src/core/namespace.c b/src/core/namespace.c new file mode 100644 index 0000000..cdf427a --- /dev/null +++ b/src/core/namespace.c @@ -0,0 +1,2384 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <linux/loop.h> +#include <sched.h> +#include <stdio.h> +#include 
<sys/mount.h> +#include <unistd.h> +#include <linux/fs.h> + +#include "alloc-util.h" +#include "base-filesystem.h" +#include "dev-setup.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "label.h" +#include "list.h" +#include "loop-util.h" +#include "loopback-setup.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "namespace.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "user-util.h" + +#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) + +typedef enum MountMode { + /* This is ordered by priority! */ + INACCESSIBLE, + MOUNT_IMAGES, + BIND_MOUNT, + BIND_MOUNT_RECURSIVE, + PRIVATE_TMP, + PRIVATE_TMP_READONLY, + PRIVATE_DEV, + BIND_DEV, + EMPTY_DIR, + SYSFS, + PROCFS, + READONLY, + READWRITE, + TMPFS, + READWRITE_IMPLICIT, /* Should have the lowest priority. */ + _MOUNT_MODE_MAX, +} MountMode; + +typedef struct MountEntry { + const char *path_const; /* Memory allocated on stack or static */ + MountMode mode:5; + bool ignore:1; /* Ignore if path does not exist? */ + bool has_prefix:1; /* Already is prefixed by the root dir? */ + bool read_only:1; /* Shall this mount point be read-only? */ + bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */ + bool applied:1; /* Already applied */ + char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */ + const char *source_const; /* The source path, for bind mounts or images */ + char *source_malloc; + const char *options_const;/* Mount options for tmpfs */ + char *options_malloc; + unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. 
*/ + unsigned n_followed; + LIST_HEAD(MountOptions, image_options); +} MountEntry; + +/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted + * something there already. These mounts are hence overridden by any other explicitly configured mounts. */ +static const MountEntry apivfs_table[] = { + { "/proc", PROCFS, false }, + { "/dev", BIND_DEV, false }, + { "/sys", SYSFS, false }, +}; + +/* ProtectKernelTunables= option and the related filesystem APIs */ +static const MountEntry protect_kernel_tunables_table[] = { + { "/proc/acpi", READONLY, true }, + { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */ + { "/proc/asound", READONLY, true }, + { "/proc/bus", READONLY, true }, + { "/proc/fs", READONLY, true }, + { "/proc/irq", READONLY, true }, + { "/proc/kallsyms", INACCESSIBLE, true }, + { "/proc/kcore", INACCESSIBLE, true }, + { "/proc/latency_stats", READONLY, true }, + { "/proc/mtrr", READONLY, true }, + { "/proc/scsi", READONLY, true }, + { "/proc/sys", READONLY, true }, + { "/proc/sysrq-trigger", READONLY, true }, + { "/proc/timer_stats", READONLY, true }, + { "/sys", READONLY, false }, + { "/sys/fs/bpf", READONLY, true }, + { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */ + { "/sys/fs/selinux", READWRITE_IMPLICIT, true }, + { "/sys/kernel/debug", READONLY, true }, + { "/sys/kernel/tracing", READONLY, true }, +}; + +/* ProtectKernelModules= option */ +static const MountEntry protect_kernel_modules_table[] = { +#if HAVE_SPLIT_USR + { "/lib/modules", INACCESSIBLE, true }, +#endif + { "/usr/lib/modules", INACCESSIBLE, true }, +}; + +/* ProtectKernelLogs= option */ +static const MountEntry protect_kernel_logs_table[] = { + { "/proc/kmsg", INACCESSIBLE, true }, + { "/dev/kmsg", INACCESSIBLE, true }, +}; + +/* + * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of + * system 
should be protected by ProtectSystem= + */ +static const MountEntry protect_home_read_only_table[] = { + { "/home", READONLY, true }, + { "/run/user", READONLY, true }, + { "/root", READONLY, true }, +}; + +/* ProtectHome=tmpfs table */ +static const MountEntry protect_home_tmpfs_table[] = { + { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, + { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, + { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME }, +}; + +/* ProtectHome=yes table */ +static const MountEntry protect_home_yes_table[] = { + { "/home", INACCESSIBLE, true }, + { "/run/user", INACCESSIBLE, true }, + { "/root", INACCESSIBLE, true }, +}; + +/* ProtectSystem=yes table */ +static const MountEntry protect_system_yes_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, +#if HAVE_SPLIT_USR + { "/lib", READONLY, true }, + { "/lib64", READONLY, true }, + { "/bin", READONLY, true }, +# if HAVE_SPLIT_BIN + { "/sbin", READONLY, true }, +# endif +#endif +}; + +/* ProtectSystem=full includes ProtectSystem=yes */ +static const MountEntry protect_system_full_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, + { "/etc", READONLY, false }, +#if HAVE_SPLIT_USR + { "/lib", READONLY, true }, + { "/lib64", READONLY, true }, + { "/bin", READONLY, true }, +# if HAVE_SPLIT_BIN + { "/sbin", READONLY, true }, +# endif +#endif +}; + +/* + * ProtectSystem=strict table. In this strict mode, we mount everything + * read-only, except for /proc, /dev, /sys which are the kernel API VFS, + * which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. 
+ * (And of course /home and friends are also left writable, as ProtectHome= + * shall manage those, orthogonally). + */ +static const MountEntry protect_system_strict_table[] = { + { "/", READONLY, false }, + { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */ + { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */ + { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */ + { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */ + { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */ + { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */ +}; + +static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { + [INACCESSIBLE] = "inaccessible", + [BIND_MOUNT] = "bind", + [BIND_MOUNT_RECURSIVE] = "rbind", + [PRIVATE_TMP] = "private-tmp", + [PRIVATE_DEV] = "private-dev", + [BIND_DEV] = "bind-dev", + [EMPTY_DIR] = "empty", + [SYSFS] = "sysfs", + [PROCFS] = "procfs", + [READONLY] = "read-only", + [READWRITE] = "read-write", + [TMPFS] = "tmpfs", + [MOUNT_IMAGES] = "mount-images", + [READWRITE_IMPLICIT] = "rw-implicit", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode); + +static const char *mount_entry_path(const MountEntry *p) { + assert(p); + + /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that, + * otherwise the stack/static ->path field is returned. 
*/ + + return p->path_malloc ?: p->path_const; +} + +static bool mount_entry_read_only(const MountEntry *p) { + assert(p); + + return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE, PRIVATE_TMP_READONLY); +} + +static const char *mount_entry_source(const MountEntry *p) { + assert(p); + + return p->source_malloc ?: p->source_const; +} + +static const char *mount_entry_options(const MountEntry *p) { + assert(p); + + return p->options_malloc ?: p->options_const; +} + +static void mount_entry_done(MountEntry *p) { + assert(p); + + p->path_malloc = mfree(p->path_malloc); + p->source_malloc = mfree(p->source_malloc); + p->options_malloc = mfree(p->options_malloc); + p->image_options = mount_options_free_all(p->image_options); +} + +static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) { + char **i; + + assert(p); + + /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */ + + STRV_FOREACH(i, strv) { + bool ignore = false, needs_prefix = false; + const char *e = *i; + + /* Look for any prefixes */ + if (startswith(e, "-")) { + e++; + ignore = true; + } + if (startswith(e, "+")) { + e++; + needs_prefix = true; + } + + if (!path_is_absolute(e)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path is not absolute: %s", e); + + *((*p)++) = (MountEntry) { + .path_const = e, + .mode = mode, + .ignore = ignore, + .has_prefix = !needs_prefix && !forcibly_require_prefix, + }; + } + + return 0; +} + +static int append_empty_dir_mounts(MountEntry **p, char **strv) { + char **i; + + assert(p); + + /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the + * "/private/" boundary directories for DynamicUser=1. 
*/ + + STRV_FOREACH(i, strv) { + + *((*p)++) = (MountEntry) { + .path_const = *i, + .mode = EMPTY_DIR, + .ignore = false, + .read_only = true, + .options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST, + .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, + }; + } + + return 0; +} + +static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) { + size_t i; + + assert(p); + + for (i = 0; i < n; i++) { + const BindMount *b = binds + i; + + *((*p)++) = (MountEntry) { + .path_const = b->destination, + .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT, + .read_only = b->read_only, + .nosuid = b->nosuid, + .source_const = b->source, + .ignore = b->ignore_enoent, + }; + } + + return 0; +} + +static int append_mount_images(MountEntry **p, const MountImage *mount_images, size_t n) { + assert(p); + + for (size_t i = 0; i < n; i++) { + const MountImage *m = mount_images + i; + + *((*p)++) = (MountEntry) { + .path_const = m->destination, + .mode = MOUNT_IMAGES, + .source_const = m->source, + .image_options = m->mount_options, + .ignore = m->ignore_enoent, + }; + } + + return 0; +} + +static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) { + assert(p); + + for (size_t i = 0; i < n; i++) { + const TemporaryFileSystem *t = tmpfs + i; + _cleanup_free_ char *o = NULL, *str = NULL; + unsigned long flags; + bool ro = false; + int r; + + if (!path_is_absolute(t->path)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path is not absolute: %s", + t->path); + + str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options); + if (!str) + return -ENOMEM; + + r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o); + if (r < 0) + return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); + + ro = flags & MS_RDONLY; + if (ro) + flags ^= MS_RDONLY; + + *((*p)++) = (MountEntry) { + .path_const = t->path, + .mode = TMPFS, + .read_only = ro, + .options_malloc = TAKE_PTR(o), + .flags = flags, + }; + } 
+ + return 0; +} + +static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) { + size_t i; + + assert(p); + assert(mounts); + + /* Adds a list of static pre-defined entries */ + + for (i = 0; i < n; i++) + *((*p)++) = (MountEntry) { + .path_const = mount_entry_path(mounts+i), + .mode = mounts[i].mode, + .ignore = mounts[i].ignore || ignore_protect, + }; + + return 0; +} + +static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) { + assert(p); + + switch (protect_home) { + + case PROTECT_HOME_NO: + return 0; + + case PROTECT_HOME_READ_ONLY: + return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect); + + case PROTECT_HOME_TMPFS: + return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect); + + case PROTECT_HOME_YES: + return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect); + + default: + assert_not_reached("Unexpected ProtectHome= value"); + } +} + +static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) { + assert(p); + + switch (protect_system) { + + case PROTECT_SYSTEM_NO: + return 0; + + case PROTECT_SYSTEM_STRICT: + return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect); + + case PROTECT_SYSTEM_YES: + return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect); + + case PROTECT_SYSTEM_FULL: + return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect); + + default: + assert_not_reached("Unexpected ProtectSystem= value"); + } +} + +static int mount_path_compare(const MountEntry *a, const MountEntry *b) { + int d; + + /* If the paths are not equal, then order prefixes first */ + d = 
path_compare(mount_entry_path(a), mount_entry_path(b));
+        if (d != 0)
+                return d;
+
+        /* If the paths are equal, check the mode */
+        return CMP((int) a->mode, (int) b->mode);
+}
+/* For each of the n entries that does not have has_prefix set yet: replaces the entry's path_malloc
+ * with path_join(root_directory, <current path>) and sets has_prefix. Returns 0 on success or -ENOMEM;
+ * entries processed before a failure stay modified. */
+static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
+        size_t i;
+
+        /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
+
+        for (i = 0; i < n; i++) {
+                char *s;
+
+                if (m[i].has_prefix)
+                        continue;
+
+                s = path_join(root_directory, mount_entry_path(m+i));
+                if (!s)
+                        return -ENOMEM;
+
+                free_and_replace(m[i].path_malloc, s);
+                m[i].has_prefix = true;
+        }
+
+        return 0;
+}
+/* Compacts the array in place: f reads every entry, t is the write position for entries that are kept,
+ * previous is the last entry kept. Dropped entries are released with mount_entry_done(). On return *n
+ * holds the number of entries kept. */
+static void drop_duplicates(MountEntry *m, size_t *n) {
+        MountEntry *f, *t, *previous;
+
+        assert(m);
+        assert(n);
+
+        /* Drops duplicate entries. Expects that the array is properly ordered already. */
+
+        for (f = m, t = m, previous = NULL; f < m + *n; f++) {
+
+                /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
+                 * above. Note that we only drop duplicates that haven't been mounted yet. */
+                if (previous &&
+                    path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
+                    !f->applied && !previous->applied) {
+                        log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
+                        previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
+                        mount_entry_done(f);
+                        continue;
+                }
+
+                *t = *f;
+                previous = t;
+                t++;
+        }
+
+        *n = t - m;
+}
+/* Compacts the array in place (f = read position, t = write position) and stores the new count in *n.
+ * clear holds the path of the most recently kept INACCESSIBLE entry, or NULL. */
+static void drop_inaccessible(MountEntry *m, size_t *n) {
+        MountEntry *f, *t;
+        const char *clear = NULL;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
+         * ordered already.
*/ + + for (f = m, t = m; f < m + *n; f++) { + + /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop + * it, as inaccessible paths really should drop the entire subtree. */ + if (clear && path_startswith(mount_entry_path(f), clear)) { + log_debug("%s is masked by %s.", mount_entry_path(f), clear); + mount_entry_done(f); + continue; + } + + clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL; + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_nop(MountEntry *m, size_t *n) { + MountEntry *f, *t; + + assert(m); + assert(n); + + /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the + * list is ordered by prefixes. */ + + for (f = m, t = m; f < m + *n; f++) { + + /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */ + if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) { + MountEntry *p; + bool found = false; + + /* Now let's find the first parent of the entry we are looking at. */ + for (p = t-1; p >= m; p--) { + if (path_startswith(mount_entry_path(f), mount_entry_path(p))) { + found = true; + break; + } + } + + /* We found it, let's see if it's the same mode, if so, we can drop this entry */ + if (found && p->mode == f->mode) { + log_debug("%s (%s) is made redundant by %s (%s)", + mount_entry_path(f), mount_mode_to_string(f->mode), + mount_entry_path(p), mount_mode_to_string(p->mode)); + mount_entry_done(f); + continue; + } + } + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) { + MountEntry *f, *t; + + assert(m); + assert(n); + + /* Nothing to do */ + if (!root_directory) + return; + + /* Drops all mounts that are outside of the root directory. 
*/
+
+        for (f = m, t = m; f < m + *n; f++) {
+
+                if (!path_startswith(mount_entry_path(f), root_directory)) {
+                        log_debug("%s is outside of root directory.", mount_entry_path(f));
+                        mount_entry_done(f);
+                        continue;
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+/* Makes the device node d available at the same path below temporary_mount. First tries mknod() with
+ * the mode and device number of d; if that fails with EPERM, *make_devnode is cleared (so later calls
+ * skip this attempt) and a regular file is created as target for a bind mount of d instead.
+ * Returns -ENXIO if d does not exist, -EINVAL if d is neither a block nor a character device, and
+ * other negative errno-style values on failure. */
+static int clone_device_node(
+                const char *d,
+                const char *temporary_mount,
+                bool *make_devnode) {
+
+        _cleanup_free_ char *sl = NULL;
+        const char *dn, *bn, *t;
+        struct stat st;
+        int r;
+
+        if (stat(d, &st) < 0) {
+                if (errno == ENOENT) {
+                        log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
+                        return -ENXIO; /* distinct code so that callers can tell "missing" from other failures */
+                }
+
+                return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
+        }
+
+        if (!S_ISBLK(st.st_mode) &&
+            !S_ISCHR(st.st_mode))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Device node '%s' to clone is not a device node, ignoring.",
+                                       d);
+
+        dn = strjoina(temporary_mount, d);
+
+        /* First, try to create device node properly */
+        if (*make_devnode) {
+                mac_selinux_create_file_prepare(d, st.st_mode);
+                r = mknod(dn, st.st_mode, st.st_rdev);
+                mac_selinux_create_file_clear();
+                if (r >= 0)
+                        goto add_symlink;
+                if (errno != EPERM)
+                        return log_debug_errno(errno, "mknod failed for %s: %m", d);
+
+                /* This didn't work, let's not try this again for the next iterations. */
+                *make_devnode = false;
+        }
+
+        /* We're about to fall back to bind-mounting the device
+         * node. So create a dummy bind-mount target.
+         * Do not prepare device-node SELinux label (see issue 13762) */
+        r = mknod(dn, S_IFREG, 0);
+        if (r < 0 && errno != EEXIST)
+                return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
+
+        /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
+         * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
+         * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs.
*/ + r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL); + if (r < 0) + return r; + +add_symlink: + bn = path_startswith(d, "/dev/"); + if (!bn) + return 0; + + /* Create symlinks like /dev/char/1:9 → ../urandom */ + if (asprintf(&sl, "%s/dev/%s/%u:%u", + temporary_mount, + S_ISCHR(st.st_mode) ? "char" : "block", + major(st.st_rdev), minor(st.st_rdev)) < 0) + return log_oom(); + + (void) mkdir_parents(sl, 0755); + + t = strjoina("../", bn); + if (symlink(t, sl) < 0) + log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl); + + return 0; +} + +static int mount_private_dev(MountEntry *m) { + static const char devnodes[] = + "/dev/null\0" + "/dev/zero\0" + "/dev/full\0" + "/dev/random\0" + "/dev/urandom\0" + "/dev/tty\0"; + + char temporary_mount[] = "/tmp/namespace-dev-XXXXXX"; + const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL; + bool can_mknod = true; + _cleanup_umask_ mode_t u; + int r; + + assert(m); + + u = umask(0000); + + if (!mkdtemp(temporary_mount)) + return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount); + + dev = strjoina(temporary_mount, "/dev"); + (void) mkdir(dev, 0755); + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV); + if (r < 0) + goto fail; + + r = label_fix_container(dev, "/dev", 0); + if (r < 0) { + log_debug_errno(errno, "Failed to fix label of '%s' as /dev: %m", dev); + goto fail; + } + + devpts = strjoina(temporary_mount, "/dev/pts"); + (void) mkdir(devpts, 0755); + r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL); + if (r < 0) + goto fail; + + /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx. + * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible. + * Thus, in that case make a clone. 
+         * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
+        r = is_symlink("/dev/ptmx");
+        if (r < 0) {
+                log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
+                goto fail;
+        } else if (r > 0) {
+                devptmx = strjoina(temporary_mount, "/dev/ptmx");
+                if (symlink("pts/ptmx", devptmx) < 0) {
+                        r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
+                        goto fail;
+                }
+        } else {
+                r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
+                if (r < 0)
+                        goto fail;
+        }
+
+        devshm = strjoina(temporary_mount, "/dev/shm");
+        (void) mkdir(devshm, 0755);
+        r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
+        if (r < 0)
+                goto fail;
+        /* The /dev/mqueue and /dev/hugepages bind mounts below are best effort: their results are ignored. */
+        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
+        (void) mkdir(devmqueue, 0755);
+        (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
+
+        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
+        (void) mkdir(devhugepages, 0755);
+        (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
+
+        devlog = strjoina(temporary_mount, "/dev/log");
+        if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
+                log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
+
+        NULSTR_FOREACH(d, devnodes) {
+                r = clone_device_node(d, temporary_mount, &can_mknod);
+                /* ENXIO means the *source* device node does not exist (clone_device_node() maps ENOENT to it), skip creation in that case */
+                if (r < 0 && r != -ENXIO)
+                        goto fail;
+        }
+
+        r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
+        if (r < 0)
+                log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
+
+        /* Create the /dev directory if missing. It is more likely to be
+         * missing when the service is started with RootDirectory. This is
+         * consistent with mount units creating the mount points when missing.
+         */
+        (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+        /* Unmount everything in old /dev */
+        r = umount_recursive(mount_entry_path(m), 0);
+        if (r < 0)
+                log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
+
+        r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
+        if (r < 0)
+                goto fail;
+        /* The staging tree has been moved onto its final place, remove the now empty staging directories. */
+        (void) rmdir(dev);
+        (void) rmdir(temporary_mount);
+
+        return 0;
+        /* Error path: r holds the error. Undo the bind mounts made in the staging tree (only those whose path variable was set), then the staging tmpfs itself. */
+fail:
+        if (devpts)
+                (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
+
+        if (devshm)
+                (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
+
+        if (devhugepages)
+                (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
+
+        if (devmqueue)
+                (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
+
+        (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
+        (void) rmdir(dev);
+        (void) rmdir(temporary_mount);
+
+        return r;
+}
+/* Recursively bind mounts /dev onto the path of m, unless that path already is a mount point.
+ * Returns 1 if a mount was made, 0 if nothing was done, negative errno-style value on failure. */
+static int mount_bind_dev(const MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
+         * /dev. This is only used when RootDirectory= is set.
*/
+
+        (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
+        if (r > 0) /* make this a NOP if /dev is already a mount point */
+                return 0;
+
+        r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+/* Recursively bind mounts /sys onto the path of m (creating the directory if needed), unless that path
+ * already is a mount point. Returns 1 if a mount was made, 0 if nothing was done, negative errno-style
+ * value on failure. */
+static int mount_sysfs(const MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
+        if (r > 0) /* make this a NOP if /sys is already a mount point */
+                return 0;
+
+        /* Bind mount the host's version so that we get all child mounts of it, too. */
+        r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+/* Mounts a new procfs instance on the path of m after unmounting whatever is mounted there. If
+ * ns_info requests a non-default protect_proc or proc_subset, the mount is first attempted with the
+ * corresponding hidepid=/subset= options. Returns 1 once a procfs was mounted, negative errno-style
+ * value on failure. */
+static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
+        const char *entry_path;
+        int r;
+
+        assert(m);
+        assert(ns_info);
+
+        entry_path = mount_entry_path(m);
+
+        /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
+         * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
+         * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
+         * mounted on /proc/ first. */
+
+        (void) mkdir_p_label(entry_path, 0755);
+        (void) umount_recursive(entry_path, 0);
+
+        if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+            ns_info->proc_subset != PROC_SUBSET_ALL) {
+                _cleanup_free_ char *opts = NULL;
+
+                /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
+                 * pretended to be per-instance but actually was per-namespace), hence let's make use of it
+                 * if requested.
To make sure this logic succeeds only on kernels where hidepid= is
+                 * per-instance, we'll exclusively use the textual value for hidepid=, since support was
+                 * added in the same commit: if it's supported it is thus also per-instance. */
+
+                opts = strjoin("hidepid=",
+                               ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" :
+                               protect_proc_to_string(ns_info->protect_proc),
+                               ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "");
+                if (!opts)
+                        return -ENOMEM;
+
+                r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+                if (r < 0) {
+                        if (r != -EINVAL)
+                                return r;
+
+                        /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
+                         * not supported by the kernel, and thus the per-instance hidepid= neither, which
+                         * means we really don't want to use it, since it would affect our host's /proc
+                         * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
+                } else
+                        return 1;
+        }
+        /* Reached when no special options were requested, or when the attempt with options failed with -EINVAL. */
+        r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+/* Mounts a tmpfs with the flags and options of m on the entry's path, replacing whatever is mounted
+ * below it, and then fixes the label of the new mount as if it were located at m->path_const.
+ * Returns 1 on success, negative errno-style value on failure. */
+static int mount_tmpfs(const MountEntry *m) {
+        const char *entry_path, *inner_path;
+        int r;
+
+        assert(m);
+
+        entry_path = mount_entry_path(m);
+        inner_path = m->path_const;
+
+        /* First, get rid of everything that is below if there is anything.
Then, overmount with our new tmpfs */
+
+        (void) mkdir_p_label(entry_path, 0755);
+        (void) umount_recursive(entry_path, 0);
+
+        r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
+        if (r < 0)
+                return r;
+
+        r = label_fix_container(entry_path, inner_path, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
+
+        return 1;
+}
+/* Mounts the disk image named by the source of m onto the entry's path: loads verity settings for the
+ * image, attaches it to a loop device, dissects and (if needed) decrypts it, and mounts the result.
+ * Returns 1 on success, negative errno-style value on failure. The loop device, dissected image,
+ * decrypted image and verity settings are released by their cleanup handlers on every return path. */
+static int mount_images(const MountEntry *m) {
+        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
+        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+        DissectImageFlags dissect_image_flags;
+        int r;
+
+        assert(m);
+
+        r = verity_settings_load(&verity, mount_entry_source(m), NULL, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load root hash: %m");
+
+        dissect_image_flags =
+                (m->read_only ? DISSECT_IMAGE_READ_ONLY : 0) |
+                (verity.data_path ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0);
+
+        r = loop_device_make_by_path(
+                        mount_entry_source(m),
+                        m->read_only ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
+                        verity.data_path ? 0 : LO_FLAGS_PARTSCAN,
+                        &loop_device);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create loop device for image: %m");
+
+        r = dissect_image(
+                        loop_device->fd,
+                        &verity,
+                        m->image_options,
+                        dissect_image_flags,
+                        &dissected_image);
+        /* No partition table?
Might be a single-filesystem image, try again */
+        if (!verity.data_path && r == -ENOPKG)
+                r = dissect_image(
+                                loop_device->fd,
+                                &verity,
+                                m->image_options,
+                                dissect_image_flags|DISSECT_IMAGE_NO_PARTITION_TABLE,
+                                &dissected_image);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to dissect image: %m");
+
+        r = dissected_image_decrypt(
+                        dissected_image,
+                        NULL,
+                        &verity,
+                        dissect_image_flags,
+                        &decrypted_image);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+
+        r = mkdir_p_label(mount_entry_path(m), 0755);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create destination directory %s: %m", mount_entry_path(m));
+        r = umount_recursive(mount_entry_path(m), 0);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to umount under destination directory %s: %m", mount_entry_path(m));
+
+        r = dissected_image_mount(dissected_image, mount_entry_path(m), UID_INVALID, dissect_image_flags);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to mount image: %m");
+        /* The image is mounted now: relinquish the decrypted image (if any) and the loop device before the cleanup handlers run. */
+        if (decrypted_image) {
+                r = decrypted_image_relinquish(decrypted_image);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to relinquish decrypted image: %m");
+        }
+
+        loop_device_relinquish(loop_device);
+
+        return 1;
+}
+/* Resolves at most one symlink step of the path of m, relative to root_directory. When a step was
+ * resolved the entry's path is replaced by the result, has_prefix is set and n_followed is incremented.
+ * Fails with -ELOOP once n_followed has reached CHASE_SYMLINKS_MAX. */
+static int follow_symlink(
+                const char *root_directory,
+                MountEntry *m) {
+
+        _cleanup_free_ char *target = NULL;
+        int r;
+
+        /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
+         * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
+         * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
+         * end and already have a fully normalized name.
*/
+
+        r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
+        if (r > 0) /* Reached the end, nothing more to resolve */
+                return 1;
+
+        if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
+                return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+                                       "Symlink loop on '%s'.",
+                                       mount_entry_path(m));
+
+        log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
+
+        free_and_replace(m->path_malloc, target);
+        m->has_prefix = true;
+
+        m->n_followed ++;
+
+        return 0;
+}
+/* Establishes the mount described by m. Modes that have a dedicated helper (tmpfs, private/bind /dev,
+ * sysfs, procfs, images) return that helper's result directly. For all other modes the switch only
+ * determines `what` to mount, and a bind mount of `what` onto the entry's path is made afterwards
+ * (recursive unless the mode is BIND_MOUNT). When `make` is set and the bind mount fails with -ENOENT,
+ * the destination node is created and the mount is retried once. */
+static int apply_mount(
+                const char *root_directory,
+                MountEntry *m,
+                const NamespaceInfo *ns_info) {
+
+        _cleanup_free_ char *inaccessible = NULL;
+        bool rbind = true, make = false;
+        const char *what;
+        int r;
+
+        assert(m);
+        assert(ns_info);
+
+        log_debug("Applying namespace mount on %s", mount_entry_path(m));
+
+        switch (m->mode) {
+
+        case INACCESSIBLE: {
+                _cleanup_free_ char *tmp = NULL;
+                const char *runtime_dir;
+                struct stat target;
+
+                /* First, get rid of everything that is below if there
+                 * is anything... Then, overmount it with an
+                 * inaccessible path. */
+                (void) umount_recursive(mount_entry_path(m), 0);
+
+                if (lstat(mount_entry_path(m), &target) < 0) {
+                        if (errno == ENOENT && m->ignore)
+                                return 0;
+
+                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
+                                               mount_entry_path(m));
+                }
+                /* The inaccessible node is looked up below /run for root and below /run/user/<euid> otherwise. */
+                if (geteuid() == 0)
+                        runtime_dir = "/run";
+                else {
+                        if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
+                                return -ENOMEM;
+
+                        runtime_dir = tmp;
+                }
+
+                r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
+                if (r < 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+                                               "File type not supported for inaccessible mounts. 
Note that symlinks are not allowed");
+                what = inaccessible;
+                break;
+        }
+        /* For these modes the path is bind mounted onto itself, so that per-mount flags can be changed on it later. */
+        case READONLY:
+        case READWRITE:
+        case READWRITE_IMPLICIT:
+                r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
+                if (r == -ENOENT && m->ignore)
+                        return 0;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
+                                               mount_entry_path(m));
+                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
+                            * bit for the mount point if needed. */
+                        return 0;
+                /* This isn't a mount point yet, let's make it one. */
+                what = mount_entry_path(m);
+                break;
+
+        case BIND_MOUNT:
+                rbind = false;
+
+                _fallthrough_;
+        case BIND_MOUNT_RECURSIVE: {
+                _cleanup_free_ char *chased = NULL;
+
+                /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
+                 * that bind mount source paths are always relative to the host root, hence we pass NULL as
+                 * root directory to chase_symlinks() here. */
+
+                r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
+                if (r == -ENOENT && m->ignore) {
+                        log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
+                        return 0;
+                }
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
+
+                log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
+
+                free_and_replace(m->source_malloc, chased);
+
+                what = mount_entry_source(m);
+                make = true;
+                break;
+        }
+        /* The modes below that return directly are fully handled by their helper and skip the bind mount made after the switch. */
+        case EMPTY_DIR:
+        case TMPFS:
+                return mount_tmpfs(m);
+
+        case PRIVATE_TMP:
+        case PRIVATE_TMP_READONLY:
+                what = mount_entry_source(m);
+                make = true;
+                break;
+
+        case PRIVATE_DEV:
+                return mount_private_dev(m);
+
+        case BIND_DEV:
+                return mount_bind_dev(m);
+
+        case SYSFS:
+                return mount_sysfs(m);
+
+        case PROCFS:
+                return mount_procfs(m, ns_info);
+
+        case MOUNT_IMAGES:
+                return mount_images(m);
+
+        default:
+                assert_not_reached("Unknown mode");
+        }
+
+        assert(what);
+
+        r =
mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+        if (r < 0) {
+                bool try_again = false;
+
+                if (r == -ENOENT && make) {
+                        struct stat st;
+
+                        /* Hmm, either the source or the destination are missing. Let's see if we can create
+                           the destination, then try again. */
+
+                        if (stat(what, &st) < 0)
+                                log_error_errno(errno, "Mount point source '%s' is not accessible: %m", what);
+                        else {
+                                int q;
+
+                                (void) mkdir_parents(mount_entry_path(m), 0755);
+                                /* Create a destination node of the same kind as the source: a directory for directories, a file otherwise. */
+                                if (S_ISDIR(st.st_mode))
+                                        q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
+                                else
+                                        q = touch(mount_entry_path(m));
+
+                                if (q < 0)
+                                        log_error_errno(q, "Failed to create destination mount point node '%s': %m",
+                                                        mount_entry_path(m));
+                                else
+                                        try_again = true;
+                        }
+                }
+                /* If no retry is made, r still holds the error of the first attempt and is reported below. */
+                if (try_again)
+                        r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
+        }
+
+        log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
+        return 0;
+}
+/* Turns on MS_RDONLY and/or MS_NOSUID for the mount at the path of m: MS_RDONLY if the entry is
+ * read-only or has mode PRIVATE_DEV, MS_NOSUID if the entry's nosuid flag is set. Does nothing if
+ * neither applies. Returns 0 on success (or if the path is missing and the entry has ignore set),
+ * negative errno-style value on failure. */
+static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+        unsigned long new_flags = 0, flags_mask = 0;
+        bool submounts = false;
+        int r = 0;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
+                new_flags |= MS_RDONLY;
+                flags_mask |= MS_RDONLY;
+        }
+
+        if (m->nosuid) {
+                new_flags |= MS_NOSUID;
+                flags_mask |= MS_NOSUID;
+        }
+
+        if (flags_mask == 0) /* No Change? */
+                return 0;
+
+        /* We generally apply these changes recursively, except for /dev, and the cases we know there's
+         * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
+         * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
+         * and running Linux <= 4.17.
*/
+        submounts =
+                mount_entry_read_only(m) &&
+                !IN_SET(m->mode, EMPTY_DIR, TMPFS);
+        if (submounts)
+                r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
+        else
+                r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
+
+        /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
+         * read-only already stays this way. This improves compatibility with container managers, where we
+         * won't attempt to undo read-only mounts already applied. */
+
+        if (r == -ENOENT && m->ignore)
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+                                       submounts ? " and its submounts" : "");
+        return 0;
+}
+/* Returns true if the API VFS mounts are needed for the given settings, either because they were
+ * requested explicitly or because another setting depends on them. */
+static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
+        assert(ns_info);
+
+        /*
+         * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
+         * since to protect the API VFS mounts, they need to be around in the
+         * first place...
+         * A protect_proc value other than PROTECT_PROC_DEFAULT or a proc_subset
+         * value other than PROC_SUBSET_ALL has the same effect.
+         */
+
+        return ns_info->mount_apivfs ||
+                ns_info->protect_control_groups ||
+                ns_info->protect_kernel_tunables ||
+                ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+                ns_info->proc_subset != PROC_SUBSET_ALL;
+}
+/* Sums up how many mount entries the given settings contribute: one per listed path, bind mount,
+ * temporary file system and image, the element counts of the static tables selected by the protect_*
+ * settings, and a few fixed amounts for individual settings.
+ * NOTE(review): presumably used to size the MountEntry array before it is filled; confirm in the caller. */
+static size_t namespace_calculate_mounts(
+                const NamespaceInfo *ns_info,
+                char** read_write_paths,
+                char** read_only_paths,
+                char** inaccessible_paths,
+                char** empty_directories,
+                size_t n_bind_mounts,
+                size_t n_temporary_filesystems,
+                size_t n_mount_images,
+                const char* tmp_dir,
+                const char* var_tmp_dir,
+                const char *creds_path,
+                const char* log_namespace) {
+
+        size_t protect_home_cnt;
+        size_t protect_system_cnt =
+                (ns_info->protect_system == PROTECT_SYSTEM_STRICT ?
+                 ELEMENTSOF(protect_system_strict_table) :
+                 ((ns_info->protect_system == PROTECT_SYSTEM_FULL) ?
+                  ELEMENTSOF(protect_system_full_table) :
+                  ((ns_info->protect_system == PROTECT_SYSTEM_YES) ?
+                   ELEMENTSOF(protect_system_yes_table) : 0)));
+
+        protect_home_cnt =
+                (ns_info->protect_home == PROTECT_HOME_YES ?
+                 ELEMENTSOF(protect_home_yes_table) :
+                 ((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ?
+                  ELEMENTSOF(protect_home_read_only_table) :
+                  ((ns_info->protect_home == PROTECT_HOME_TMPFS) ?
+                   ELEMENTSOF(protect_home_tmpfs_table) : 0)));
+        /* NOTE(review): the constants for protect_control_groups (1), protect_hostname (2) and creds_path (2 or 1) are presumably the numbers of entries added for them elsewhere; keep them in sync. */
+        return !!tmp_dir + !!var_tmp_dir +
+                strv_length(read_write_paths) +
+                strv_length(read_only_paths) +
+                strv_length(inaccessible_paths) +
+                strv_length(empty_directories) +
+                n_bind_mounts +
+                n_mount_images +
+                n_temporary_filesystems +
+                ns_info->private_dev +
+                (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
+                (ns_info->protect_kernel_logs ? ELEMENTSOF(protect_kernel_logs_table) : 0) +
+                (ns_info->protect_control_groups ? 1 : 0) +
+                protect_home_cnt + protect_system_cnt +
+                (ns_info->protect_hostname ? 2 : 0) +
+                (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0) +
+                (creds_path ? 2 : 1) +
+                !!log_namespace;
+}
+/* Sorts the entries with mount_path_compare() and then prunes the array in place, in this order:
+ * duplicates, entries outside of root_directory, entries below an INACCESSIBLE entry, and entries made
+ * redundant by a parent of the same mode. *n_mounts is updated to the number of entries left. */
+static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
+        assert(root_directory);
+        assert(n_mounts);
+        assert(mounts || *n_mounts == 0);
+
+        typesafe_qsort(mounts, *n_mounts, mount_path_compare);
+
+        drop_duplicates(mounts, n_mounts);
+        drop_outside_root(root_directory, mounts, n_mounts);
+        drop_inaccessible(mounts, n_mounts);
+        drop_nop(mounts, n_mounts);
+}
+/* Returns true if protect_system is PROTECT_SYSTEM_STRICT or if read_only_paths lists "/". */
+static bool root_read_only(
+                char **read_only_paths,
+                ProtectSystem protect_system) {
+
+        /* Determine whether the root directory is going to be read-only given the configured settings.
*/
+
+        if (protect_system == PROTECT_SYSTEM_STRICT)
+                return true;
+
+        if (prefixed_path_strv_contains(read_only_paths, "/"))
+                return true;
+
+        return false;
+}
+/* Returns true if any setting takes /home away from the underlying file system in writable form:
+ * protect_home is anything but PROTECT_HOME_NO, "/home" is listed in read_only_paths,
+ * inaccessible_paths or empty_directories, or "/home" is the path of a temporary file system or the
+ * destination of a bind mount. */
+static bool home_read_only(
+                char** read_only_paths,
+                char** inaccessible_paths,
+                char** empty_directories,
+                const BindMount *bind_mounts,
+                size_t n_bind_mounts,
+                const TemporaryFileSystem *temporary_filesystems,
+                size_t n_temporary_filesystems,
+                ProtectHome protect_home) {
+
+        size_t i;
+
+        /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
+         * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
+         * settings. */
+
+        if (protect_home != PROTECT_HOME_NO)
+                return true;
+
+        if (prefixed_path_strv_contains(read_only_paths, "/home") ||
+            prefixed_path_strv_contains(inaccessible_paths, "/home") ||
+            prefixed_path_strv_contains(empty_directories, "/home"))
+                return true;
+
+        for (i = 0; i < n_temporary_filesystems; i++)
+                if (path_equal(temporary_filesystems[i].path, "/home"))
+                        return true;
+
+        /* If /home is overmounted with some dir from the host it's not writable.
*/
+        for (i = 0; i < n_bind_mounts; i++)
+                if (path_equal(bind_mounts[i].destination, "/home"))
+                        return true;
+
+        return false;
+}
+/* Fills in *verity from the explicitly passed values and then lets verity_settings_load() process
+ * root_image together with the root hash and signature paths. A passed root hash and root hash
+ * signature are duplicated into *verity (replacing previous values) and set the designator to
+ * PARTITION_ROOT; a passed verity_data_path is duplicated as well. Returns 0 on success, -ENOMEM or
+ * another negative errno-style value on failure. */
+static int verity_settings_prepare(
+                VeritySettings *verity,
+                const char *root_image,
+                const void *root_hash,
+                size_t root_hash_size,
+                const char *root_hash_path,
+                const void *root_hash_sig,
+                size_t root_hash_sig_size,
+                const char *root_hash_sig_path,
+                const char *verity_data_path) {
+
+        int r;
+
+        assert(verity);
+
+        if (root_hash) {
+                void *d;
+
+                d = memdup(root_hash, root_hash_size);
+                if (!d)
+                        return -ENOMEM;
+
+                free_and_replace(verity->root_hash, d);
+                verity->root_hash_size = root_hash_size;
+                verity->designator = PARTITION_ROOT;
+        }
+
+        if (root_hash_sig) {
+                void *d;
+
+                d = memdup(root_hash_sig, root_hash_sig_size);
+                if (!d)
+                        return -ENOMEM;
+
+                free_and_replace(verity->root_hash_sig, d);
+                verity->root_hash_sig_size = root_hash_sig_size;
+                verity->designator = PARTITION_ROOT;
+        }
+
+        if (verity_data_path) {
+                r = free_and_strdup(&verity->data_path, verity_data_path);
+                if (r < 0)
+                        return r;
+        }
+        /* Called unconditionally, also when none of the optional values above was passed. */
+        r = verity_settings_load(
+                        verity,
+                        root_image,
+                        root_hash_path,
+                        root_hash_sig_path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load root hash: %m");
+
+        return 0;
+}
+
+int setup_namespace(
+                const char* root_directory,
+                const char* root_image,
+                const MountOptions *root_image_options,
+                const NamespaceInfo *ns_info,
+                char** read_write_paths,
+                char** read_only_paths,
+                char** inaccessible_paths,
+                char** empty_directories,
+                const BindMount *bind_mounts,
+                size_t n_bind_mounts,
+                const TemporaryFileSystem *temporary_filesystems,
+                size_t n_temporary_filesystems,
+                const MountImage *mount_images,
+                size_t n_mount_images,
+                const char* tmp_dir,
+                const char* var_tmp_dir,
+                const char *creds_path,
+                const char *log_namespace,
+                unsigned long mount_flags,
+                const void *root_hash,
+                size_t root_hash_size,
+                const char *root_hash_path,
+                const void *root_hash_sig,
+                size_t root_hash_sig_size,
+                const
char *root_hash_sig_path, + const char *verity_data_path, + DissectImageFlags dissect_image_flags, + char **error_path) { + + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; + MountEntry *m = NULL, *mounts = NULL; + bool require_prefix = false; + const char *root; + size_t n_mounts; + int r; + + assert(ns_info); + + if (mount_flags == 0) + mount_flags = MS_SHARED; + + if (root_image) { + dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT; + + /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */ + if (root_read_only(read_only_paths, + ns_info->protect_system) && + home_read_only(read_only_paths, inaccessible_paths, empty_directories, + bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems, + ns_info->protect_home) && + strv_isempty(read_write_paths)) + dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; + + r = verity_settings_prepare( + &verity, + root_image, + root_hash, root_hash_size, root_hash_path, + root_hash_sig, root_hash_sig_size, root_hash_sig_path, + verity_data_path); + if (r < 0) + return r; + + SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path); + + r = loop_device_make_by_path( + root_image, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 
0 : LO_FLAGS_PARTSCAN, + &loop_device); + if (r < 0) + return log_debug_errno(r, "Failed to create loop device for root image: %m"); + + r = dissect_image( + loop_device->fd, + &verity, + root_image_options, + dissect_image_flags, + &dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to dissect image: %m"); + + r = dissected_image_decrypt( + dissected_image, + NULL, + &verity, + dissect_image_flags, + &decrypted_image); + if (r < 0) + return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + } + + if (root_directory) + root = root_directory; + else { + /* /run/systemd should have been created by PID 1 early on already, but in some cases, like + * when running tests (test-execute), it might not have been created yet so let's make sure + * we create it if it doesn't already exist. */ + (void) mkdir_p_label("/run/systemd", 0755); + + /* Always create the mount namespace in a temporary directory, instead of operating + * directly in the root. The temporary directory prevents any mounts from being + * potentially obscured by other mounts we already applied. + * We use the same mount point for all images, which is safe, since they all live + * in their own namespaces after all, and hence won't see each other. 
*/ + + root = "/run/systemd/unit-root"; + (void) mkdir_label(root, 0700); + require_prefix = true; + } + + n_mounts = namespace_calculate_mounts( + ns_info, + read_write_paths, + read_only_paths, + inaccessible_paths, + empty_directories, + n_bind_mounts, + n_temporary_filesystems, + n_mount_images, + tmp_dir, var_tmp_dir, + creds_path, + log_namespace); + + if (n_mounts > 0) { + m = mounts = new0(MountEntry, n_mounts); + if (!mounts) + return -ENOMEM; + + r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix); + if (r < 0) + goto finish; + + r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix); + if (r < 0) + goto finish; + + r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix); + if (r < 0) + goto finish; + + r = append_empty_dir_mounts(&m, empty_directories); + if (r < 0) + goto finish; + + r = append_bind_mounts(&m, bind_mounts, n_bind_mounts); + if (r < 0) + goto finish; + + r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems); + if (r < 0) + goto finish; + + if (tmp_dir) { + bool ro = streq(tmp_dir, RUN_SYSTEMD_EMPTY); + + *(m++) = (MountEntry) { + .path_const = "/tmp", + .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP, + .source_const = tmp_dir, + }; + } + + if (var_tmp_dir) { + bool ro = streq(var_tmp_dir, RUN_SYSTEMD_EMPTY); + + *(m++) = (MountEntry) { + .path_const = "/var/tmp", + .mode = ro ? 
PRIVATE_TMP_READONLY : PRIVATE_TMP, + .source_const = var_tmp_dir, + }; + } + + r = append_mount_images(&m, mount_images, n_mount_images); + if (r < 0) + goto finish; + + if (ns_info->private_dev) + *(m++) = (MountEntry) { + .path_const = "/dev", + .mode = PRIVATE_DEV, + .flags = DEV_MOUNT_OPTIONS, + }; + + if (ns_info->protect_kernel_tunables) { + r = append_static_mounts(&m, + protect_kernel_tunables_table, + ELEMENTSOF(protect_kernel_tunables_table), + ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + if (ns_info->protect_kernel_modules) { + r = append_static_mounts(&m, + protect_kernel_modules_table, + ELEMENTSOF(protect_kernel_modules_table), + ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + if (ns_info->protect_kernel_logs) { + r = append_static_mounts(&m, + protect_kernel_logs_table, + ELEMENTSOF(protect_kernel_logs_table), + ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + if (ns_info->protect_control_groups) + *(m++) = (MountEntry) { + .path_const = "/sys/fs/cgroup", + .mode = READONLY, + }; + + r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + + r = append_protect_system(&m, ns_info->protect_system, false); + if (r < 0) + goto finish; + + if (namespace_info_mount_apivfs(ns_info)) { + r = append_static_mounts(&m, + apivfs_table, + ELEMENTSOF(apivfs_table), + ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + + if (ns_info->protect_hostname) { + *(m++) = (MountEntry) { + .path_const = "/proc/sys/kernel/hostname", + .mode = READONLY, + }; + *(m++) = (MountEntry) { + .path_const = "/proc/sys/kernel/domainname", + .mode = READONLY, + }; + } + + if (creds_path) { + /* If our service has a credentials store configured, then bind that one in, but hide + * everything else. 
*/ + + *(m++) = (MountEntry) { + .path_const = "/run/credentials", + .mode = TMPFS, + .read_only = true, + .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, + .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC, + }; + + *(m++) = (MountEntry) { + .path_const = creds_path, + .mode = BIND_MOUNT, + .read_only = true, + .source_const = creds_path, + }; + } else { + /* If our service has no credentials store configured, then make the whole + * credentials tree inaccessible wholesale. */ + + *(m++) = (MountEntry) { + .path_const = "/run/credentials", + .mode = INACCESSIBLE, + .ignore = true, + }; + } + + if (log_namespace) { + _cleanup_free_ char *q; + + q = strjoin("/run/systemd/journal.", log_namespace); + if (!q) { + r = -ENOMEM; + goto finish; + } + + *(m++) = (MountEntry) { + .path_const = "/run/systemd/journal", + .mode = BIND_MOUNT_RECURSIVE, + .read_only = true, + .source_malloc = TAKE_PTR(q), + }; + } + + assert(mounts + n_mounts == m); + + /* Prepend the root directory where that's necessary */ + r = prefix_where_needed(mounts, n_mounts, root); + if (r < 0) + goto finish; + + normalize_mounts(root, mounts, &n_mounts); + } + + /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */ + + if (unshare(CLONE_NEWNS) < 0) { + r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m"); + if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS)) + /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter + * in place that doesn't allow us to create namespaces (or a missing cap), then + * propagate a recognizable error back, which the caller can use to detect this case + * (and only this) and optionally continue without namespacing applied. 
*/ + r = -ENOANO; + + goto finish; + } + + /* Remount / as SLAVE so that nothing now mounted in the namespace + * shows up in the parent */ + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { + r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m"); + goto finish; + } + + if (root_image) { + /* A root image is specified, mount it to the right place */ + r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags); + if (r < 0) { + log_debug_errno(r, "Failed to mount root image: %m"); + goto finish; + } + + if (decrypted_image) { + r = decrypted_image_relinquish(decrypted_image); + if (r < 0) { + log_debug_errno(r, "Failed to relinquish decrypted image: %m"); + goto finish; + } + } + + loop_device_relinquish(loop_device); + + } else if (root_directory) { + + /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */ + r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW); + if (r < 0) { + log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root); + goto finish; + } + if (r == 0) { + r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + goto finish; + } + + } else { + /* Let's mount the main root directory to the root directory to use */ + r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + goto finish; + } + + /* Try to set up the new root directory before mounting anything else there. */ + if (root_image || root_directory) + (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); + + if (n_mounts > 0) { + _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; + _cleanup_free_ char **deny_list = NULL; + size_t j; + + /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of + * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. 
*/ + proc_self_mountinfo = fopen("/proc/self/mountinfo", "re"); + if (!proc_self_mountinfo) { + r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m"); + if (error_path) + *error_path = strdup("/proc/self/mountinfo"); + goto finish; + } + + /* First round, establish all mounts we need */ + for (;;) { + bool again = false; + + for (m = mounts; m < mounts + n_mounts; ++m) { + + if (m->applied) + continue; + + r = follow_symlink(root, m); + if (r < 0) { + if (error_path && mount_entry_path(m)) + *error_path = strdup(mount_entry_path(m)); + goto finish; + } + if (r == 0) { + /* We hit a symlinked mount point. The entry got rewritten and might + * point to a very different place now. Let's normalize the changed + * list, and start from the beginning. After all to mount the entry + * at the new location we might need some other mounts first */ + again = true; + break; + } + + r = apply_mount(root, m, ns_info); + if (r < 0) { + if (error_path && mount_entry_path(m)) + *error_path = strdup(mount_entry_path(m)); + goto finish; + } + + m->applied = true; + } + + if (!again) + break; + + normalize_mounts(root, mounts, &n_mounts); + } + + /* Create a deny list we can pass to bind_mount_recursive() */ + deny_list = new(char*, n_mounts+1); + if (!deny_list) { + r = -ENOMEM; + goto finish; + } + for (j = 0; j < n_mounts; j++) + deny_list[j] = (char*) mount_entry_path(mounts+j); + deny_list[j] = NULL; + + /* Second round, flip the ro bits if necessary. */ + for (m = mounts; m < mounts + n_mounts; ++m) { + r = make_read_only(m, deny_list, proc_self_mountinfo); + if (r < 0) { + if (error_path && mount_entry_path(m)) + *error_path = strdup(mount_entry_path(m)); + goto finish; + } + } + } + + /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ + r = mount_move_root(root); + if (r < 0) { + log_debug_errno(r, "Failed to mount root with MS_MOVE: %m"); + goto finish; + } + + /* Remount / as the desired mode. 
Note that this will not + * reestablish propagation from our side to the host, since + * what's disconnected is disconnected. */ + if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) { + r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m"); + goto finish; + } + + r = 0; + +finish: + if (n_mounts > 0) + for (m = mounts; m < mounts + n_mounts; m++) + mount_entry_done(m); + + free(mounts); + + return r; +} + +void bind_mount_free_many(BindMount *b, size_t n) { + size_t i; + + assert(b || n == 0); + + for (i = 0; i < n; i++) { + free(b[i].source); + free(b[i].destination); + } + + free(b); +} + +int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) { + _cleanup_free_ char *s = NULL, *d = NULL; + BindMount *c; + + assert(b); + assert(n); + assert(item); + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + d = strdup(item->destination); + if (!d) + return -ENOMEM; + + c = reallocarray(*b, *n + 1, sizeof(BindMount)); + if (!c) + return -ENOMEM; + + *b = c; + + c[(*n) ++] = (BindMount) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .read_only = item->read_only, + .nosuid = item->nosuid, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + + return 0; +} + +MountImage* mount_image_free_many(MountImage *m, size_t *n) { + size_t i; + + assert(n); + assert(m || *n == 0); + + for (i = 0; i < *n; i++) { + free(m[i].source); + free(m[i].destination); + mount_options_free_all(m[i].mount_options); + } + + free(m); + *n = 0; + return NULL; +} + +int mount_image_add(MountImage **m, size_t *n, const MountImage *item) { + _cleanup_free_ char *s = NULL, *d = NULL; + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + MountOptions *i; + MountImage *c; + + assert(m); + assert(n); + assert(item); + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + d = strdup(item->destination); + if (!d) + return -ENOMEM; + + LIST_FOREACH(mount_options, i, item->mount_options) { + 
_cleanup_(mount_options_free_allp) MountOptions *o; + + o = new(MountOptions, 1); + if (!o) + return -ENOMEM; + + *o = (MountOptions) { + .partition_designator = i->partition_designator, + .options = strdup(i->options), + }; + if (!o->options) + return -ENOMEM; + + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + + c = reallocarray(*m, *n + 1, sizeof(MountImage)); + if (!c) + return -ENOMEM; + + *m = c; + + c[(*n) ++] = (MountImage) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .mount_options = TAKE_PTR(options), + .ignore_enoent = item->ignore_enoent, + }; + + return 0; +} + +void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) { + size_t i; + + assert(t || n == 0); + + for (i = 0; i < n; i++) { + free(t[i].path); + free(t[i].options); + } + + free(t); +} + +int temporary_filesystem_add( + TemporaryFileSystem **t, + size_t *n, + const char *path, + const char *options) { + + _cleanup_free_ char *p = NULL, *o = NULL; + TemporaryFileSystem *c; + + assert(t); + assert(n); + assert(path); + + p = strdup(path); + if (!p) + return -ENOMEM; + + if (!isempty(options)) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem)); + if (!c) + return -ENOMEM; + + *t = c; + + c[(*n) ++] = (TemporaryFileSystem) { + .path = TAKE_PTR(p), + .options = TAKE_PTR(o), + }; + + return 0; +} + +static int make_tmp_prefix(const char *prefix) { + _cleanup_free_ char *t = NULL; + int r; + + /* Don't do anything unless we know the dir is actually missing */ + r = access(prefix, F_OK); + if (r >= 0) + return 0; + if (errno != ENOENT) + return -errno; + + r = mkdir_parents(prefix, 0755); + if (r < 0) + return r; + + r = tempfn_random(prefix, NULL, &t); + if (r < 0) + return r; + + if (mkdir(t, 0777) < 0) + return -errno; + + if (chmod(t, 01777) < 0) { + r = -errno; + (void) rmdir(t); + return r; + } + + if (rename(t, prefix) < 0) { + r = -errno; + (void) rmdir(t); + return r == -EEXIST ? 
0 : r; /* it's fine if someone else created the dir by now */ + } + + return 0; + +} + +static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) { + _cleanup_free_ char *x = NULL; + _cleanup_free_ char *y = NULL; + char bid[SD_ID128_STRING_MAX]; + sd_id128_t boot_id; + bool rw = true; + int r; + + assert(id); + assert(prefix); + assert(path); + + /* We include the boot id in the directory so that after a + * reboot we can easily identify obsolete directories. */ + + r = sd_id128_get_boot(&boot_id); + if (r < 0) + return r; + + x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX"); + if (!x) + return -ENOMEM; + + r = make_tmp_prefix(prefix); + if (r < 0) + return r; + + RUN_WITH_UMASK(0077) + if (!mkdtemp(x)) { + if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno)) + rw = false; + else + return -errno; + } + + if (rw) { + y = strjoin(x, "/tmp"); + if (!y) + return -ENOMEM; + + RUN_WITH_UMASK(0000) { + if (mkdir(y, 0777 | S_ISVTX) < 0) + return -errno; + } + + r = label_fix_container(y, prefix, 0); + if (r < 0) + return r; + + if (tmp_path) + *tmp_path = TAKE_PTR(y); + } else { + /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being + * read-only. This way the service will get the EROFS result as if it was writing to the real + * file system. 
*/ + r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500); + if (r < 0) + return r; + + r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY); + if (r < 0) + return r; + } + + *path = TAKE_PTR(x); + return 0; +} + +int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) { + _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL; + _cleanup_(rmdir_and_freep) char *a_tmp = NULL; + char *b; + int r; + + assert(id); + assert(tmp_dir); + assert(var_tmp_dir); + + r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp); + if (r < 0) + return r; + + r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL); + if (r < 0) + return r; + + a_tmp = mfree(a_tmp); /* avoid rmdir */ + *tmp_dir = TAKE_PTR(a); + *var_tmp_dir = TAKE_PTR(b); + + return 0; +} + +int setup_netns(const int netns_storage_socket[static 2]) { + _cleanup_close_ int netns = -1; + int r, q; + + assert(netns_storage_socket); + assert(netns_storage_socket[0] >= 0); + assert(netns_storage_socket[1] >= 0); + + /* We use the passed socketpair as a storage buffer for our + * namespace reference fd. Whatever process runs this first + * shall create a new namespace, all others should just join + * it. To serialize that we use a file lock on the socket + * pair. + * + * It's a bit crazy, but hey, works great! */ + + if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) + return -errno; + + netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); + if (netns == -EAGAIN) { + /* Nothing stored yet, so let's create a new namespace. 
*/ + + if (unshare(CLONE_NEWNET) < 0) { + r = -errno; + goto fail; + } + + (void) loopback_setup(); + + netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (netns < 0) { + r = -errno; + goto fail; + } + + r = 1; + + } else if (netns < 0) { + r = netns; + goto fail; + + } else { + /* Yay, found something, so let's join the namespace */ + if (setns(netns, CLONE_NEWNET) < 0) { + r = -errno; + goto fail; + } + + r = 0; + } + + q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); + if (q < 0) { + r = q; + goto fail; + } + +fail: + (void) lockf(netns_storage_socket[0], F_ULOCK, 0); + return r; +} + +int open_netns_path(const int netns_storage_socket[static 2], const char *path) { + _cleanup_close_ int netns = -1; + int q, r; + + assert(netns_storage_socket); + assert(netns_storage_socket[0] >= 0); + assert(netns_storage_socket[1] >= 0); + assert(path); + + /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in + * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a + * new anonymous netns if needed. */ + + if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) + return -errno; + + netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); + if (netns == -EAGAIN) { + /* Nothing stored yet. Open the file from the file system. */ + + netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (netns < 0) { + r = -errno; + goto fail; + } + + r = fd_is_network_ns(netns); + if (r == 0) { /* Not a netns? Refuse early. 
*/ + r = -EINVAL; + goto fail; + } + if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */ + goto fail; + + r = 1; + + } else if (netns < 0) { + r = netns; + goto fail; + } else + r = 0; /* Already allocated */ + + q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); + if (q < 0) { + r = q; + goto fail; + } + +fail: + (void) lockf(netns_storage_socket[0], F_ULOCK, 0); + return r; +} + +bool ns_type_supported(NamespaceType type) { + const char *t, *ns_proc; + + t = namespace_type_to_string(type); + if (!t) /* Don't know how to translate this? Then it's not supported */ + return false; + + ns_proc = strjoina("/proc/self/ns/", t); + return access(ns_proc, F_OK) == 0; +} + +static const char *const protect_home_table[_PROTECT_HOME_MAX] = { + [PROTECT_HOME_NO] = "no", + [PROTECT_HOME_YES] = "yes", + [PROTECT_HOME_READ_ONLY] = "read-only", + [PROTECT_HOME_TMPFS] = "tmpfs", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES); + +static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { + [PROTECT_SYSTEM_NO] = "no", + [PROTECT_SYSTEM_YES] = "yes", + [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES); + +static const char* const namespace_type_table[] = { + [NAMESPACE_MOUNT] = "mnt", + [NAMESPACE_CGROUP] = "cgroup", + [NAMESPACE_UTS] = "uts", + [NAMESPACE_IPC] = "ipc", + [NAMESPACE_USER] = "user", + [NAMESPACE_PID] = "pid", + [NAMESPACE_NET] = "net", +}; + +DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType); + +static const char* const protect_proc_table[_PROTECT_PROC_MAX] = { + [PROTECT_PROC_DEFAULT] = "default", + [PROTECT_PROC_NOACCESS] = "noaccess", + [PROTECT_PROC_INVISIBLE] = "invisible", + [PROTECT_PROC_PTRACEABLE] = "ptraceable", +}; + +DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc); + +static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { + [PROC_SUBSET_ALL] = 
"all", + [PROC_SUBSET_PID] = "pid", +}; + +DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); diff --git a/src/core/namespace.h b/src/core/namespace.h new file mode 100644 index 0000000..da0861c --- /dev/null +++ b/src/core/namespace.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2016 Djalal Harouni +***/ + +typedef struct NamespaceInfo NamespaceInfo; +typedef struct BindMount BindMount; +typedef struct TemporaryFileSystem TemporaryFileSystem; +typedef struct MountImage MountImage; + +#include <stdbool.h> + +#include "dissect-image.h" +#include "fs-util.h" +#include "macro.h" +#include "string-util.h" + +typedef enum ProtectHome { + PROTECT_HOME_NO, + PROTECT_HOME_YES, + PROTECT_HOME_READ_ONLY, + PROTECT_HOME_TMPFS, + _PROTECT_HOME_MAX, + _PROTECT_HOME_INVALID = -1 +} ProtectHome; + +typedef enum NamespaceType { + NAMESPACE_MOUNT, + NAMESPACE_CGROUP, + NAMESPACE_UTS, + NAMESPACE_IPC, + NAMESPACE_USER, + NAMESPACE_PID, + NAMESPACE_NET, + _NAMESPACE_TYPE_MAX, + _NAMESPACE_TYPE_INVALID = -1, +} NamespaceType; + +typedef enum ProtectSystem { + PROTECT_SYSTEM_NO, + PROTECT_SYSTEM_YES, + PROTECT_SYSTEM_FULL, + PROTECT_SYSTEM_STRICT, + _PROTECT_SYSTEM_MAX, + _PROTECT_SYSTEM_INVALID = -1 +} ProtectSystem; + +typedef enum ProtectProc { + PROTECT_PROC_DEFAULT, + PROTECT_PROC_NOACCESS, /* hidepid=noaccess */ + PROTECT_PROC_INVISIBLE, /* hidepid=invisible */ + PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */ + _PROTECT_PROC_MAX, + _PROTECT_PROC_INVALID = -1, +} ProtectProc; + +typedef enum ProcSubset { + PROC_SUBSET_ALL, + PROC_SUBSET_PID, /* subset=pid */ + _PROC_SUBSET_MAX, + _PROC_SUBSET_INVALID = -1, +} ProcSubset; + +struct NamespaceInfo { + bool ignore_protect_paths; + bool private_dev; + bool private_mounts; + bool protect_control_groups; + bool protect_kernel_tunables; + bool protect_kernel_modules; + bool protect_kernel_logs; + bool mount_apivfs; + bool protect_hostname; + ProtectHome protect_home; + 
ProtectSystem protect_system; + ProtectProc protect_proc; + ProcSubset proc_subset; +}; + +struct BindMount { + char *source; + char *destination; + bool read_only; + bool nosuid; + bool recursive; + bool ignore_enoent; +}; + +struct TemporaryFileSystem { + char *path; + char *options; +}; + +struct MountImage { + char *source; + char *destination; + LIST_HEAD(MountOptions, mount_options); + bool ignore_enoent; +}; + +int setup_namespace( + const char *root_directory, + const char *root_image, + const MountOptions *root_image_options, + const NamespaceInfo *ns_info, + char **read_write_paths, + char **read_only_paths, + char **inaccessible_paths, + char **empty_directories, + const BindMount *bind_mounts, + size_t n_bind_mounts, + const TemporaryFileSystem *temporary_filesystems, + size_t n_temporary_filesystems, + const MountImage *mount_images, + size_t n_mount_images, + const char *tmp_dir, + const char *var_tmp_dir, + const char *creds_path, + const char *log_namespace, + unsigned long mount_flags, + const void *root_hash, + size_t root_hash_size, + const char *root_hash_path, + const void *root_hash_sig, + size_t root_hash_sig_size, + const char *root_hash_sig_path, + const char *root_verity, + DissectImageFlags dissected_image_flags, + char **error_path); + +#define RUN_SYSTEMD_EMPTY "/run/systemd/empty" + +static inline void namespace_cleanup_tmpdir(char *p) { + PROTECT_ERRNO; + if (!streq_ptr(p, RUN_SYSTEMD_EMPTY)) + (void) rmdir(p); + free(p); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, namespace_cleanup_tmpdir); + +int setup_tmp_dirs( + const char *id, + char **tmp_dir, + char **var_tmp_dir); + +int setup_netns(const int netns_storage_socket[static 2]); +int open_netns_path(const int netns_storage_socket[static 2], const char *path); + +const char* protect_home_to_string(ProtectHome p) _const_; +ProtectHome protect_home_from_string(const char *s) _pure_; + +const char* protect_system_to_string(ProtectSystem p) _const_; +ProtectSystem 
protect_system_from_string(const char *s) _pure_; + +const char* protect_proc_to_string(ProtectProc i) _const_; +ProtectProc protect_proc_from_string(const char *s) _pure_; + +const char* proc_subset_to_string(ProcSubset i) _const_; +ProcSubset proc_subset_from_string(const char *s) _pure_; + +void bind_mount_free_many(BindMount *b, size_t n); +int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); + +void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n); +int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n, + const char *path, const char *options); + +MountImage* mount_image_free_many(MountImage *m, size_t *n); +int mount_image_add(MountImage **m, size_t *n, const MountImage *item); + +const char* namespace_type_to_string(NamespaceType t) _const_; +NamespaceType namespace_type_from_string(const char *s) _pure_; + +bool ns_type_supported(NamespaceType type); diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf new file mode 100644 index 0000000..8b32379 --- /dev/null +++ b/src/core/org.freedesktop.systemd1.conf @@ -0,0 +1,404 @@ +<?xml version="1.0"?> <!--*-nxml-*--> +<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN" + "http://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd"> + +<!-- + SPDX-License-Identifier: LGPL-2.1-or-later + + This file is part of systemd. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+--> + +<busconfig> + + <policy user="root"> + <allow own="org.freedesktop.systemd1"/> + + <!-- Root clients can do everything --> + <allow send_destination="org.freedesktop.systemd1"/> + <allow receive_sender="org.freedesktop.systemd1"/> + + <!-- systemd may receive activator requests --> + <allow receive_interface="org.freedesktop.systemd1.Activator" + receive_member="ActivationRequest"/> + </policy> + + <policy context="default"> + <deny send_destination="org.freedesktop.systemd1"/> + + <!-- Completely open to anyone: org.freedesktop.DBus.* interfaces --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Introspectable"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Peer"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Properties" + send_member="Get"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.DBus.Properties" + send_member="GetAll"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Manager interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitByPID"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitByInvocationID"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitByControlGroup"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LoadUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitProcesses"/> + + <allow 
send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetJob"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetJobAfter"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetJobBefore"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnits"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitsFiltered"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitsByPatterns"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitsByNames"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListJobs"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Subscribe"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Unsubscribe"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Dump"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="DumpByFileDescriptor"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ListUnitFilesByPatterns"/> + + <allow send_destination="org.freedesktop.systemd1" + 
send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitFileState"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetDefaultTarget"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetUnitFileLinks"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LookupDynamicUserByName"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LookupDynamicUserByUID"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="GetDynamicUsers"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Service interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Service" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Slice interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Slice" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Scope interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Scope" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Socket interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Socket" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Mount interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Mount" + send_member="GetProcesses"/> + + <!-- Completely open to anyone: org.freedesktop.systemd1.Swap interface --> + + <allow 
send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Swap" + send_member="GetProcesses"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Manager interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StartUnitReplace"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StopUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReloadUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="RestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="TryRestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReloadOrRestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReloadOrTryRestartUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="KillUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ResetFailedUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="SetUnitProperties"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="RefUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + 
send_member="UnrefUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="StartTransientUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="AttachProcessesToUnit"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="CancelJob"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ClearJobs"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ResetFailed"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Reload"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="Reexecute"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="EnableUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="DisableUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="ReenableUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="LinkUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="PresetUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="PresetUnitFilesWithMode"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="MaskUnitFiles"/> + + <allow 
send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="UnmaskUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="RevertUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="SetDefaultTarget"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="PresetAllUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="AddDependencyUnitFiles"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Manager" + send_member="SetShowStatus"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Job interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Job" + send_member="Cancel"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Job" + send_member="GetAfter"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Job" + send_member="GetBefore"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Unit interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Start"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Stop"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Reload"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Restart"/> + + <allow send_destination="org.freedesktop.systemd1" + 
send_interface="org.freedesktop.systemd1.Unit" + send_member="TryRestart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="ReloadOrRestart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="ReloadOrTryRestart"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Kill"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="ResetFailed"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="SetProperties"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Ref"/> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Unit" + send_member="Unref"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Service interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Service" + send_member="AttachProcesses"/> + + <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Scope interface --> + + <allow send_destination="org.freedesktop.systemd1" + send_interface="org.freedesktop.systemd1.Scope" + send_member="AttachProcesses"/> + + <allow receive_sender="org.freedesktop.systemd1"/> + </policy> + +</busconfig> diff --git a/src/core/org.freedesktop.systemd1.policy.in b/src/core/org.freedesktop.systemd1.policy.in new file mode 100644 index 0000000..a6d40d7 --- /dev/null +++ b/src/core/org.freedesktop.systemd1.policy.in @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*--> +<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN" + 
"http://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd"> + +<!-- + SPDX-License-Identifier: LGPL-2.1-or-later + + This file is part of systemd. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +--> + +<policyconfig> + + <vendor>The systemd Project</vendor> + <vendor_url>http://www.freedesktop.org/wiki/Software/systemd</vendor_url> + + <action id="org.freedesktop.systemd1.reply-password"> + <description gettext-domain="systemd">Send passphrase back to system</description> + <message gettext-domain="systemd">Authentication is required to send the entered passphrase back to the system.</message> + <defaults> + <allow_any>no</allow_any> + <allow_inactive>no</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + <annotate key="org.freedesktop.policykit.exec.path">@rootlibexecdir@/systemd-reply-password</annotate> + </action> + + <action id="org.freedesktop.systemd1.manage-units"> + <description gettext-domain="systemd">Manage system services or other units</description> + <message gettext-domain="systemd">Authentication is required to manage system services or other units.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + </action> + + <action id="org.freedesktop.systemd1.manage-unit-files"> + <description gettext-domain="systemd">Manage system service or unit files</description> + <message gettext-domain="systemd">Authentication is required to manage system service or unit files.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + <annotate 
key="org.freedesktop.policykit.imply">org.freedesktop.systemd1.reload-daemon org.freedesktop.systemd1.manage-units</annotate> + </action> + + <action id="org.freedesktop.systemd1.set-environment"> + <description gettext-domain="systemd">Set or unset system and service manager environment variables</description> + <message gettext-domain="systemd">Authentication is required to set or unset system and service manager environment variables.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + </action> + + <action id="org.freedesktop.systemd1.reload-daemon"> + <description gettext-domain="systemd">Reload the systemd state</description> + <message gettext-domain="systemd">Authentication is required to reload the systemd state.</message> + <defaults> + <allow_any>auth_admin</allow_any> + <allow_inactive>auth_admin</allow_inactive> + <allow_active>auth_admin_keep</allow_active> + </defaults> + </action> + +</policyconfig> diff --git a/src/core/org.freedesktop.systemd1.service b/src/core/org.freedesktop.systemd1.service new file mode 100644 index 0000000..082125f --- /dev/null +++ b/src/core/org.freedesktop.systemd1.service @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[D-BUS Service] +Name=org.freedesktop.systemd1 +Exec=/bin/false +User=root diff --git a/src/core/path.c b/src/core/path.c new file mode 100644 index 0000000..ca3a91d --- /dev/null +++ b/src/core/path.c @@ -0,0 +1,846 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <sys/epoll.h> +#include <sys/inotify.h> +#include <unistd.h> + +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-path.h" +#include "dbus-unit.h" +#include "escape.h" +#include "fd-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "macro.h" +#include "mkdir.h" +#include "path.h" +#include "path-util.h" +#include "serialize.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_PATH_STATE_MAX] = { + [PATH_DEAD] = UNIT_INACTIVE, + [PATH_WAITING] = UNIT_ACTIVE, + [PATH_RUNNING] = UNIT_ACTIVE, + [PATH_FAILED] = UNIT_FAILED, +}; + +static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); + +int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) { + static const int flags_table[_PATH_TYPE_MAX] = { + [PATH_EXISTS] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB, + [PATH_EXISTS_GLOB] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB, + [PATH_CHANGED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO, + [PATH_MODIFIED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO|IN_MODIFY, + [PATH_DIRECTORY_NOT_EMPTY] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO, + }; + + bool exists = false; + char *slash, *oldslash = NULL; + int r; + + assert(s); + assert(s->unit); + assert(handler); + + path_spec_unwatch(s); + + s->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (s->inotify_fd < 0) { + r = -errno; + goto fail; + } + + r = 
sd_event_add_io(s->unit->manager->event, &s->event_source, s->inotify_fd, EPOLLIN, handler, s); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(s->event_source, "path"); + + /* This function assumes the path was passed through path_simplify()! */ + assert(!strstr(s->path, "//")); + + for (slash = strchr(s->path, '/'); ; slash = strchr(slash+1, '/')) { + char *cut = NULL; + int flags; + char tmp; + + if (slash) { + cut = slash + (slash == s->path); + tmp = *cut; + *cut = '\0'; + + flags = IN_MOVE_SELF | IN_DELETE_SELF | IN_ATTRIB | IN_CREATE | IN_MOVED_TO; + } else + flags = flags_table[s->type]; + + r = inotify_add_watch(s->inotify_fd, s->path, flags); + if (r < 0) { + if (IN_SET(errno, EACCES, ENOENT)) { + if (cut) + *cut = tmp; + break; + } + + /* This second call to inotify_add_watch() should fail like the previous + * one and is done for logging the error in a comprehensive way. */ + r = inotify_add_watch_and_warn(s->inotify_fd, s->path, flags); + if (r < 0) { + if (cut) + *cut = tmp; + goto fail; + } + + /* Hmm, we succeeded in adding the watch this time... let's continue. */ + } + exists = true; + + /* Path exists, we don't need to watch parent too closely. */ + if (oldslash) { + char *cut2 = oldslash + (oldslash == s->path); + char tmp2 = *cut2; + *cut2 = '\0'; + + (void) inotify_add_watch(s->inotify_fd, s->path, IN_MOVE_SELF); + /* Error is ignored, the worst can happen is we get spurious events. 
*/ + + *cut2 = tmp2; + } + + if (cut) + *cut = tmp; + + if (slash) + oldslash = slash; + else { + /* whole path has been iterated over */ + s->primary_wd = r; + break; + } + } + + if (!exists) { + r = log_error_errno(errno, "Failed to add watch on any of the components of %s: %m", s->path); + /* either EACCES or ENOENT */ + goto fail; + } + + return 0; + +fail: + path_spec_unwatch(s); + return r; +} + +/* Stops watching: releases the event source and closes the inotify fd, storing the values returned by + * sd_event_source_unref() and safe_close() back into the spec. */ +void path_spec_unwatch(PathSpec *s) { + assert(s); + + s->event_source = sd_event_source_unref(s->event_source); + s->inotify_fd = safe_close(s->inotify_fd); +} + +/* Reads pending inotify events from s->inotify_fd. Returns 1 if the spec is of type PATH_CHANGED or + * PATH_MODIFIED and at least one event carries the primary watch descriptor, 0 otherwise (including when + * read() reports EAGAIN or EINTR), and the result of log_error_errno() if revents is not exactly EPOLLIN + * or read() fails for another reason. */ +int path_spec_fd_event(PathSpec *s, uint32_t revents) { + union inotify_event_buffer buffer; + struct inotify_event *e; + ssize_t l; + int r = 0; + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Got invalid poll event on inotify."); + + l = read(s->inotify_fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (IN_SET(errno, EAGAIN, EINTR)) + return 0; + + return log_error_errno(errno, "Failed to read inotify event: %m"); + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + if (IN_SET(s->type, PATH_CHANGED, PATH_MODIFIED) && + s->primary_wd == e->wd) + r = 1; + } + + return r; +} + +/* Evaluates the condition of one spec against the file system as it is right now. For PATH_CHANGED and + * PATH_MODIFIED the result is true only if the existence of the path differs from s->previous_exists and + * the check is neither the initial one nor caused by a trigger notification; s->previous_exists is + * refreshed in either case. */ +static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_notify) { + bool b, good = false; + + switch (s->type) { + + case PATH_EXISTS: + good = access(s->path, F_OK) >= 0; + break; + + case PATH_EXISTS_GLOB: + good = glob_exists(s->path) > 0; + break; + + case PATH_DIRECTORY_NOT_EMPTY: { + int k; + + k = dir_is_empty(s->path); + /* A path that is missing, or that exists but is not a directory (-ENOTDIR, presumably what + * dir_is_empty() reports in that case), cannot be a non-empty directory. */ + good = !(IN_SET(k, -ENOENT, -ENOTDIR) || k > 0); + break; + } + + case PATH_CHANGED: + case PATH_MODIFIED: + b = access(s->path, F_OK) >= 0; + good = !initial && !from_trigger_notify && b != s->previous_exists; + s->previous_exists = b; + break; + + default: + ; + } + + return good; +} + +static void path_spec_mkdir(PathSpec *s, mode_t mode) { + int r; + + if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB)) + return; + + r = mkdir_p_label(s->path, mode); + if (r < 0) + 
log_warning_errno(r, "mkdir(%s) failed: %m", s->path); +} + +/* Prints one line made of prefix, the type name of the spec, a colon and the path to f. */ static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) { + const char *type; + + assert_se(type = path_type_to_string(s->type)); + fprintf(f, "%s%s: %s\n", prefix, type, s->path); +} + +/* Frees the path string of the spec. The inotify fd must already be closed (asserted to be -1); the spec structure itself is not freed here. */ void path_spec_done(PathSpec *s) { + assert(s); + assert(s->inotify_fd == -1); + + free(s->path); +} + +/* Unit init hook: sets the directory mode to 0755 on a unit that is still in the UNIT_STUB load state. */ static void path_init(Unit *u) { + Path *p = PATH(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + p->directory_mode = 0755; +} + +/* Unwatches, unlinks, finalizes and frees the head of p->specs until the list is empty. */ void path_free_specs(Path *p) { + PathSpec *s; + + assert(p); + + while ((s = p->specs)) { + path_spec_unwatch(s); + LIST_REMOVE(spec, p->specs, s); + path_spec_done(s); + free(s); + } +} + +/* Unit done hook: releases all path specs of the unit. */ static void path_done(Unit *u) { + Path *p = PATH(u); + + assert(p); + + path_free_specs(p); +} + +/* Calls unit_require_mounts_for() with the path of every spec. Returns the first negative result, or 0 if all calls succeeded. */ static int path_add_mount_dependencies(Path *p) { + PathSpec *s; + int r; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + return 0; +} + +/* Sanity check for a unit in the UNIT_LOADED state: a path unit without any spec is logged and refused with -ENOEXEC; returns 0 otherwise. */ static int path_verify(Path *p) { + assert(p); + assert(UNIT(p)->load_state == UNIT_LOADED); + + if (!p->specs) { + log_unit_error(UNIT(p), "Path unit lacks path setting. 
Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int path_add_default_dependencies(Path *p) { + int r; + + assert(p); + + if (!UNIT(p)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(p), UNIT_BEFORE, SPECIAL_PATHS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(p)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(p), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(p), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int path_add_trigger_dependencies(Path *p) { + Unit *x; + int r; + + assert(p); + + if (!hashmap_isempty(UNIT(p)->dependencies[UNIT_TRIGGERS])) + return 0; + + r = unit_load_related_unit(UNIT(p), ".service", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(p), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int path_add_extras(Path *p) { + int r; + + r = path_add_trigger_dependencies(p); + if (r < 0) + return r; + + r = path_add_mount_dependencies(p); + if (r < 0) + return r; + + return path_add_default_dependencies(p); +} + +static int path_load(Unit *u) { + Path *p = PATH(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + r = path_add_extras(p); + if (r < 0) + return r; + + return path_verify(p); +} + +static void path_dump(Unit *u, FILE *f, const char *prefix) { + Path *p = PATH(u); + Unit *trigger; + PathSpec *s; + + assert(p); + assert(f); + + trigger = UNIT_TRIGGER(u); + + fprintf(f, + "%sPath State: %s\n" + "%sResult: %s\n" + "%sUnit: %s\n" + "%sMakeDirectory: %s\n" + "%sDirectoryMode: %04o\n", + prefix, path_state_to_string(p->state), + prefix, path_result_to_string(p->result), + prefix, trigger ? 
trigger->id : "n/a", + prefix, yes_no(p->make_directory), + prefix, p->directory_mode); + + LIST_FOREACH(spec, s, p->specs) + path_spec_dump(s, f, prefix); +} + +static void path_unwatch(Path *p) { + PathSpec *s; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) + path_spec_unwatch(s); +} + +static int path_watch(Path *p) { + int r; + PathSpec *s; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) { + r = path_spec_watch(s, path_dispatch_io); + if (r < 0) + return r; + } + + return 0; +} + +static void path_set_state(Path *p, PathState state) { + PathState old_state; + assert(p); + + if (p->state != state) + bus_unit_send_pending_change_signal(UNIT(p), false); + + old_state = p->state; + p->state = state; + + if (!IN_SET(state, PATH_WAITING, PATH_RUNNING)) + path_unwatch(p); + + if (state != old_state) + log_unit_debug(UNIT(p), "Changed %s -> %s", path_state_to_string(old_state), path_state_to_string(state)); + + unit_notify(UNIT(p), state_translation_table[old_state], state_translation_table[state], 0); +} + +static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify); + +static int path_coldplug(Unit *u) { + Path *p = PATH(u); + + assert(p); + assert(p->state == PATH_DEAD); + + if (p->deserialized_state != p->state) { + + if (IN_SET(p->deserialized_state, PATH_WAITING, PATH_RUNNING)) + path_enter_waiting(p, true, false); + else + path_set_state(p, p->deserialized_state); + } + + return 0; +} + +static void path_enter_dead(Path *p, PathResult f) { + assert(p); + + if (p->result == PATH_SUCCESS) + p->result = f; + + unit_log_result(UNIT(p), p->result == PATH_SUCCESS, path_result_to_string(p->result)); + path_set_state(p, p->result != PATH_SUCCESS ? 
PATH_FAILED : PATH_DEAD); +} + +/* Queues a JOB_START job in JOB_REPLACE mode for the triggered unit and, on success, switches to PATH_RUNNING and drops the watches. Returns without doing anything while a stop of this unit is pending. Enters the dead state with PATH_FAILURE_RESOURCES if there is no unit to trigger or the job cannot be queued. */ static void path_enter_running(Path *p) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + int r; + + assert(p); + + /* Don't start job if we are supposed to go down */ + if (unit_stop_pending(UNIT(p))) + return; + + trigger = UNIT_TRIGGER(UNIT(p)); + if (!trigger) { + log_unit_error(UNIT(p), "Unit to trigger vanished."); + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return; + } + + r = manager_add_job(UNIT(p)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto fail; + + path_set_state(p, PATH_RUNNING); + path_unwatch(p); + + return; + +fail: + log_unit_warning(UNIT(p), "Failed to queue unit startup job: %s", bus_error_message(&error, r)); + path_enter_dead(p, PATH_FAILURE_RESOURCES); +} + +/* Returns true as soon as one spec reports its condition as met; specs after that one are not evaluated in this call. Returns false if no spec does. */ static bool path_check_good(Path *p, bool initial, bool from_trigger_notify) { + PathSpec *s; + + assert(p); + + LIST_FOREACH(spec, s, p->specs) + if (path_spec_check_good(s, initial, from_trigger_notify)) + return true; + + return false; +} + +/* Settles the unit in PATH_RUNNING or PATH_WAITING, or triggers it. If the triggered unit is not inactive or failed, the path is marked running and unwatched. Otherwise the specs are checked, the watches are installed and the specs are checked once more before the state becomes PATH_WAITING. */ static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify) { + Unit *trigger; + int r; + + /* If the triggered unit is already running, so are we */ + trigger = UNIT_TRIGGER(UNIT(p)); + if (trigger && !UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger))) { + path_set_state(p, PATH_RUNNING); + path_unwatch(p); + return; + } + + if (path_check_good(p, initial, from_trigger_notify)) { + log_unit_debug(UNIT(p), "Got triggered."); + path_enter_running(p); + return; + } + + r = path_watch(p); + if (r < 0) + goto fail; + + /* Hmm, so now we have created inotify watches, but the file + * might have appeared/been removed by now, so we must + * recheck */ + + if (path_check_good(p, false, from_trigger_notify)) { + log_unit_debug(UNIT(p), "Got triggered."); + path_enter_running(p); + return; + } + + path_set_state(p, PATH_WAITING); + return; + +fail: + log_unit_warning_errno(UNIT(p), r, "Failed to enter waiting state: 
%m"); + path_enter_dead(p, PATH_FAILURE_RESOURCES); +} + +static void path_mkdir(Path *p) { + PathSpec *s; + + assert(p); + + if (!p->make_directory) + return; + + LIST_FOREACH(spec, s, p->specs) + path_spec_mkdir(s, p->directory_mode); +} + +static int path_start(Unit *u) { + Path *p = PATH(u); + int r; + + assert(p); + assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED)); + + r = unit_test_trigger_loaded(u); + if (r < 0) + return r; + + r = unit_test_start_limit(u); + if (r < 0) { + path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + path_mkdir(p); + + p->result = PATH_SUCCESS; + path_enter_waiting(p, true, false); + + return 1; +} + +static int path_stop(Unit *u) { + Path *p = PATH(u); + + assert(p); + assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING)); + + path_enter_dead(p, PATH_SUCCESS); + return 1; +} + +static int path_serialize(Unit *u, FILE *f, FDSet *fds) { + Path *p = PATH(u); + PathSpec *s; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", path_state_to_string(p->state)); + (void) serialize_item(f, "result", path_result_to_string(p->result)); + + LIST_FOREACH(spec, s, p->specs) { + const char *type; + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->path); + if (!escaped) + return log_oom(); + + assert_se(type = path_type_to_string(s->type)); + (void) serialize_item_format(f, "path-spec", "%s %i %s", + type, + s->previous_exists, + escaped); + } + + return 0; +} + +static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Path *p = PATH(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + PathState state; + + state = path_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + p->deserialized_state = state; + + } else if (streq(key, "result")) { + PathResult f; + + f = 
path_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != PATH_SUCCESS) + p->result = f; + + } else if (streq(key, "path-spec")) { + int previous_exists, skip = 0, r; + _cleanup_free_ char *type_str = NULL; + + if (sscanf(value, "%ms %i %n", &type_str, &previous_exists, &skip) < 2) + log_unit_debug(u, "Failed to parse path-spec value: %s", value); + else { + _cleanup_free_ char *unescaped = NULL; + PathType type; + PathSpec *s; + + type = path_type_from_string(type_str); + if (type < 0) { + log_unit_warning(u, "Unknown path type \"%s\", ignoring.", type_str); + return 0; + } + + r = cunescape(value+skip, 0, &unescaped); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to unescape serialize path: %m"); + return 0; + } + + LIST_FOREACH(spec, s, p->specs) + if (s->type == type && + path_equal(s->path, unescaped)) { + + s->previous_exists = previous_exists; + break; + } + } + + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState path_active_state(Unit *u) { + assert(u); + + return state_translation_table[PATH(u)->state]; +} + +_pure_ static const char *path_sub_state_to_string(Unit *u) { + assert(u); + + return path_state_to_string(PATH(u)->state); +} + +static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + PathSpec *s = userdata; + Path *p; + int changed; + + assert(s); + assert(s->unit); + assert(fd >= 0); + + p = PATH(s->unit); + + if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING)) + return 0; + + /* log_debug("inotify wakeup on %s.", UNIT(p)->id); */ + + LIST_FOREACH(spec, s, p->specs) + if (path_spec_owns_inotify_fd(s, fd)) + break; + + if (!s) { + log_error("Got event on unknown fd."); + goto fail; + } + + changed = path_spec_fd_event(s, revents); + if (changed < 0) + goto fail; + + if (changed) + path_enter_running(p); + else + path_enter_waiting(p, false, false); + + return 0; + 
+fail: + path_enter_dead(p, PATH_FAILURE_RESOURCES); + return 0; +} + +static void path_trigger_notify(Unit *u, Unit *other) { + Path *p = PATH(u); + + assert(u); + assert(other); + + /* Invoked whenever the unit we trigger changes state or gains or loses a job */ + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + + /* Don't propagate state changes from the triggered unit if we are already down */ + if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING)) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + path_enter_dead(p, PATH_FAILURE_UNIT_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + if (p->state == PATH_RUNNING && + UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + log_unit_debug(UNIT(p), "Got notified about unit deactivation."); + path_enter_waiting(p, false, true); + } else if (p->state == PATH_WAITING && + !UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + log_unit_debug(UNIT(p), "Got notified about unit activation."); + path_enter_waiting(p, false, true); + } +} + +static void path_reset_failed(Unit *u) { + Path *p = PATH(u); + + assert(p); + + if (p->state == PATH_FAILED) + path_set_state(p, PATH_DEAD); + + p->result = PATH_SUCCESS; +} + +static const char* const path_type_table[_PATH_TYPE_MAX] = { + [PATH_EXISTS] = "PathExists", + [PATH_EXISTS_GLOB] = "PathExistsGlob", + [PATH_DIRECTORY_NOT_EMPTY] = "DirectoryNotEmpty", + [PATH_CHANGED] = "PathChanged", + [PATH_MODIFIED] = "PathModified", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_type, PathType); + +static const char* const path_result_table[_PATH_RESULT_MAX] = { + [PATH_SUCCESS] = "success", + [PATH_FAILURE_RESOURCES] = "resources", + [PATH_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [PATH_FAILURE_UNIT_START_LIMIT_HIT] = "unit-start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_result, PathResult); + +const UnitVTable 
path_vtable = { + .object_size = sizeof(Path), + + .sections = + "Unit\0" + "Path\0" + "Install\0", + .private_section = "Path", + + .can_transient = true, + .can_fail = true, + .can_trigger = true, + + .init = path_init, + .done = path_done, + .load = path_load, + + .coldplug = path_coldplug, + + .dump = path_dump, + + .start = path_start, + .stop = path_stop, + + .serialize = path_serialize, + .deserialize_item = path_deserialize_item, + + .active_state = path_active_state, + .sub_state_to_string = path_sub_state_to_string, + + .trigger_notify = path_trigger_notify, + + .reset_failed = path_reset_failed, + + .bus_set_property = bus_path_set_property, +}; diff --git a/src/core/path.h b/src/core/path.h new file mode 100644 index 0000000..fb33b12 --- /dev/null +++ b/src/core/path.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Path Path; +typedef struct PathSpec PathSpec; + +#include "unit.h" + +typedef enum PathType { + PATH_EXISTS, + PATH_EXISTS_GLOB, + PATH_DIRECTORY_NOT_EMPTY, + PATH_CHANGED, + PATH_MODIFIED, + _PATH_TYPE_MAX, + _PATH_TYPE_INVALID = -1 +} PathType; + +typedef struct PathSpec { + Unit *unit; + + char *path; + + sd_event_source *event_source; + + LIST_FIELDS(struct PathSpec, spec); + + PathType type; + int inotify_fd; + int primary_wd; + + bool previous_exists; +} PathSpec; + +int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler); +void path_spec_unwatch(PathSpec *s); +int path_spec_fd_event(PathSpec *s, uint32_t events); +void path_spec_done(PathSpec *s); + +static inline bool path_spec_owns_inotify_fd(PathSpec *s, int fd) { + return s->inotify_fd == fd; +} + +typedef enum PathResult { + PATH_SUCCESS, + PATH_FAILURE_RESOURCES, + PATH_FAILURE_START_LIMIT_HIT, + PATH_FAILURE_UNIT_START_LIMIT_HIT, + _PATH_RESULT_MAX, + _PATH_RESULT_INVALID = -1 +} PathResult; + +struct Path { + Unit meta; + + LIST_HEAD(PathSpec, specs); + + PathState state, deserialized_state; + + bool make_directory; + 
mode_t directory_mode; + + PathResult result; +}; + +void path_free_specs(Path *p); + +extern const UnitVTable path_vtable; + +const char* path_type_to_string(PathType i) _const_; +PathType path_type_from_string(const char *s) _pure_; + +const char* path_result_to_string(PathResult i) _const_; +PathResult path_result_from_string(const char *s) _pure_; + +DEFINE_CAST(PATH, Path); diff --git a/src/core/scope.c b/src/core/scope.c new file mode 100644 index 0000000..5448d44 --- /dev/null +++ b/src/core/scope.c @@ -0,0 +1,700 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "dbus-scope.h" +#include "dbus-unit.h" +#include "load-dropin.h" +#include "log.h" +#include "process-util.h" +#include "scope.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = { + [SCOPE_DEAD] = UNIT_INACTIVE, + [SCOPE_RUNNING] = UNIT_ACTIVE, + [SCOPE_ABANDONED] = UNIT_ACTIVE, + [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SCOPE_FAILED] = UNIT_FAILED +}; + +static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); + +static void scope_init(Unit *u) { + Scope *s = SCOPE(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->runtime_max_usec = USEC_INFINITY; + s->timeout_stop_usec = u->manager->default_timeout_stop_usec; + u->ignore_on_isolate = true; +} + +static void scope_done(Unit *u) { + Scope *s = SCOPE(u); + + assert(u); + + s->controller = mfree(s->controller); + s->controller_track = sd_bus_track_unref(s->controller_track); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); +} + +static int scope_arm_timer(Scope *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = 
sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + scope_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "scope-timer"); + + return 0; +} + +static void scope_set_state(Scope *s, ScopeState state) { + ScopeState old_state; + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + + if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) { + unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } + + if (state != old_state) + log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int scope_add_default_dependencies(Scope *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Make sure scopes are unloaded on shutdown */ + r = unit_add_two_dependencies_by_name( + UNIT(s), + UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_SHUTDOWN_TARGET, true, + UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int scope_verify(Scope *s) { + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + if (set_isempty(UNIT(s)->pids) && + !MANAGER_IS_RELOADING(UNIT(s)->manager) && + !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) { + log_unit_error(UNIT(s), "Scope has no PIDs. 
Refusing."); + return -ENOENT; + } + + return 0; +} + +static int scope_load_init_scope(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_INIT_SCOPE)) + return 0; + + u->transient = true; + u->perpetual = true; + + /* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we + * synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + /* Prettify things, if we can. */ + if (!u->description) + u->description = strdup("System and Service Manager"); + if (!u->documentation) + (void) strv_extend(&u->documentation, "man:systemd(1)"); + + return 1; +} + +static int scope_add_extras(Scope *s) { + int r; + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + return scope_add_default_dependencies(s); +} + +static int scope_load(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(u->load_state == UNIT_STUB); + + if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) + /* Refuse to load non-transient scope units, but allow them while reloading. 
*/ + return -ENOENT; + + r = scope_load_init_scope(u); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin(u, false); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + r = scope_add_extras(s); + if (r < 0) + return r; + + return scope_verify(s); +} + +static usec_t scope_coldplug_timeout(Scope *s) { + assert(s); + + switch (s->deserialized_state) { + + case SCOPE_RUNNING: + return usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec); + + case SCOPE_STOP_SIGKILL: + case SCOPE_STOP_SIGTERM: + return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec); + + default: + return USEC_INFINITY; + } +} + +static int scope_coldplug(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + assert(s->state == SCOPE_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + r = scope_arm_timer(s, scope_coldplug_timeout(s)); + if (r < 0) + return r; + + if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) { + if (u->pids) { + void *pidp; + + SET_FOREACH(pidp, u->pids) { + r = unit_watch_pid(u, PTR_TO_PID(pidp), false); + if (r < 0 && r != -EEXIST) + return r; + } + } else + (void) unit_enqueue_rewatch_pids(u); + } + + bus_scope_track_controller(s); + + scope_set_state(s, s->deserialized_state); + return 0; +} + +static void scope_dump(Unit *u, FILE *f, const char *prefix) { + Scope *s = SCOPE(u); + char buf_runtime[FORMAT_TIMESPAN_MAX]; + + assert(s); + assert(f); + + fprintf(f, + "%sScope State: %s\n" + "%sResult: %s\n" + "%sRuntimeMaxSec: %s\n", + prefix, scope_state_to_string(s->state), + prefix, scope_result_to_string(s->result), + prefix, format_timespan(buf_runtime, sizeof(buf_runtime), s->runtime_max_usec, USEC_PER_SEC)); + + cgroup_context_dump(UNIT(s), f, prefix); + kill_context_dump(&s->kill_context, f, prefix); +} + +static void scope_enter_dead(Scope *s, ScopeResult f) { + assert(s); + + if (s->result == SCOPE_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), 
s->result == SCOPE_SUCCESS, scope_result_to_string(s->result)); + scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD); +} + +static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) { + bool skip_signal = false; + int r; + + assert(s); + + if (s->result == SCOPE_SUCCESS) + s->result = f; + + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going + * directly into SIGTERM berserk mode */ + if (state == SCOPE_STOP_SIGTERM) + skip_signal = bus_scope_send_request_stop(s) > 0; + + if (skip_signal) + r = 1; /* wait */ + else { + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state != SCOPE_STOP_SIGTERM ? KILL_KILL : + s->was_abandoned ? 
KILL_TERMINATE_AND_LOG : + KILL_TERMINATE, + -1, -1, false); + if (r < 0) + goto fail; + } + + if (r > 0) { + r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec)); + if (r < 0) + goto fail; + + scope_set_state(s, state); + } else if (state == SCOPE_STOP_SIGTERM) + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS); + else + scope_enter_dead(s, SCOPE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); +} + +static int scope_start(Unit *u) { + Scope *s = SCOPE(u); + int r; + + assert(s); + + if (unit_has_name(u, SPECIAL_INIT_SCOPE)) + return -EPERM; + + if (s->state == SCOPE_FAILED) + return -EPERM; + + /* We can't fulfill this right now, please try again later */ + if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + return -EAGAIN; + + assert(s->state == SCOPE_DEAD); + + if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) + return -ENOENT; + + (void) bus_scope_track_controller(s); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + (void) unit_realize_cgroup(u); + (void) unit_reset_accounting(u); + + unit_export_state_files(u); + + r = unit_attach_pids_to_cgroup(u, u->pids, NULL); + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to add PIDs to scope's control group: %m"); + scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); + return r; + } + + s->result = SCOPE_SUCCESS; + + scope_set_state(s, SCOPE_RUNNING); + + /* Set the maximum runtime timeout. */ + scope_arm_timer(s, usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec)); + + /* On unified we use proper notifications hence we can unwatch the PIDs + * we just attached to the scope. This can also be done on legacy as + * we're going to update the list of the processes we watch with the + * PIDs currently in the scope anyway. 
*/ + unit_unwatch_all_pids(u); + + /* Start watching the PIDs currently in the scope (legacy hierarchy only) */ + (void) unit_enqueue_rewatch_pids(u); + return 1; +} + +static int scope_stop(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + return 0; + + assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)); + + scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS); + return 1; +} + +static void scope_reset_failed(Unit *u) { + Scope *s = SCOPE(u); + + assert(s); + + if (s->state == SCOPE_FAILED) + scope_set_state(s, SCOPE_DEAD); + + s->result = SCOPE_SUCCESS; +} + +static int scope_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, -1, error); +} + +static int scope_get_timeout(Unit *u, usec_t *timeout) { + Scope *s = SCOPE(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static int scope_serialize(Unit *u, FILE *f, FDSet *fds) { + Scope *s = SCOPE(u); + void *pidp; + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", scope_state_to_string(s->state)); + (void) serialize_bool(f, "was-abandoned", s->was_abandoned); + + if (s->controller) + (void) serialize_item(f, "controller", s->controller); + + SET_FOREACH(pidp, u->pids) + serialize_item_format(f, "pids", PID_FMT, PTR_TO_PID(pidp)); + + return 0; +} + +static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Scope *s = SCOPE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + ScopeState state; + + state = scope_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + + } else if (streq(key, 
"was-abandoned")) { + int k; + + k = parse_boolean(value); + if (k < 0) + log_unit_debug(u, "Failed to parse boolean value: %s", value); + else + s->was_abandoned = k; + } else if (streq(key, "controller")) { + + r = free_and_strdup(&s->controller, value); + if (r < 0) + return log_oom(); + + } else if (streq(key, "pids")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse pids value: %s", value); + else { + r = set_ensure_allocated(&u->pids, NULL); + if (r < 0) + return r; + + r = set_put(u->pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void scope_notify_cgroup_empty_event(Unit *u) { + Scope *s = SCOPE(u); + assert(u); + + log_unit_debug(u, "cgroup is empty"); + + if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + scope_enter_dead(s, SCOPE_SUCCESS); + + /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean + * up the cgroup earlier and should do it now. */ + if (IN_SET(s->state, SCOPE_DEAD, SCOPE_FAILED)) + unit_prune_cgroup(u); +} + +static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { + assert(u); + + /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to + * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original + * process we watched was probably the parent of them, and they are hence now our children. */ + + (void) unit_enqueue_rewatch_pids(u); +} + +static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Scope *s = SCOPE(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SCOPE_RUNNING: + log_unit_warning(UNIT(s), "Scope reached runtime time limit. 
Stopping."); + scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_TIMEOUT); + break; + + case SCOPE_STOP_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + } + + break; + + case SCOPE_STOP_SIGKILL: + log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring."); + scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +int scope_abandon(Scope *s) { + assert(s); + + if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) + return -EPERM; + + if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)) + return -ESTALE; + + s->was_abandoned = true; + + s->controller = mfree(s->controller); + s->controller_track = sd_bus_track_unref(s->controller_track); + + scope_set_state(s, SCOPE_ABANDONED); + + /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that + * the remaining processes will be sooner or later reassigned to us as parent. */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + return 0; +} + +_pure_ static UnitActiveState scope_active_state(Unit *u) { + assert(u); + + return state_translation_table[SCOPE(u)->state]; +} + +_pure_ static const char *scope_sub_state_to_string(Unit *u) { + assert(u); + + return scope_state_to_string(SCOPE(u)->state); +} + +static void scope_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + /* Let's unconditionally add the "init.scope" special unit + * that encapsulates PID 1. Note that PID 1 already is in the + * cgroup for this, we hence just need to allocate the object + * for it and that's it. 
*/ + + u = manager_get_unit(m, SPECIAL_INIT_SCOPE); + if (!u) { + r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u); + if (r < 0) { + log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m"); + return; + } + } + + u->transient = true; + u->perpetual = true; + SCOPE(u)->deserialized_state = SCOPE_RUNNING; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); +} + +static const char* const scope_result_table[_SCOPE_RESULT_MAX] = { + [SCOPE_SUCCESS] = "success", + [SCOPE_FAILURE_RESOURCES] = "resources", + [SCOPE_FAILURE_TIMEOUT] = "timeout", +}; + +DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult); + +const UnitVTable scope_vtable = { + .object_size = sizeof(Scope), + .cgroup_context_offset = offsetof(Scope, cgroup_context), + .kill_context_offset = offsetof(Scope, kill_context), + + .sections = + "Unit\0" + "Scope\0" + "Install\0", + .private_section = "Scope", + + .can_transient = true, + .can_delegate = true, + .can_fail = true, + .once_only = true, + .can_set_managed_oom = true, + + .init = scope_init, + .load = scope_load, + .done = scope_done, + + .coldplug = scope_coldplug, + + .dump = scope_dump, + + .start = scope_start, + .stop = scope_stop, + + .kill = scope_kill, + + .freeze = unit_freeze_vtable_common, + .thaw = unit_thaw_vtable_common, + + .get_timeout = scope_get_timeout, + + .serialize = scope_serialize, + .deserialize_item = scope_deserialize_item, + + .active_state = scope_active_state, + .sub_state_to_string = scope_sub_state_to_string, + + .sigchld_event = scope_sigchld_event, + + .reset_failed = scope_reset_failed, + + .notify_cgroup_empty = scope_notify_cgroup_empty_event, + + .bus_set_property = bus_scope_set_property, + .bus_commit_properties = bus_scope_commit_properties, + + .enumerate_perpetual = scope_enumerate_perpetual, +}; diff --git a/src/core/scope.h b/src/core/scope.h new file mode 100644 index 0000000..5f791b7 --- /dev/null +++ b/src/core/scope.h @@ -0,0 +1,45 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Scope Scope; + +#include "cgroup.h" +#include "kill.h" +#include "unit.h" + +typedef enum ScopeResult { + SCOPE_SUCCESS, + SCOPE_FAILURE_RESOURCES, + SCOPE_FAILURE_TIMEOUT, + _SCOPE_RESULT_MAX, + _SCOPE_RESULT_INVALID = -1 +} ScopeResult; + +struct Scope { + Unit meta; + + CGroupContext cgroup_context; + KillContext kill_context; + + ScopeState state, deserialized_state; + ScopeResult result; + + usec_t runtime_max_usec; + usec_t timeout_stop_usec; + + char *controller; + sd_bus_track *controller_track; + + bool was_abandoned; + + sd_event_source *timer_event_source; +}; + +extern const UnitVTable scope_vtable; + +int scope_abandon(Scope *s); + +const char* scope_result_to_string(ScopeResult i) _const_; +ScopeResult scope_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SCOPE, Scope); diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c new file mode 100644 index 0000000..18f6fb5 --- /dev/null +++ b/src/core/selinux-access.c @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "selinux-access.h" + +#if HAVE_SELINUX + +#include <errno.h> +#include <selinux/avc.h> +#include <selinux/selinux.h> +#if HAVE_AUDIT +#include <libaudit.h> +#endif + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "audit-fd.h" +#include "bus-util.h" +#include "errno-util.h" +#include "format-util.h" +#include "log.h" +#include "path-util.h" +#include "selinux-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "util.h" + +static bool initialized = false; + +struct audit_info { + sd_bus_creds *creds; + const char *path; + const char *cmdline; +}; + +/* + Any time an access gets denied this callback will be called + with the audit data. We then need to just copy the audit data into the msgbuf. 
+*/ +static int audit_callback( + void *auditdata, + security_class_t cls, + char *msgbuf, + size_t msgbufsize) { + + const struct audit_info *audit = auditdata; + uid_t uid = 0, login_uid = 0; + gid_t gid = 0; + char login_uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a"; + char uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a"; + char gid_buf[DECIMAL_STR_MAX(gid_t) + 1] = "n/a"; + + if (sd_bus_creds_get_audit_login_uid(audit->creds, &login_uid) >= 0) + xsprintf(login_uid_buf, UID_FMT, login_uid); + if (sd_bus_creds_get_euid(audit->creds, &uid) >= 0) + xsprintf(uid_buf, UID_FMT, uid); + if (sd_bus_creds_get_egid(audit->creds, &gid) >= 0) + xsprintf(gid_buf, GID_FMT, gid); + + snprintf(msgbuf, msgbufsize, + "auid=%s uid=%s gid=%s%s%s%s%s%s%s", + login_uid_buf, uid_buf, gid_buf, + audit->path ? " path=\"" : "", strempty(audit->path), audit->path ? "\"" : "", + audit->cmdline ? " cmdline=\"" : "", strempty(audit->cmdline), audit->cmdline ? "\"" : ""); + + return 0; +} + +static int callback_type_to_priority(int type) { + switch(type) { + + case SELINUX_ERROR: + return LOG_ERR; + + case SELINUX_WARNING: + return LOG_WARNING; + + case SELINUX_INFO: + return LOG_INFO; + + case SELINUX_AVC: + default: + return LOG_NOTICE; + } +} + +/* + libselinux uses this callback when access gets denied or other + events happen. If audit is turned on, messages will be reported + using audit netlink, otherwise they will be logged using the usual + channels. + + Code copied from dbus and modified. +*/ +_printf_(2, 3) static int log_callback(int type, const char *fmt, ...) 
{ + va_list ap; + const char *fmt2; + +#if HAVE_AUDIT + int fd; + + fd = get_audit_fd(); + + if (fd >= 0) { + _cleanup_free_ char *buf = NULL; + int r; + + va_start(ap, fmt); + r = vasprintf(&buf, fmt, ap); + va_end(ap); + + if (r >= 0) { + if (type == SELINUX_AVC) + audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_AVC, buf, NULL, NULL, NULL, 0); + else if (type == SELINUX_ERROR) + audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_SELINUX_ERR, buf, NULL, NULL, NULL, 0); + + return 0; + } + } +#endif + + fmt2 = strjoina("selinux: ", fmt); + + va_start(ap, fmt); + + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_AUTH | callback_type_to_priority(type), + 0, PROJECT_FILE, __LINE__, __FUNCTION__, + fmt2, ap); + REENABLE_WARNING; + va_end(ap); + + return 0; +} + +static int access_init(sd_bus_error *error) { + + if (!mac_selinux_use()) + return 0; + + if (initialized) + return 1; + + if (avc_open(NULL, 0) != 0) { + int saved_errno = errno; + bool enforce; + + enforce = security_getenforce() != 0; + log_full_errno(enforce ? LOG_ERR : LOG_WARNING, saved_errno, "Failed to open the SELinux AVC: %m"); + + /* If enforcement isn't on, then let's suppress this + * error, and just don't do any AVC checks. The + * warning we printed is hence all the admin will + * see. */ + if (!enforce) + return 0; + + /* Return an access denied error, if we couldn't load + * the AVC but enforcing mode was on, or we couldn't + * determine whether it is on. */ + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to open the SELinux AVC: %s", strerror_safe(saved_errno)); + } + + selinux_set_callback(SELINUX_CB_AUDIT, (union selinux_callback) audit_callback); + selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) log_callback); + + initialized = true; + return 1; +} + +/* + This function communicates with the kernel to check whether or not it should + allow the access. + If the machine is in permissive mode it will return ok. 
Audit messages will + still be generated if the access would be denied in enforcing mode. +*/ +int mac_selinux_generic_access_check( + sd_bus_message *message, + const char *path, + const char *permission, + sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *tclass, *scon; + _cleanup_free_ char *cl = NULL; + _cleanup_freecon_ char *fcon = NULL; + char **cmdline = NULL; + bool enforce; + int r = 0; + + assert(message); + assert(permission); + assert(error); + + r = access_init(error); + if (r <= 0) + return r; + + /* delay call until we checked in `access_init()` if SELinux is actually enabled */ + enforce = mac_selinux_enforcing(); + + r = sd_bus_query_sender_creds( + message, + SD_BUS_CREDS_PID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID| + SD_BUS_CREDS_CMDLINE|SD_BUS_CREDS_AUDIT_LOGIN_UID| + SD_BUS_CREDS_SELINUX_CONTEXT| + SD_BUS_CREDS_AUGMENT /* get more bits from /proc */, + &creds); + if (r < 0) + return r; + + /* The SELinux context is something we really should have + * gotten directly from the message or sender, and not be an + * augmented field. If it was augmented we cannot use it for + * authorization, since this is racy and vulnerable. Let's add + * an extra check, just in case, even though this really + * shouldn't be possible. */ + assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_SELINUX_CONTEXT) == 0, -EPERM); + + r = sd_bus_creds_get_selinux_context(creds, &scon); + if (r < 0) + return r; + + if (path) { + /* Get the file context of the unit file */ + + if (getfilecon_raw(path, &fcon) < 0) { + r = -errno; + + log_warning_errno(r, "SELinux getfilecon_raw() on '%s' failed%s (perm=%s): %m", + path, + enforce ? 
"" : ", ignoring", + permission); + if (!enforce) + return 0; + + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get file context on %s.", path); + } + + tclass = "service"; + + } else { + if (getcon_raw(&fcon) < 0) { + r = -errno; + + log_warning_errno(r, "SELinux getcon_raw() failed%s (perm=%s): %m", + enforce ? "" : ", ignoring", + permission); + if (!enforce) + return 0; + + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get current context."); + } + + tclass = "system"; + } + + sd_bus_creds_get_cmdline(creds, &cmdline); + cl = strv_join(cmdline, " "); + + struct audit_info audit_info = { + .creds = creds, + .path = path, + .cmdline = cl, + }; + + r = selinux_check_access(scon, fcon, tclass, permission, &audit_info); + if (r < 0) { + r = errno_or_else(EPERM); + + if (enforce) + sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "SELinux policy denies access."); + } + + log_debug_errno(r, "SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s path=%s cmdline=%s: %m", + scon, fcon, tclass, permission, enforce ? "enforcing" : "permissive", path, cl); + return enforce ? 
r : 0; +} + +#else /* HAVE_SELINUX */ + +int mac_selinux_generic_access_check( + sd_bus_message *message, + const char *path, + const char *permission, + sd_bus_error *error) { + + return 0; +} + +#endif /* HAVE_SELINUX */ diff --git a/src/core/selinux-access.h b/src/core/selinux-access.h new file mode 100644 index 0000000..c6bfb32 --- /dev/null +++ b/src/core/selinux-access.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "manager.h" + +int mac_selinux_generic_access_check(sd_bus_message *message, const char *path, const char *permission, sd_bus_error *error); + +#define mac_selinux_access_check(message, permission, error) \ + mac_selinux_generic_access_check((message), NULL, (permission), (error)) + +#define mac_selinux_unit_access_check(unit, message, permission, error) \ + mac_selinux_generic_access_check((message), unit_label_path(unit), (permission), (error)) diff --git a/src/core/selinux-setup.c b/src/core/selinux-setup.c new file mode 100644 index 0000000..1ac05b8 --- /dev/null +++ b/src/core/selinux-setup.c @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdio.h> +#include <unistd.h> + +#if HAVE_SELINUX +#include <selinux/selinux.h> +#endif + +#include "log.h" +#include "macro.h" +#include "selinux-setup.h" +#include "selinux-util.h" +#include "string-util.h" +#include "time-util.h" +#include "util.h" + +#if HAVE_SELINUX +_printf_(2,3) +static int null_log(int type, const char *fmt, ...) 
{ + return 0; +} +#endif + +int mac_selinux_setup(bool *loaded_policy) { + +#if HAVE_SELINUX + int enforce = 0; + usec_t before_load, after_load; + char *con; + int r; + static const union selinux_callback cb = { + .func_log = null_log, + }; + + bool initialized = false; + + assert(loaded_policy); + + /* Turn off all of SELinux' own logging, we want to do that */ + selinux_set_callback(SELINUX_CB_LOG, cb); + + /* Don't load policy in the initrd if we don't appear to have + * it. For the real root, we check below if we've already + * loaded policy, and return gracefully. + */ + if (in_initrd() && access(selinux_path(), F_OK) < 0) + return 0; + + /* Already initialized by somebody else? */ + r = getcon_raw(&con); + /* getcon_raw can return 0, and still give us a NULL pointer if + * /proc/self/attr/current is empty. SELinux guarantees this won't + * happen, but that file isn't specific to SELinux, and may be provided + * by some other arbitrary LSM with different semantics. */ + if (r == 0 && con) { + initialized = !streq(con, "kernel"); + freecon(con); + } + + /* Make sure we have no fds open while loading the policy and + * transitioning */ + log_close(); + + /* Now load the policy */ + before_load = now(CLOCK_MONOTONIC); + r = selinux_init_load_policy(&enforce); + if (r == 0) { + _cleanup_(mac_selinux_freep) char *label = NULL; + char timespan[FORMAT_TIMESPAN_MAX]; + + mac_selinux_retest(); + + /* Transition to the new context */ + r = mac_selinux_get_create_label_from_exe(SYSTEMD_BINARY_PATH, &label); + if (r < 0 || !label) { + log_open(); + log_error("Failed to compute init label, ignoring."); + } else { + r = setcon_raw(label); + + log_open(); + if (r < 0) + log_error("Failed to transition into init label '%s', ignoring.", label); + } + + after_load = now(CLOCK_MONOTONIC); + + log_info("Successfully loaded SELinux policy in %s.", + format_timespan(timespan, sizeof(timespan), after_load - before_load, 0)); + + *loaded_policy = true; + + } else { + log_open(); + + 
if (enforce > 0) { + if (!initialized) + return log_emergency_errno(SYNTHETIC_ERRNO(EIO), + "Failed to load SELinux policy."); + + log_warning("Failed to load new SELinux policy. Continuing with old policy."); + } else + log_debug("Unable to load SELinux policy. Ignoring."); + } +#endif + + return 0; +} diff --git a/src/core/selinux-setup.h b/src/core/selinux-setup.h new file mode 100644 index 0000000..cdff51d --- /dev/null +++ b/src/core/selinux-setup.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +int mac_selinux_setup(bool *loaded_policy); diff --git a/src/core/service.c b/src/core/service.c new file mode 100644 index 0000000..d7bdeb7 --- /dev/null +++ b/src/core/service.c @@ -0,0 +1,4612 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-util.h" +#include "dbus-service.h" +#include "dbus-unit.h" +#include "def.h" +#include "env-util.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "manager.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "service.h" +#include "signal-util.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" +#include "utf8.h" +#include "util.h" + +static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = { + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_CONDITION] = UNIT_ACTIVATING, + [SERVICE_START_PRE] = UNIT_ACTIVATING, + [SERVICE_START] = UNIT_ACTIVATING, + [SERVICE_START_POST] = UNIT_ACTIVATING, 
+ [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_CLEANING] = UNIT_MAINTENANCE, +}; + +/* For Type=idle we never want to delay any other jobs, hence we + * consider idle jobs active as soon as we start working on them */ +static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = { + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_CONDITION] = UNIT_ACTIVE, + [SERVICE_START_PRE] = UNIT_ACTIVE, + [SERVICE_START] = UNIT_ACTIVE, + [SERVICE_START_POST] = UNIT_ACTIVE, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_CLEANING] = UNIT_MAINTENANCE, +}; + +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata); +static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata); + +static void 
service_enter_signal(Service *s, ServiceState state, ServiceResult f); +static void service_enter_reload_by_notify(Service *s); + +static void service_init(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->timeout_start_usec = u->manager->default_timeout_start_usec; + s->timeout_stop_usec = u->manager->default_timeout_stop_usec; + s->timeout_abort_usec = u->manager->default_timeout_abort_usec; + s->timeout_abort_set = u->manager->default_timeout_abort_set; + s->restart_usec = u->manager->default_restart_usec; + s->runtime_max_usec = USEC_INFINITY; + s->type = _SERVICE_TYPE_INVALID; + s->socket_fd = -1; + s->stdin_fd = s->stdout_fd = s->stderr_fd = -1; + s->guess_main_pid = true; + + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + s->exec_context.keyring_mode = MANAGER_IS_SYSTEM(u->manager) ? + EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT; + + s->watchdog_original_usec = USEC_INFINITY; + + s->oom_policy = _OOM_POLICY_INVALID; +} + +static void service_unwatch_control_pid(Service *s) { + assert(s); + + if (s->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->control_pid); + s->control_pid = 0; +} + +static void service_unwatch_main_pid(Service *s) { + assert(s); + + if (s->main_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->main_pid); + s->main_pid = 0; +} + +static void service_unwatch_pid_file(Service *s) { + if (!s->pid_file_pathspec) + return; + + log_unit_debug(UNIT(s), "Stopping watch for PID file %s", s->pid_file_pathspec->path); + path_spec_unwatch(s->pid_file_pathspec); + path_spec_done(s->pid_file_pathspec); + s->pid_file_pathspec = mfree(s->pid_file_pathspec); +} + +static int service_set_main_pid(Service *s, pid_t pid) { + assert(s); + + if (pid <= 1) + return -EINVAL; + + if (pid == getpid_cached()) + return -EINVAL; + + if (s->main_pid == pid && s->main_pid_known) + return 0; + + if (s->main_pid != pid) { + service_unwatch_main_pid(s); + exec_status_start(&s->main_exec_status, 
pid);
+        }
+        /* From here on the PID counts as known; it is flagged alien when pid_is_my_child() reports 0. */
+        s->main_pid = pid;
+        s->main_pid_known = true;
+        s->main_pid_alien = pid_is_my_child(pid) == 0;
+
+        if (s->main_pid_alien)
+                log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", pid);
+
+        return 0;
+}
+/* Closes the connection socket through asynchronous_close() and, if an accept socket unit is
+ * referenced, drops one connection reference on it and clears the unit reference. */
+void service_close_socket_fd(Service *s) {
+        assert(s);
+
+        /* Undo the effect of service_set_socket_fd(). */
+
+        s->socket_fd = asynchronous_close(s->socket_fd);
+
+        if (UNIT_ISSET(s->accept_socket)) {
+                socket_connection_unref(SOCKET(UNIT_DEREF(s->accept_socket)));
+                unit_ref_unset(&s->accept_socket);
+        }
+}
+/* Releases the watchdog event source and clears the watchdog timestamp. */
+static void service_stop_watchdog(Service *s) {
+        assert(s);
+
+        s->watchdog_event_source = sd_event_source_unref(s->watchdog_event_source);
+        s->watchdog_timestamp = DUAL_TIMESTAMP_NULL;
+}
+/* Arms the watchdog timer to fire watchdog_usec after s->watchdog_timestamp. A timeout of 0 or
+ * USEC_INFINITY stops the watchdog instead. An existing event source is re-timed and re-enabled
+ * as one-shot, otherwise a new CLOCK_MONOTONIC source is created. Failures are only logged. */
+static void service_start_watchdog(Service *s) {
+        usec_t watchdog_usec;
+        int r;
+
+        assert(s);
+
+        watchdog_usec = service_get_watchdog_usec(s);
+        if (IN_SET(watchdog_usec, 0, USEC_INFINITY)) {
+                service_stop_watchdog(s);
+                return;
+        }
+
+        if (s->watchdog_event_source) {
+                r = sd_event_source_set_time(s->watchdog_event_source, usec_add(s->watchdog_timestamp.monotonic, watchdog_usec));
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to reset watchdog timer: %m");
+                        return;
+                }
+
+                r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ONESHOT);
+        } else {
+                r = sd_event_add_time(
+                                UNIT(s)->manager->event,
+                                &s->watchdog_event_source,
+                                CLOCK_MONOTONIC,
+                                usec_add(s->watchdog_timestamp.monotonic, watchdog_usec), 0,
+                                service_dispatch_watchdog, s);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to add watchdog timer: %m");
+                        return;
+                }
+
+                (void) sd_event_source_set_description(s->watchdog_event_source, "service-watchdog");
+
+                /* Let's process everything else which might be a sign
+                 * of living before we consider a service died.
*/
+                r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE);
+        }
+        if (r < 0)
+                log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m");
+}
+
+/* Pushes the expiry of the timer event source 'source' out to the absolute time 'extended',
+ * unless the source already expires later than that. A NULL source is ignored. Failures are
+ * logged as warnings, naming the event source that was actually passed in, and are otherwise
+ * ignored. */
+static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) {
+        usec_t current;
+        int r;
+
+        assert(s);
+
+        /* Extends the specified event source timer to at least the specified time, unless it is already later
+         * anyway. */
+
+        if (!source)
+                return;
+
+        r = sd_event_source_get_time(source, &current);
+        if (r < 0) {
+                /* Stays NULL if the source has no description, so that strna() prints "n/a". */
+                const char *desc = NULL;
+                (void) sd_event_source_get_description(source, &desc);
+                log_unit_warning_errno(UNIT(s), r, "Failed to retrieve timeout time for event source '%s', ignoring: %m", strna(desc));
+                return;
+        }
+
+        if (current >= extended) /* Current timeout is already longer, ignore this. */
+                return;
+
+        r = sd_event_source_set_time(source, extended);
+        if (r < 0) {
+                const char *desc = NULL;
+                (void) sd_event_source_get_description(source, &desc);
+                log_unit_warning_errno(UNIT(s), r, "Failed to set timeout time for event source '%s', ignoring: %m", strna(desc));
+        }
+}
+
+/* Extends both the state timer and the watchdog timer so that neither fires earlier than
+ * extend_timeout_usec from now. Values of 0 and USEC_INFINITY are ignored. */
+static void service_extend_timeout(Service *s, usec_t extend_timeout_usec) {
+        usec_t extended;
+
+        assert(s);
+
+        if (IN_SET(extend_timeout_usec, 0, USEC_INFINITY))
+                return;
+
+        extended = usec_add(now(CLOCK_MONOTONIC), extend_timeout_usec);
+
+        service_extend_event_source_timeout(s, s->timer_event_source, extended);
+        service_extend_event_source_timeout(s, s->watchdog_event_source, extended);
+}
+
+/* Stamps the watchdog timestamp with the current time and re-arms the watchdog timer. */
+static void service_reset_watchdog(Service *s) {
+        assert(s);
+
+        dual_timestamp_get(&s->watchdog_timestamp);
+        service_start_watchdog(s);
+}
+
+/* Enables the watchdog timeout override with the given value and restarts the watchdog. */
+static void service_override_watchdog_timeout(Service *s, usec_t watchdog_override_usec) {
+        assert(s);
+
+        s->watchdog_override_enable = true;
+        s->watchdog_override_usec = watchdog_override_usec;
+        service_reset_watchdog(s);
+
+        log_unit_debug(UNIT(s), "watchdog_usec="USEC_FMT,
s->watchdog_usec);
+        log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec);
+}
+/* Destroys one fd store entry: unlinks it from its service (if it has one), releases the event
+ * source, then frees the name, closes the fd and frees the entry. NULL is accepted. */
+static void service_fd_store_unlink(ServiceFDStore *fs) {
+
+        if (!fs)
+                return;
+
+        if (fs->service) {
+                assert(fs->service->n_fd_store > 0);
+                LIST_REMOVE(fd_store, fs->service->fd_store, fs);
+                fs->service->n_fd_store--;
+        }
+
+        sd_event_source_disable_unref(fs->event_source);
+
+        free(fs->fdname);
+        safe_close(fs->fd);
+        free(fs);
+}
+/* Drops every entry of the fd store, unless n_keep_fd_store is positive. */
+static void service_release_fd_store(Service *s) {
+        assert(s);
+
+        if (s->n_keep_fd_store > 0)
+                return;
+
+        log_unit_debug(UNIT(s), "Releasing all stored fds");
+        while (s->fd_store)
+                service_fd_store_unlink(s->fd_store);
+
+        assert(s->n_fd_store == 0);
+}
+/* Closes the stdin/stdout/stderr fds held for the service and releases the fd store. Returns
+ * early, without logging, when neither stored fds nor stdio fds are held. */
+static void service_release_resources(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        if (!s->fd_store && s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0)
+                return;
+
+        log_unit_debug(u, "Releasing resources.");
+
+        s->stdin_fd = safe_close(s->stdin_fd);
+        s->stdout_fd = safe_close(s->stdout_fd);
+        s->stderr_fd = safe_close(s->stderr_fd);
+
+        service_release_fd_store(s);
+}
+/* Releases what the Service object owns: strings, exec runtime and command lists, dynamic
+ * credentials, exit status sets, PID and bus name watches, sockets, timers and stored fds. */
+static void service_done(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        s->pid_file = mfree(s->pid_file);
+        s->status_text = mfree(s->status_text);
+
+        s->exec_runtime = exec_runtime_unref(s->exec_runtime, false);
+        exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+        s->control_command = NULL;
+        s->main_command = NULL;
+
+        dynamic_creds_unref(&s->dynamic_creds);
+
+        exit_status_set_free(&s->restart_prevent_status);
+        exit_status_set_free(&s->restart_force_status);
+        exit_status_set_free(&s->success_status);
+
+        /* This will leak a process, but at least no memory or any of
+         * our resources */
+        service_unwatch_main_pid(s);
+        service_unwatch_control_pid(s);
+        service_unwatch_pid_file(s);
+
+        if (s->bus_name) {
+                unit_unwatch_bus_name(u, s->bus_name);
+                s->bus_name = mfree(s->bus_name);
+        }
+
+        s->bus_name_owner = mfree(s->bus_name_owner);
+
+
s->usb_function_descriptors = mfree(s->usb_function_descriptors);
+        s->usb_function_strings = mfree(s->usb_function_strings);
+
+        service_close_socket_fd(s);
+        s->peer = socket_peer_unref(s->peer);
+
+        unit_ref_unset(&s->accept_socket);
+
+        service_stop_watchdog(s);
+
+        s->timer_event_source = sd_event_source_unref(s->timer_event_source);
+        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
+
+        service_release_resources(u);
+}
+/* IO event handler for a stored fd (userdata is the ServiceFDStore entry): logs which condition
+ * was seen and removes the entry from the fd store. Always returns 0. */
+static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+        ServiceFDStore *fs = userdata;
+
+        assert(e);
+        assert(fs);
+
+        /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */
+        log_unit_debug(UNIT(fs->service),
+                       "Received %s on stored fd %d (%s), closing.",
+                       revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP",
+                       fs->fd, strna(fs->fdname));
+        service_fd_store_unlink(fs);
+        return 0;
+}
+/* Adds fd to the fd store of the service under the given name. Returns 1 if the fd was newly
+ * stored, 0 if the same fd was already in the store (the duplicate is closed), -EXFULL if the
+ * store already holds n_fd_store_max entries, or another negative errno on failure. */
+static int service_add_fd_store(Service *s, int fd, const char *name, bool do_poll) {
+        ServiceFDStore *fs;
+        int r;
+
+        /* fd is always consumed if we return >= 0 */
+
+        assert(s);
+        assert(fd >= 0);
+
+        if (s->n_fd_store >= s->n_fd_store_max)
+                return -EXFULL; /* Our store is full.
+                                 * Use this errno rather than E[NM]FILE to distinguish from
+                                 * the case where systemd itself hits the file limit.
*/
+
+        LIST_FOREACH(fd_store, fs, s->fd_store) {
+                r = same_fd(fs->fd, fd);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        safe_close(fd);
+                        return 0; /* fd already included */
+                }
+        }
+
+        fs = new(ServiceFDStore, 1);
+        if (!fs)
+                return -ENOMEM;
+
+        *fs = (ServiceFDStore) {
+                .fd = fd,
+                .service = s,
+                .do_poll = do_poll,
+                .fdname = strdup(name ?: "stored"),
+        };
+
+        if (!fs->fdname) {
+                free(fs);
+                return -ENOMEM;
+        }
+
+        if (do_poll) {
+                r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fd, 0, on_fd_store_io, fs);
+                if (r < 0 && r != -EPERM) { /* EPERM indicates fds that aren't pollable, which is OK */
+                        free(fs->fdname);
+                        free(fs);
+                        return r;
+                } else if (r >= 0)
+                        (void) sd_event_source_set_description(fs->event_source, "service-fd-store");
+        }
+
+        LIST_PREPEND(fd_store, s->fd_store, fs);
+        s->n_fd_store++;
+
+        return 1; /* fd newly stored */
+}
+
+/* Moves every fd out of the set 'fds' into the fd store of the service, all under the same name.
+ * Stops at the first failure: -EXFULL (store full) is logged as a warning, any other error as an
+ * error, and the negative errno is returned. Returns 0 once the set has been drained. */
+static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll) {
+        int r;
+
+        assert(s);
+
+        while (fdset_size(fds) > 0) {
+                _cleanup_close_ int fd = -1;
+
+                fd = fdset_steal_first(fds);
+                if (fd < 0)
+                        break;
+
+                r = service_add_fd_store(s, fd, name, do_poll);
+                if (r == -EXFULL)
+                        return log_unit_warning_errno(UNIT(s), r,
+                                                      "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.",
+                                                      s->n_fd_store_max);
+                if (r < 0)
+                        return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m");
+                if (r > 0)
+                        /* fd is a signed int, print it with %i like the other fd store messages do */
+                        log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fd, strna(name));
+                fd = -1; /* consumed by service_add_fd_store(), keep the cleanup handler from closing it */
+        }
+
+        return 0;
+}
+
+/* Removes (and thereby closes) every stored fd whose name equals 'name'. */
+static void service_remove_fd_store(Service *s, const char *name) {
+        ServiceFDStore *fs, *n;
+
+        assert(s);
+        assert(name);
+
+        LIST_FOREACH_SAFE(fd_store, fs, n, s->fd_store) {
+                if (!streq(fs->fdname, name))
+                        continue;
+
+                log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", fs->fd, name);
+                service_fd_store_unlink(fs);
+        }
+}
+
+/* Arms the per-service state timer to fire at the absolute CLOCK_MONOTONIC time 'usec'. */
+static int service_arm_timer(Service *s, usec_t usec) {
+        int r;
+
+        assert(s);
+
+        if
(s->timer_event_source) {
+                r = sd_event_source_set_time(s->timer_event_source, usec);
+                if (r < 0)
+                        return r;
+
+                return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
+        }
+        /* No timer source exists yet; an infinite timeout does not need one. */
+        if (usec == USEC_INFINITY)
+                return 0;
+
+        r = sd_event_add_time(
+                        UNIT(s)->manager->event,
+                        &s->timer_event_source,
+                        CLOCK_MONOTONIC,
+                        usec, 0,
+                        service_dispatch_timer, s);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(s->timer_event_source, "service-timer");
+
+        return 0;
+}
+/* Sanity-checks the configuration of a loaded service. Returns -ENOEXEC, after logging the
+ * reason, for setting combinations that are refused, and 0 otherwise. The USB function and
+ * RuntimeMaxSec= checks at the end only log warnings. */
+static int service_verify(Service *s) {
+        assert(s);
+        assert(UNIT(s)->load_state == UNIT_LOADED);
+
+        if (!s->exec_command[SERVICE_EXEC_START] && !s->exec_command[SERVICE_EXEC_STOP]
+            && UNIT(s)->success_action == EMERGENCY_ACTION_NONE) {
+                /* FailureAction= only makes sense if one of the start or stop commands is specified.
+                 * SuccessAction= will be executed unconditionally if no commands are specified. Hence,
+                 * either a command or SuccessAction= are required. */
+
+                log_unit_error(UNIT(s), "Service has no ExecStart=, ExecStop=, or SuccessAction=. Refusing.");
+                return -ENOEXEC;
+        }
+
+        if (s->type != SERVICE_ONESHOT && !s->exec_command[SERVICE_EXEC_START]) {
+                log_unit_error(UNIT(s), "Service has no ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+                return -ENOEXEC;
+        }
+
+        if (!s->remain_after_exit && !s->exec_command[SERVICE_EXEC_START] && UNIT(s)->success_action == EMERGENCY_ACTION_NONE) {
+                log_unit_error(UNIT(s), "Service has no ExecStart= and no SuccessAction= settings and does not have RemainAfterExit=yes set. Refusing.");
+                return -ENOEXEC;
+        }
+        /* The check two blocks up already returned for non-oneshot services without ExecStart=, hence the dereference below is safe. */
+        if (s->type != SERVICE_ONESHOT && s->exec_command[SERVICE_EXEC_START]->command_next) {
+                log_unit_error(UNIT(s), "Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+                return -ENOEXEC;
+        }
+
+        if (s->type == SERVICE_ONESHOT
+            && !IN_SET(s->restart, SERVICE_RESTART_NO, SERVICE_RESTART_ON_FAILURE, SERVICE_RESTART_ON_ABNORMAL, SERVICE_RESTART_ON_WATCHDOG, SERVICE_RESTART_ON_ABORT)) {
+                log_unit_error(UNIT(s), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing.");
+                return -ENOEXEC;
+        }
+
+        if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status)) {
+                log_unit_error(UNIT(s), "Service has RestartForceStatus= set, which isn't allowed for Type=oneshot services. Refusing.");
+                return -ENOEXEC;
+        }
+
+        if (s->type == SERVICE_DBUS && !s->bus_name) {
+                log_unit_error(UNIT(s), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
+                return -ENOEXEC;
+        }
+
+        if (s->exec_context.pam_name && !IN_SET(s->kill_context.kill_mode, KILL_CONTROL_GROUP, KILL_MIXED)) {
+                log_unit_error(UNIT(s), "Service has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing.");
+                return -ENOEXEC;
+        }
+        /* The remaining checks only warn and do not fail the load. */
+        if (s->usb_function_descriptors && !s->usb_function_strings)
+                log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
+
+        if (!s->usb_function_descriptors && s->usb_function_strings)
+                log_unit_warning(UNIT(s), "Service has USBFunctionStrings= setting, but no USBFunctionDescriptors=. Ignoring.");
+
+        if (s->runtime_max_usec != USEC_INFINITY && s->type == SERVICE_ONESHOT)
+                log_unit_warning(UNIT(s), "RuntimeMaxSec= has no effect in combination with Type=oneshot. Ignoring.");
+
+        return 0;
+}
+/* Adds the default dependencies of a service; does nothing when default_dependencies is off. */
+static int service_add_default_dependencies(Service *s) {
+        int r;
+
+        assert(s);
+
+        if (!UNIT(s)->default_dependencies)
+                return 0;
+
+        /* Add a number of automatic dependencies useful for the
+         * majority of services.
*/
+
+        if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) {
+                /* First, pull in the really early boot stuff, and
+                 * require it, so that we fail if we can't acquire
+                 * it. */
+
+                r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+                if (r < 0)
+                        return r;
+        } else {
+
+                /* In the --user instance there's no sysinit.target,
+                 * in that case require basic.target instead. */
+
+                r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Second, if the rest of the base system is in the same
+         * transaction, order us after it, but do not pull it in or
+         * even require it. */
+        r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        /* Third, add us in for normal shutdown. */
+        return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+/* Adjusts the stdio settings of the exec context that are still at EXEC_INPUT_NULL or
+ * EXEC_OUTPUT_INHERIT: stdin switches to EXEC_INPUT_DATA when stdin data is configured, and
+ * inherited outputs are replaced by the manager defaults unless stdin is one of the inputs
+ * listed in the IN_SET() check below. */
+static void service_fix_stdio(Service *s) {
+        assert(s);
+
+        /* Note that EXEC_INPUT_NULL and EXEC_OUTPUT_INHERIT play a special role here: they are both the
+         * default value that is subject to automatic overriding triggered by other settings and an explicit
+         * choice the user can make. We don't distinguish between these cases currently. */
+
+        if (s->exec_context.std_input == EXEC_INPUT_NULL &&
+            s->exec_context.stdin_data_size > 0)
+                s->exec_context.std_input = EXEC_INPUT_DATA;
+
+        if (IN_SET(s->exec_context.std_input,
+                    EXEC_INPUT_TTY,
+                    EXEC_INPUT_TTY_FORCE,
+                    EXEC_INPUT_TTY_FAIL,
+                    EXEC_INPUT_SOCKET,
+                    EXEC_INPUT_NAMED_FD))
+                return;
+
+        /* We assume these listed inputs refer to bidirectional streams, and hence duplicating them from
+         * stdin to stdout/stderr makes sense and hence leaving EXEC_OUTPUT_INHERIT in place makes sense,
+         * too.
Outputs such as regular files or sealed data memfds otoh don't really make sense to be
+         * duplicated for both input and output at the same time (since they then would cause a feedback
+         * loop), hence override EXEC_OUTPUT_INHERIT with the default stderr/stdout setting. */
+
+        if (s->exec_context.std_error == EXEC_OUTPUT_INHERIT &&
+            s->exec_context.std_output == EXEC_OUTPUT_INHERIT)
+                s->exec_context.std_error = UNIT(s)->manager->default_std_error;
+
+        if (s->exec_context.std_output == EXEC_OUTPUT_INHERIT)
+                s->exec_context.std_output = UNIT(s)->manager->default_std_output;
+}
+/* For services of type SERVICE_DBUS: adds UNIT_REQUIRES and UNIT_AFTER dependencies on
+ * SPECIAL_DBUS_SOCKET and starts watching the configured bus name. All other service types
+ * return 0 right away. Errors are logged and returned as negative errno. */
+static int service_setup_bus_name(Service *s) {
+        int r;
+
+        assert(s);
+
+        if (s->type != SERVICE_DBUS)
+                return 0;
+
+        r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+
+        /* We always want to be ordered against dbus.socket if both are in the transaction.
*/
+        r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+
+        r = unit_watch_bus_name(UNIT(s), s->bus_name);
+        if (r == -EEXIST)
+                return log_unit_error_errno(UNIT(s), r, "Two services allocated for the same bus name %s, refusing operation.", s->bus_name);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Cannot watch bus name %s: %m", s->bus_name);
+
+        return 0;
+}
+/* Fills in settings the unit file left unset (service type, start timeout of oneshot services,
+ * stdio, notify access, OOM policy), patches the contexts and adds the implicit dependencies.
+ * Returns 0 on success or the negative error of the first helper that fails. */
+static int service_add_extras(Service *s) {
+        int r;
+
+        assert(s);
+
+        if (s->type == _SERVICE_TYPE_INVALID) {
+                /* Figure out a type automatically */
+                if (s->bus_name)
+                        s->type = SERVICE_DBUS;
+                else if (s->exec_command[SERVICE_EXEC_START])
+                        s->type = SERVICE_SIMPLE;
+                else
+                        s->type = SERVICE_ONESHOT;
+        }
+
+        /* Oneshot services have disabled start timeout by default */
+        if (s->type == SERVICE_ONESHOT && !s->start_timeout_defined)
+                s->timeout_start_usec = USEC_INFINITY;
+
+        service_fix_stdio(s);
+
+        r = unit_patch_contexts(UNIT(s));
+        if (r < 0)
+                return r;
+
+        r = unit_add_exec_dependencies(UNIT(s), &s->exec_context);
+        if (r < 0)
+                return r;
+
+        r = unit_set_default_slice(UNIT(s));
+        if (r < 0)
+                return r;
+
+        /* If the service needs the notify socket, let's enable it automatically. */
+        if (s->notify_access == NOTIFY_NONE &&
+            (s->type == SERVICE_NOTIFY || s->watchdog_usec > 0 || s->n_fd_store_max > 0))
+                s->notify_access = NOTIFY_MAIN;
+
+        /* If no OOM policy was explicitly set, then default to the configured default OOM policy. Except when
+         * delegation is on, in that case we assume the payload knows better what to do and can process
+         * things in a more focused way. */
+        if (s->oom_policy < 0)
+                s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->default_oom_policy;
+
+        /* Let the kernel do the killing if that's requested.
*/
+        s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
+        r = service_add_default_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = service_setup_bus_name(s);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+/* Loads the unit from its fragment and drop-ins. If the unit ends up in UNIT_LOADED state the
+ * extras are added and the configuration is verified; any other load state returns 0. */
+static int service_load(Unit *u) {
+        Service *s = SERVICE(u);
+        int r;
+
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        /* This is a new unit? Then let's add in some extras */
+        r = service_add_extras(s);
+        if (r < 0)
+                return r;
+
+        return service_verify(s);
+}
+/* Writes the state and configuration of the service to f as text, one setting per line, each
+ * line starting with prefix (which is passed through strempty() first). Command lists are
+ * dumped with the prefix extended by a tab. */
+static void service_dump(Unit *u, FILE *f, const char *prefix) {
+        char buf_restart[FORMAT_TIMESPAN_MAX], buf_start[FORMAT_TIMESPAN_MAX], buf_stop[FORMAT_TIMESPAN_MAX],
+             buf_runtime[FORMAT_TIMESPAN_MAX], buf_watchdog[FORMAT_TIMESPAN_MAX], buf_abort[FORMAT_TIMESPAN_MAX];
+        ServiceExecCommand c;
+        Service *s = SERVICE(u);
+        const char *prefix2;
+
+        assert(s);
+
+        prefix = strempty(prefix);
+        prefix2 = strjoina(prefix, "\t");
+
+        fprintf(f,
+                "%sService State: %s\n"
+                "%sResult: %s\n"
+                "%sReload Result: %s\n"
+                "%sClean Result: %s\n"
+                "%sPermissionsStartOnly: %s\n"
+                "%sRootDirectoryStartOnly: %s\n"
+                "%sRemainAfterExit: %s\n"
+                "%sGuessMainPID: %s\n"
+                "%sType: %s\n"
+                "%sRestart: %s\n"
+                "%sNotifyAccess: %s\n"
+                "%sNotifyState: %s\n"
+                "%sOOMPolicy: %s\n",
+                prefix, service_state_to_string(s->state),
+                prefix, service_result_to_string(s->result),
+                prefix, service_result_to_string(s->reload_result),
+                prefix, service_result_to_string(s->clean_result),
+                prefix, yes_no(s->permissions_start_only),
+                prefix, yes_no(s->root_directory_start_only),
+                prefix, yes_no(s->remain_after_exit),
+                prefix, yes_no(s->guess_main_pid),
+                prefix, service_type_to_string(s->type),
+                prefix, service_restart_to_string(s->restart),
+                prefix, notify_access_to_string(s->notify_access),
+                prefix, notify_state_to_string(s->notify_state),
+                prefix, oom_policy_to_string(s->oom_policy));
+        /* The PID, PID file, bus name and accept socket sections are only printed when set. */
+        if (s->control_pid > 0)
+                fprintf(f,
+                        "%sControl PID: "PID_FMT"\n",
+                        prefix, s->control_pid);
+
+        if (s->main_pid > 0)
+                fprintf(f,
+                        "%sMain PID: "PID_FMT"\n"
+                        "%sMain PID Known: %s\n"
+                        "%sMain PID Alien: %s\n",
+                        prefix, s->main_pid,
+                        prefix, yes_no(s->main_pid_known),
+                        prefix, yes_no(s->main_pid_alien));
+
+        if (s->pid_file)
+                fprintf(f,
+                        "%sPIDFile: %s\n",
+                        prefix, s->pid_file);
+
+        if (s->bus_name)
+                fprintf(f,
+                        "%sBusName: %s\n"
+                        "%sBus Name Good: %s\n",
+                        prefix, s->bus_name,
+                        prefix, yes_no(s->bus_name_good));
+
+        if (UNIT_ISSET(s->accept_socket))
+                fprintf(f,
+                        "%sAccept Socket: %s\n",
+                        prefix, UNIT_DEREF(s->accept_socket)->id);
+        /* Each format_timespan() call gets its own buffer since all results are used by one fprintf(). */
+        fprintf(f,
+                "%sRestartSec: %s\n"
+                "%sTimeoutStartSec: %s\n"
+                "%sTimeoutStopSec: %s\n"
+                "%sTimeoutStartFailureMode: %s\n"
+                "%sTimeoutStopFailureMode: %s\n",
+                prefix, format_timespan(buf_restart, sizeof(buf_restart), s->restart_usec, USEC_PER_SEC),
+                prefix, format_timespan(buf_start, sizeof(buf_start), s->timeout_start_usec, USEC_PER_SEC),
+                prefix, format_timespan(buf_stop, sizeof(buf_stop), s->timeout_stop_usec, USEC_PER_SEC),
+                prefix, service_timeout_failure_mode_to_string(s->timeout_start_failure_mode),
+                prefix, service_timeout_failure_mode_to_string(s->timeout_stop_failure_mode));
+
+        if (s->timeout_abort_set)
+                fprintf(f,
+                        "%sTimeoutAbortSec: %s\n",
+                        prefix, format_timespan(buf_abort, sizeof(buf_abort), s->timeout_abort_usec, USEC_PER_SEC));
+
+        fprintf(f,
+                "%sRuntimeMaxSec: %s\n"
+                "%sWatchdogSec: %s\n",
+                prefix, format_timespan(buf_runtime, sizeof(buf_runtime), s->runtime_max_usec, USEC_PER_SEC),
+                prefix, format_timespan(buf_watchdog, sizeof(buf_watchdog), s->watchdog_usec, USEC_PER_SEC));
+
+        kill_context_dump(&s->kill_context, f, prefix);
+        exec_context_dump(&s->exec_context, f, prefix);
+        /* One section per command slot that has commands configured. */
+        for (c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) {
+
+                if (!s->exec_command[c])
+                        continue;
+
+                fprintf(f, "%s-> %s:\n",
+                        prefix, service_exec_command_to_string(c));
+
+                exec_command_dump_list(s->exec_command[c], f, prefix2);
+        }
+
+        if (s->status_text)
+
fprintf(f, "%sStatus Text: %s\n",
+                        prefix, s->status_text);
+
+        if (s->n_fd_store_max > 0)
+                fprintf(f,
+                        "%sFile Descriptor Store Max: %u\n"
+                        "%sFile Descriptor Store Current: %zu\n",
+                        prefix, s->n_fd_store_max,
+                        prefix, s->n_fd_store);
+
+        cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+/* Decides whether pid may become the main PID of the service. Rejections are logged at level
+ * prio. Returns -EPERM for the manager itself or the control process, -ESRCH if the process is
+ * not alive, 1 if the PID is owned by this unit, and 0 if it belongs elsewhere. */
+static int service_is_suitable_main_pid(Service *s, pid_t pid, int prio) {
+        Unit *owner;
+
+        assert(s);
+        assert(pid_is_valid(pid));
+
+        /* Checks whether the specified PID is suitable as main PID for this service. returns negative if not, 0 if the
+         * PID is questionable but should be accepted if the source of configuration is trusted. > 0 if the PID is
+         * good */
+
+        if (pid == getpid_cached() || pid == 1) {
+                log_unit_full(UNIT(s), prio, "New main PID "PID_FMT" is the manager, refusing.", pid);
+                return -EPERM;
+        }
+
+        if (pid == s->control_pid) {
+                log_unit_full(UNIT(s), prio, "New main PID "PID_FMT" is the control process, refusing.", pid);
+                return -EPERM;
+        }
+
+        if (!pid_is_alive(pid)) {
+                log_unit_full(UNIT(s), prio, "New main PID "PID_FMT" does not exist or is a zombie.", pid);
+                return -ESRCH;
+        }
+
+        owner = manager_get_unit_by_pid(UNIT(s)->manager, pid);
+        if (owner == UNIT(s)) {
+                log_unit_debug(UNIT(s), "New main PID "PID_FMT" belongs to service, we are happy.", pid);
+                return 1; /* Yay, it's definitely a good PID */
+        }
+
+        return 0; /* Hmm it's a suspicious PID, let's accept it if configuration source is trusted */
+}
+
+/* Reads the main PID from the configured PID file and starts watching it.
+ *
+ * Returns 1 if a new main PID was loaded, 0 if the file names the main PID we already know,
+ * -ENOENT if no PID file is configured, and another negative errno if the file cannot be opened,
+ * read or parsed, or if the PID in it is refused. A PID outside of the service is only accepted
+ * if the PID file was reached through a safe symlink chain and is owned by root. Open and parse
+ * failures are logged at LOG_INFO if may_warn is true, at LOG_DEBUG otherwise. */
+static int service_load_pid_file(Service *s, bool may_warn) {
+        char procfs[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
+        bool questionable_pid_file = false;
+        _cleanup_free_ char *k = NULL;
+        _cleanup_close_ int fd = -1;
+        int r, prio;
+        pid_t pid;
+
+        assert(s);
+
+        if (!s->pid_file)
+                return -ENOENT;
+
+        prio = may_warn ? LOG_INFO : LOG_DEBUG;
+
+        r = chase_symlinks(s->pid_file, NULL, CHASE_SAFE, NULL, &fd);
+        if (r == -ENOLINK) {
+                log_unit_debug_errno(UNIT(s), r,
+                                     "Potentially unsafe symlink chain, will now retry with relaxed checks: %s", s->pid_file);
+
+                questionable_pid_file = true;
+
+                r = chase_symlinks(s->pid_file, NULL, 0, NULL, &fd);
+        }
+        if (r < 0)
+                /* The error code is in r; fd is still -1 here and must not be used as errno. */
+                return log_unit_full_errno(UNIT(s), prio, r,
+                                           "Can't open PID file %s (yet?) after %s: %m", s->pid_file, service_state_to_string(s->state));
+
+        /* Let's read the PID file now that we chased it down. But we need to convert the O_PATH fd
+         * chase_symlinks() returned us into a proper fd first. */
+        xsprintf(procfs, "/proc/self/fd/%i", fd);
+        r = read_one_line_file(procfs, &k);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r,
+                                            "Can't convert PID files %s O_PATH file descriptor to proper file descriptor: %m",
+                                            s->pid_file);
+
+        r = parse_pid(k, &pid);
+        if (r < 0)
+                return log_unit_full_errno(UNIT(s), prio, r, "Failed to parse PID from file %s: %m", s->pid_file);
+
+        if (s->main_pid_known && pid == s->main_pid)
+                return 0;
+
+        r = service_is_suitable_main_pid(s, pid, prio);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                struct stat st;
+
+                if (questionable_pid_file) {
+                        log_unit_error(UNIT(s), "Refusing to accept PID outside of service control group, acquired through unsafe symlink chain: %s", s->pid_file);
+                        return -EPERM;
+                }
+
+                /* Hmm, it's not clear if the new main PID is safe. Let's allow this if the PID file is owned by root */
+
+                if (fstat(fd, &st) < 0)
+                        return log_unit_error_errno(UNIT(s), errno, "Failed to fstat() PID file O_PATH fd: %m");
+
+                if (st.st_uid != 0) {
+                        log_unit_error(UNIT(s), "New main PID "PID_FMT" does not belong to service, and PID file is not owned by root. Refusing.", pid);
+                        return -EPERM;
+                }
+
+                log_unit_debug(UNIT(s), "New main PID "PID_FMT" does not belong to service, but we'll accept it since PID file is owned by root.", pid);
+        }
+
+        if (s->main_pid_known) {
+                log_unit_debug(UNIT(s), "Main PID changing: "PID_FMT" -> "PID_FMT, s->main_pid, pid);
+
+                service_unwatch_main_pid(s);
+                s->main_pid_known = false;
+        } else
+                log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pid);
+
+        r = service_set_main_pid(s, pid);
+        if (r < 0)
+                return r;
+
+        r = unit_watch_pid(UNIT(s), pid, false);
+        if (r < 0) /* FIXME: we need to do something here */
+                return log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" for service: %m", pid);
+
+        return 1;
+}
+
+/* Guesses the main PID through unit_search_main_pid() when none is known yet and guessing is
+ * enabled, then records and watches it. Failures leave the main PID unset or are only logged. */
+static void service_search_main_pid(Service *s) {
+        pid_t pid = 0;
+        int r;
+
+        assert(s);
+
+        /* If we know it anyway, don't ever fall back to unreliable
+         * heuristics */
+        if (s->main_pid_known)
+                return;
+
+        if (!s->guess_main_pid)
+                return;
+
+        assert(s->main_pid <= 0);
+
+        if (unit_search_main_pid(UNIT(s), &pid) < 0)
+                return;
+
+        log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid);
+        if (service_set_main_pid(s, pid) < 0)
+                return;
+
+        r = unit_watch_pid(UNIT(s), pid, false);
+        if (r < 0)
+                /* FIXME: we need to do something here */
+                log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" from: %m", pid);
+}
+
+/* Switches the service to a new state, releases what the new state no longer needs and
+ * notifies the unit layer about the transition. */
+static void service_set_state(Service *s, ServiceState state) {
+        ServiceState old_state;
+        const UnitActiveState *table;
+
+        assert(s);
+
+        if (s->state != state)
+                bus_unit_send_pending_change_signal(UNIT(s), false);
+
+        table = s->type == SERVICE_IDLE ?
state_translation_table_idle : state_translation_table;
+
+        old_state = s->state;
+        s->state = state;
+
+        service_unwatch_pid_file(s);
+        /* The state timer is kept only in the states listed here (every state but dead, exited and failed). */
+        if (!IN_SET(state,
+                    SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING,
+                    SERVICE_RELOAD,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                    SERVICE_AUTO_RESTART,
+                    SERVICE_CLEANING))
+                s->timer_event_source = sd_event_source_unref(s->timer_event_source);
+        /* Outside of the listed states the main process is no longer watched. */
+        if (!IN_SET(state,
+                    SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING, SERVICE_RELOAD,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
+                service_unwatch_main_pid(s);
+                s->main_command = NULL;
+        }
+        /* Outside of the listed states the control process is no longer watched. */
+        if (!IN_SET(state,
+                    SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RELOAD,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                    SERVICE_CLEANING)) {
+                service_unwatch_control_pid(s);
+                s->control_command = NULL;
+                s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+        }
+
+        if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) {
+                unit_unwatch_all_pids(UNIT(s));
+                unit_dequeue_rewatch_pids(UNIT(s));
+        }
+        /* The connection socket is closed outside of the listed states, except for SERVICE_DEAD while a job is still attached to the unit. */
+        if (!IN_SET(state,
+                    SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING, SERVICE_RELOAD,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
+            !(state == SERVICE_DEAD && UNIT(s)->job))
+                service_close_socket_fd(s);
+
+        if (state != SERVICE_START)
+                s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
+
+        if (!IN_SET(state,
SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+                service_stop_watchdog(s);
+
+        /* For the inactive states unit_notify() will trim the cgroup,
+         * but for exit we have to do that ourselves... */
+        if (state == SERVICE_EXITED && !MANAGER_IS_RELOADING(UNIT(s)->manager))
+                unit_prune_cgroup(UNIT(s));
+
+        if (old_state != state)
+                log_unit_debug(UNIT(s), "Changed %s -> %s", service_state_to_string(old_state), service_state_to_string(state));
+        /* Report the transition in unit terms, using the translation table selected at the top of the function. */
+        unit_notify(UNIT(s), table[old_state], table[state],
+                    (s->reload_result == SERVICE_SUCCESS ? 0 : UNIT_NOTIFY_RELOAD_FAILURE) |
+                    (s->will_auto_restart ? UNIT_NOTIFY_WILL_AUTO_RESTART : 0) |
+                    (s->result == SERVICE_SKIP_CONDITION ? UNIT_NOTIFY_SKIP_CONDITION : 0));
+}
+/* Computes the absolute monotonic time at which the state timer has to fire for the
+ * deserialized state: a reference timestamp of the unit plus the timeout that applies to that
+ * state. States without a timeout yield USEC_INFINITY. */
+static usec_t service_coldplug_timeout(Service *s) {
+        assert(s);
+
+        switch (s->deserialized_state) {
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+        case SERVICE_RELOAD:
+                return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_start_usec);
+
+        case SERVICE_RUNNING:
+                return usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec);
+
+        case SERVICE_STOP:
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+                return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
+
+        case SERVICE_STOP_WATCHDOG:
+        case SERVICE_FINAL_WATCHDOG:
+                return usec_add(UNIT(s)->state_change_timestamp.monotonic, service_timeout_abort_usec(s));
+
+        case SERVICE_AUTO_RESTART:
+                return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, s->restart_usec);
+
+        case SERVICE_CLEANING:
+                return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->exec_context.timeout_clean_usec);
+
+        default:
+                return USEC_INFINITY;
+        }
+}
+/* Brings a service that is still in SERVICE_DEAD into its deserialized state: re-arms the
+ * state timer, re-watches the main and control PIDs, restarts the watchdog where applicable
+ * and finally enters the deserialized state. */
+static int service_coldplug(Unit *u) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(s);
+        assert(s->state == SERVICE_DEAD);
+
+        if (s->deserialized_state ==
s->state)
+                return 0;
+
+        r = service_arm_timer(s, service_coldplug_timeout(s));
+        if (r < 0)
+                return r;
+        /* Watch the main PID again if it is still unwaited and the deserialized state is one of the listed ones. */
+        if (s->main_pid > 0 &&
+            pid_is_unwaited(s->main_pid) &&
+            (IN_SET(s->deserialized_state,
+                    SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING, SERVICE_RELOAD,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
+                r = unit_watch_pid(UNIT(s), s->main_pid, false);
+                if (r < 0)
+                        return r;
+        }
+        /* Same for the control PID, with its own list of states. */
+        if (s->control_pid > 0 &&
+            pid_is_unwaited(s->control_pid) &&
+            IN_SET(s->deserialized_state,
+                   SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                   SERVICE_RELOAD,
+                   SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                   SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                   SERVICE_CLEANING)) {
+                r = unit_watch_pid(UNIT(s), s->control_pid, false);
+                if (r < 0)
+                        return r;
+        }
+        /* Best effort, the results of these three calls are deliberately ignored. */
+        if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART, SERVICE_CLEANING)) {
+                (void) unit_enqueue_rewatch_pids(u);
+                (void) unit_setup_dynamic_creds(u);
+                (void) unit_setup_exec_runtime(u);
+        }
+
+        if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+                service_start_watchdog(s);
+
+        if (UNIT_ISSET(s->accept_socket)) {
+                Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket));
+
+                if (socket->max_connections_per_source > 0) {
+                        SocketPeer *peer;
+
+                        /* Make a best-effort attempt at bumping the connection count */
+                        if (socket_acquire_peer(socket, s->socket_fd, &peer) > 0) {
+                                socket_peer_unref(s->peer);
+                                s->peer = peer;
+                        }
+                }
+        }
+
+        service_set_state(s, s->deserialized_state);
+        return 0;
+}
+/* Collects the fds to pass to the service: first the socket fds (the per-connection socket, or
+ * else the fds of all triggering socket units), then the fds of the fd store. On success the
+ * fd array, a matching NULL-terminated name list and both counts are returned through the
+ * output parameters and 0 is returned; on failure a negative errno is returned. */
+static int service_collect_fds(
+                Service *s,
+                int **fds,
+                char ***fd_names,
+                size_t *n_socket_fds,
+                size_t *n_storage_fds) {
+
+        _cleanup_strv_free_ char **rfd_names = NULL;
+        _cleanup_free_ int *rfds = NULL;
+
size_t rn_socket_fds = 0, rn_storage_fds = 0; + int r; + + assert(s); + assert(fds); + assert(fd_names); + assert(n_socket_fds); + assert(n_storage_fds); + + if (s->socket_fd >= 0) { + + /* Pass the per-connection socket */ + + rfds = new(int, 1); + if (!rfds) + return -ENOMEM; + rfds[0] = s->socket_fd; + + rfd_names = strv_new("connection"); + if (!rfd_names) + return -ENOMEM; + + rn_socket_fds = 1; + } else { + void *v; + Unit *u; + + /* Pass all our configured sockets for singleton services */ + + HASHMAP_FOREACH_KEY(v, u, UNIT(s)->dependencies[UNIT_TRIGGERED_BY]) { + _cleanup_free_ int *cfds = NULL; + Socket *sock; + int cn_fds; + + if (u->type != UNIT_SOCKET) + continue; + + sock = SOCKET(u); + + cn_fds = socket_collect_fds(sock, &cfds); + if (cn_fds < 0) + return cn_fds; + + if (cn_fds <= 0) + continue; + + if (!rfds) { + rfds = TAKE_PTR(cfds); + rn_socket_fds = cn_fds; + } else { + int *t; + + t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int)); + if (!t) + return -ENOMEM; + + memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int)); + + rfds = t; + rn_socket_fds += cn_fds; + } + + r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds); + if (r < 0) + return r; + } + } + + if (s->n_fd_store > 0) { + ServiceFDStore *fs; + size_t n_fds; + char **nl; + int *t; + + t = reallocarray(rfds, rn_socket_fds + s->n_fd_store, sizeof(int)); + if (!t) + return -ENOMEM; + + rfds = t; + + nl = reallocarray(rfd_names, rn_socket_fds + s->n_fd_store + 1, sizeof(char *)); + if (!nl) + return -ENOMEM; + + rfd_names = nl; + n_fds = rn_socket_fds; + + LIST_FOREACH(fd_store, fs, s->fd_store) { + rfds[n_fds] = fs->fd; + rfd_names[n_fds] = strdup(strempty(fs->fdname)); + if (!rfd_names[n_fds]) + return -ENOMEM; + + rn_storage_fds++; + n_fds++; + } + + rfd_names[n_fds] = NULL; + } + + *fds = TAKE_PTR(rfds); + *fd_names = TAKE_PTR(rfd_names); + *n_socket_fds = rn_socket_fds; + *n_storage_fds = rn_storage_fds; + + return 0; +} + +static int 
service_allocate_exec_fd_event_source( + Service *s, + int fd, + sd_event_source **ret_event_source) { + + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + assert(fd >= 0); + assert(ret_event_source); + + r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m"); + + /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */ + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m"); + + (void) sd_event_source_set_description(source, "service event_fd"); + + r = sd_event_source_set_io_fd_own(source, true); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m"); + + *ret_event_source = TAKE_PTR(source); + return 0; +} + +static int service_allocate_exec_fd( + Service *s, + sd_event_source **ret_event_source, + int* ret_exec_fd) { + + _cleanup_close_pair_ int p[2] = { -1, -1 }; + int r; + + assert(s); + assert(ret_event_source); + assert(ret_exec_fd); + + if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m"); + + r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source); + if (r < 0) + return r; + + p[0] = -1; + *ret_exec_fd = TAKE_FD(p[1]); + + return 0; +} + +static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) { + assert(s); + + /* Notifications are accepted depending on the process and + * the access setting of the service: + * process: \ access: NONE MAIN EXEC ALL + * main no yes yes yes + * control no no yes yes + * other (forked) no no no yes */ + + if (flags & EXEC_IS_CONTROL) + /* A control process */ + return IN_SET(s->notify_access, NOTIFY_EXEC, 
NOTIFY_ALL); + + /* We only spawn main processes and control processes, so any + * process that is not a control process is a main process */ + return s->notify_access != NOTIFY_NONE; +} + +static int service_spawn( + Service *s, + ExecCommand *c, + usec_t timeout, + ExecFlags flags, + pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = flags, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL; + _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL; + size_t n_env = 0; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(s)); /* This realizes the cgroup, among other things */ + if (r < 0) + return r; + + if (flags & EXEC_IS_CONTROL) { + /* If this is a control process, mask the permissions/chroot application if this is requested. */ + if (s->permissions_start_only) + exec_params.flags &= ~EXEC_APPLY_SANDBOXING; + if (s->root_directory_start_only) + exec_params.flags &= ~EXEC_APPLY_CHROOT; + } + + if ((flags & EXEC_PASS_FDS) || + s->exec_context.std_input == EXEC_INPUT_SOCKET || + s->exec_context.std_output == EXEC_OUTPUT_SOCKET || + s->exec_context.std_error == EXEC_OUTPUT_SOCKET) { + + r = service_collect_fds(s, + &exec_params.fds, + &exec_params.fd_names, + &exec_params.n_socket_fds, + &exec_params.n_storage_fds); + if (r < 0) + return r; + + log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds); + } + + if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + assert(!s->exec_fd_event_source); + + r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd); + if (r < 0) + return r; + } + + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout)); + if (r < 0) + return r; + + our_env = new0(char*, 10); + if (!our_env) + return -ENOMEM; + + if (service_exec_needs_notify_socket(s, flags)) + 
if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0) + return -ENOMEM; + + if (s->main_pid > 0) + if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid) < 0) + return -ENOMEM; + + if (MANAGER_IS_USER(UNIT(s)->manager)) + if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid_cached()) < 0) + return -ENOMEM; + + if (s->pid_file) + if (asprintf(our_env + n_env++, "PIDFILE=%s", s->pid_file) < 0) + return -ENOMEM; + + if (s->socket_fd >= 0) { + union sockaddr_union sa; + socklen_t salen = sizeof(sa); + + /* If this is a per-connection service instance, let's set $REMOTE_ADDR and $REMOTE_PORT to something + * useful. Note that we do this only when we are still connected at this point in time, which we might + * very well not be. Hence we ignore all errors when retrieving peer information (as that might result + * in ENOTCONN), and just use what we can use. */ + + if (getpeername(s->socket_fd, &sa.sa, &salen) >= 0 && + IN_SET(sa.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + _cleanup_free_ char *addr = NULL; + char *t; + unsigned port; + + r = sockaddr_pretty(&sa.sa, salen, true, false, &addr); + if (r < 0) + return r; + + t = strjoin("REMOTE_ADDR=", addr); + if (!t) + return -ENOMEM; + our_env[n_env++] = t; + + r = sockaddr_port(&sa.sa, &port); + if (r < 0) + return r; + + if (asprintf(&t, "REMOTE_PORT=%u", port) < 0) + return -ENOMEM; + our_env[n_env++] = t; + } + } + + if (flags & EXEC_SETENV_RESULT) { + if (asprintf(our_env + n_env++, "SERVICE_RESULT=%s", service_result_to_string(s->result)) < 0) + return -ENOMEM; + + if (s->main_exec_status.pid > 0 && + dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { + if (asprintf(our_env + n_env++, "EXIT_CODE=%s", sigchld_code_to_string(s->main_exec_status.code)) < 0) + return -ENOMEM; + + if (s->main_exec_status.code == CLD_EXITED) + r = asprintf(our_env + n_env++, "EXIT_STATUS=%i", s->main_exec_status.status); + else + r = asprintf(our_env + n_env++, 
"EXIT_STATUS=%s", signal_to_string(s->main_exec_status.status)); + if (r < 0) + return -ENOMEM; + } + } + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + final_env = strv_env_merge(2, exec_params.environment, our_env, NULL); + if (!final_env) + return -ENOMEM; + + /* System D-Bus needs nss-systemd disabled, so that we don't deadlock */ + SET_FLAG(exec_params.flags, EXEC_NSS_BYPASS_BUS, + MANAGER_IS_SYSTEM(UNIT(s)->manager) && unit_has_name(UNIT(s), SPECIAL_DBUS_SERVICE)); + + strv_free_and_replace(exec_params.environment, final_env); + exec_params.watchdog_usec = service_get_watchdog_usec(s); + exec_params.selinux_context_net = s->socket_fd_selinux_context_net; + if (s->type == SERVICE_IDLE) + exec_params.idle_pipe = UNIT(s)->manager->idle_pipe; + exec_params.stdin_fd = s->stdin_fd; + exec_params.stdout_fd = s->stdout_fd; + exec_params.stderr_fd = s->stderr_fd; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->dynamic_creds, + &pid); + if (r < 0) + return r; + + s->exec_fd_event_source = TAKE_PTR(exec_fd_source); + s->exec_fd_hot = false; + + r = unit_watch_pid(UNIT(s), pid, true); + if (r < 0) + return r; + + *_pid = pid; + + return 0; +} + +static int main_pid_good(Service *s) { + assert(s); + + /* Returns 0 if the pid is dead, > 0 if it is good, < 0 if we don't know */ + + /* If we know the pid file, then let's just check if it is + * still valid */ + if (s->main_pid_known) { + + /* If it's an alien child let's check if it is still + * alive ... */ + if (s->main_pid_alien && s->main_pid > 0) + return pid_is_alive(s->main_pid); + + /* .. otherwise assume we'll get a SIGCHLD for it, + * which we really should wait for to collect exit + * status and code */ + return s->main_pid > 0; + } + + /* We don't know the pid */ + return -EAGAIN; +} + +static int control_pid_good(Service *s) { + assert(s); + + /* Returns 0 if the control PID is dead, > 0 if it is good. 
We never actually return < 0 here, but in order to + * make this function as similar as possible to main_pid_good() and cgroup_good(), we pretend that < 0 also + * means: we can't figure it out. */ + + return s->control_pid > 0; +} + +static int cgroup_good(Service *s) { + int r; + + assert(s); + + /* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it exists and is populated, < 0 if we can't + * figure it out */ + + if (!UNIT(s)->cgroup_path) + return 0; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path); + if (r < 0) + return r; + + return r == 0; +} + +static bool service_shall_restart(Service *s, const char **reason) { + assert(s); + + /* Don't restart after manual stops */ + if (s->forbid_restart) { + *reason = "manual stop"; + return false; + } + + /* Never restart if this is configured as special exception */ + if (exit_status_set_test(&s->restart_prevent_status, s->main_exec_status.code, s->main_exec_status.status)) { + *reason = "prevented by exit status"; + return false; + } + + /* Restart if the exit code/status are configured as restart triggers */ + if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) { + *reason = "forced by exit status"; + return true; + } + + *reason = "restart setting"; + switch (s->restart) { + + case SERVICE_RESTART_NO: + return false; + + case SERVICE_RESTART_ALWAYS: + return true; + + case SERVICE_RESTART_ON_SUCCESS: + return s->result == SERVICE_SUCCESS; + + case SERVICE_RESTART_ON_FAILURE: + return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_SKIP_CONDITION); + + case SERVICE_RESTART_ON_ABNORMAL: + return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_FAILURE_EXIT_CODE, SERVICE_SKIP_CONDITION); + + case SERVICE_RESTART_ON_WATCHDOG: + return s->result == SERVICE_FAILURE_WATCHDOG; + + case SERVICE_RESTART_ON_ABORT: + return IN_SET(s->result, SERVICE_FAILURE_SIGNAL, SERVICE_FAILURE_CORE_DUMP); + + default: + assert_not_reached("unknown 
restart setting"); + } +} + +static bool service_will_restart(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (s->will_auto_restart) + return true; + if (s->state == SERVICE_AUTO_RESTART) + return true; + + return unit_will_restart_default(u); +} + +static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) { + ServiceState end_state; + int r; + + assert(s); + + /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not + * undo what has already been enqueued. */ + if (unit_stop_pending(UNIT(s))) + allow_restart = false; + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->result == SERVICE_SUCCESS) { + unit_log_success(UNIT(s)); + end_state = SERVICE_DEAD; + } else if (s->result == SERVICE_SKIP_CONDITION) { + unit_log_skip(UNIT(s), service_result_to_string(s->result)); + end_state = SERVICE_DEAD; + } else { + unit_log_failure(UNIT(s), service_result_to_string(s->result)); + end_state = SERVICE_FAILED; + } + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop); + + if (!allow_restart) + log_unit_debug(UNIT(s), "Service restart not allowed."); + else { + const char *reason; + bool shall_restart; + + shall_restart = service_shall_restart(s, &reason); + log_unit_debug(UNIT(s), "Service will %srestart (%s)", + shall_restart ? "" : "not ", + reason); + if (shall_restart) + s->will_auto_restart = true; + } + + /* Make sure service_release_resources() doesn't destroy our FD store, while we are changing through + * SERVICE_FAILED/SERVICE_DEAD before entering into SERVICE_AUTO_RESTART. */ + s->n_keep_fd_store ++; + + service_set_state(s, end_state); + + if (s->will_auto_restart) { + s->will_auto_restart = false; + + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->restart_usec)); + if (r < 0) { + s->n_keep_fd_store--; + goto fail; + } + + service_set_state(s, SERVICE_AUTO_RESTART); + } else + /* If we shan't restart, then flush out the restart counter. 
But don't do that immediately, so that the + * user can still introspect the counter. Do so on the next start. */ + s->flush_n_restarts = true; + + /* The new state is in effect, let's decrease the fd store ref counter again. Let's also re-add us to the GC + * queue, so that the fd store is possibly gc'ed again */ + s->n_keep_fd_store--; + unit_add_to_gc_queue(UNIT(s)); + + /* The next restart might not be a manual stop, hence reset the flag indicating manual stops */ + s->forbid_restart = false; + + /* We want fresh tmpdirs in case service is started again immediately */ + s->exec_runtime = exec_runtime_unref(s->exec_runtime, true); + + /* Also, remove the runtime directory */ + unit_destroy_runtime_data(UNIT(s), &s->exec_context); + + /* Get rid of the IPC bits of the user */ + unit_unref_uid_gid(UNIT(s), true); + + /* Release the user, and destroy it if we are the only remaining owner */ + dynamic_creds_destroy(&s->dynamic_creds); + + /* Try to delete the pid file. At this point it will be + * out-of-date, and some software might be confused by it, so + * let's remove it. 
*/ + if (s->pid_file) + (void) unlink(s->pid_file); + + /* Reset TTY ownership if necessary */ + exec_context_revert_tty(&s->exec_context); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run install restart timer: %m"); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, false); +} + +static void service_enter_stop_post(Service *s, ServiceResult f) { + int r; + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_STOP_POST; + + r = service_spawn(s, + s->control_command, + s->timeout_stop_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_STOP_POST); + } else + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-post' task: %m"); + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static int state_to_kill_operation(Service *s, ServiceState state) { + switch (state) { + + case SERVICE_STOP_WATCHDOG: + case SERVICE_FINAL_WATCHDOG: + return KILL_WATCHDOG; + + case SERVICE_STOP_SIGTERM: + if (unit_has_job_type(UNIT(s), JOB_RESTART)) + return KILL_RESTART; + _fallthrough_; + + case SERVICE_FINAL_SIGTERM: + return KILL_TERMINATE; + + case SERVICE_STOP_SIGKILL: + case SERVICE_FINAL_SIGKILL: + return KILL_KILL; + + default: + return _KILL_OPERATION_INVALID; + } +} + +static void service_enter_signal(Service *s, ServiceState state, ServiceResult f) { + int kill_operation, r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) 
unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + kill_operation = state_to_kill_operation(s, state); + r = unit_kill_context( + UNIT(s), + &s->kill_context, + kill_operation, + s->main_pid, + s->control_pid, + s->main_pid_alien); + if (r < 0) + goto fail; + + if (r > 0) { + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), + kill_operation == KILL_WATCHDOG ? service_timeout_abort_usec(s) : s->timeout_stop_usec)); + if (r < 0) + goto fail; + + service_set_state(s, state); + } else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM) && s->kill_context.send_sigkill) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS); + else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL)) + service_enter_stop_post(s, SERVICE_SUCCESS); + else if (IN_SET(state, SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM) && s->kill_context.send_sigkill) + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS); + else + service_enter_dead(s, SERVICE_SUCCESS, true); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + + if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL)) + service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES); + else + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); +} + +static void service_enter_stop_by_notify(Service *s) { + assert(s); + + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec)); + + /* The service told us it's stopping, so it's as if we SIGTERM'd it. 
*/ + service_set_state(s, SERVICE_STOP_SIGTERM); +} + +static void service_enter_stop(Service *s, ServiceResult f) { + int r; + + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + (void) unit_enqueue_rewatch_pids(UNIT(s)); + + s->control_command = s->exec_command[SERVICE_EXEC_STOP]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_STOP; + + r = service_spawn(s, + s->control_command, + s->timeout_stop_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_STOP); + } else + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop' task: %m"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static bool service_good(Service *s) { + int main_pid_ok; + assert(s); + + if (s->type == SERVICE_DBUS && !s->bus_name_good) + return false; + + main_pid_ok = main_pid_good(s); + if (main_pid_ok > 0) /* It's alive */ + return true; + if (main_pid_ok == 0) /* It's dead */ + return false; + + /* OK, we don't know anything about the main PID, maybe + * because there is none. Let's check the control group + * instead. 
*/ + + return cgroup_good(s) != 0; +} + +static void service_enter_running(Service *s, ServiceResult f) { + assert(s); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + service_unwatch_control_pid(s); + + if (s->result != SERVICE_SUCCESS) + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + else if (service_good(s)) { + + /* If there are any queued up sd_notify() notifications, process them now */ + if (s->notify_state == NOTIFY_RELOADING) + service_enter_reload_by_notify(s); + else if (s->notify_state == NOTIFY_STOPPING) + service_enter_stop_by_notify(s); + else { + service_set_state(s, SERVICE_RUNNING); + service_arm_timer(s, usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec)); + } + + } else if (s->remain_after_exit) + service_set_state(s, SERVICE_EXITED); + else + service_enter_stop(s, SERVICE_SUCCESS); +} + +static void service_enter_start_post(Service *s) { + int r; + assert(s); + + service_unwatch_control_pid(s); + service_reset_watchdog(s); + + s->control_command = s->exec_command[SERVICE_EXEC_START_POST]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_START_POST; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_START_POST); + } else + service_enter_running(s, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-post' task: %m"); + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); +} + +static void service_kill_control_process(Service *s) { + int r; + + assert(s); + + if (s->control_pid <= 0) + return; + + r = kill_and_sigcont(s->control_pid, SIGKILL); + if (r < 0) { + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(s->control_pid, &comm); + + log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", + s->control_pid, strna(comm)); 
+ } +} + +static int service_adverse_to_leftover_processes(Service *s) { + assert(s); + + /* KillMode=mixed and control group are used to indicate that all processes should be killed off. + * SendSIGKILL= is used for services that require a clean shutdown. These are typically database + * services where a SigKilled process would result in a lengthy recovery and whose shutdown or startup + * time is quite variable (so Timeout settings aren't of use). + * + * Here we take these two factors and refuse to start a service if there are existing processes + * within a control group. Databases, while generally having some protection against multiple + * instances running, let's not stress the rigor of these. Also ExecStartPre= parts of the service + * aren't as rigorously written to protect against multiple use. */ + + if (unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start) > 0 && + IN_SET(s->kill_context.kill_mode, KILL_MIXED, KILL_CONTROL_GROUP) && + !s->kill_context.send_sigkill) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EBUSY), + "Will not start SendSIGKILL=no service of type KillMode=control-group or mixed while processes exist"); + + return 0; +} + +static void service_enter_start(Service *s) { + ExecCommand *c; + usec_t timeout; + pid_t pid; + int r; + + assert(s); + + service_unwatch_control_pid(s); + service_unwatch_main_pid(s); + + r = service_adverse_to_leftover_processes(s); + if (r < 0) + goto fail; + + if (s->type == SERVICE_FORKING) { + s->control_command_id = SERVICE_EXEC_START; + c = s->control_command = s->exec_command[SERVICE_EXEC_START]; + + s->main_command = NULL; + } else { + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + s->control_command = NULL; + + c = s->main_command = s->exec_command[SERVICE_EXEC_START]; + } + + if (!c) { + if (s->type != SERVICE_ONESHOT) { + /* There's no command line configured for the main command? Hmm, that is strange. 
+ * This can only happen if the configuration changes at runtime. In this case, + * let's enter a failure state. */ + log_unit_error(UNIT(s), "There's no 'start' task anymore we could start."); + r = -ENXIO; + goto fail; + } + + /* We force a fake state transition here. Otherwise, the unit would go directly from + * SERVICE_DEAD to SERVICE_DEAD without SERVICE_ACTIVATING or SERVICE_ACTIVE + * in between. This way we can later trigger actions that depend on the state + * transition, including SuccessAction=. */ + service_set_state(s, SERVICE_START); + + service_enter_start_post(s); + return; + } + + if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) + /* For simple + idle this is the main process. We don't apply any timeout here, but + * service_enter_running() will later apply the .runtime_max_usec timeout. */ + timeout = USEC_INFINITY; + else + timeout = s->timeout_start_usec; + + r = service_spawn(s, + c, + timeout, + EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS, + &pid); + if (r < 0) + goto fail; + + if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) { + /* For simple services we immediately start + * the START_POST binaries. */ + + service_set_main_pid(s, pid); + service_enter_start_post(s); + + } else if (s->type == SERVICE_FORKING) { + + /* For forking services we wait until the start + * process exited. */ + + s->control_pid = pid; + service_set_state(s, SERVICE_START); + + } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) { + + /* For oneshot services we wait until the start process exited, too, but it is our main process. */ + + /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the + * bus. 'notify' and 'exec' services are similar. 
*/ + + service_set_main_pid(s, pid); + service_set_state(s, SERVICE_START); + } else + assert_not_reached("Unknown service type"); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start' task: %m"); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); +} + +static void service_enter_start_pre(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + + s->control_command = s->exec_command[SERVICE_EXEC_START_PRE]; + if (s->control_command) { + + r = service_adverse_to_leftover_processes(s); + if (r < 0) + goto fail; + + s->control_command_id = SERVICE_EXEC_START_PRE; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_START_PRE); + } else + service_enter_start(s); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-pre' task: %m"); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); +} + +static void service_enter_condition(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + + s->control_command = s->exec_command[SERVICE_EXEC_CONDITION]; + if (s->control_command) { + + r = service_adverse_to_leftover_processes(s); + if (r < 0) + goto fail; + + s->control_command_id = SERVICE_EXEC_CONDITION; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN, + &s->control_pid); + + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_CONDITION); + } else + service_enter_start_pre(s); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'exec-condition' task: %m"); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); +} + +static void service_enter_restart(Service *s) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + 
assert(s); + + if (unit_has_job_type(UNIT(s), JOB_STOP)) { + /* Don't restart things if we are going down anyway */ + log_unit_info(UNIT(s), "Stop job pending for unit, delaying automatic restart."); + + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->restart_usec)); + if (r < 0) + goto fail; + + return; + } + + /* Any units that are bound to this service must also be + * restarted. We use JOB_RESTART (instead of the more obvious + * JOB_START) here so that those dependency jobs will be added + * as well. */ + r = manager_add_job(UNIT(s)->manager, JOB_RESTART, UNIT(s), JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto fail; + + /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't fully + * stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset the counter + * explicitly however via the usual "systemctl reset-failed" logic. */ + s->n_restarts ++; + s->flush_n_restarts = false; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR, + LOG_UNIT_ID(UNIT(s)), + LOG_UNIT_INVOCATION_ID(UNIT(s)), + LOG_UNIT_MESSAGE(UNIT(s), "Scheduled restart job, restart counter is at %u.", s->n_restarts), + "N_RESTARTS=%u", s->n_restarts); + + /* Notify clients about changed restart counter */ + unit_add_to_dbus_queue(UNIT(s)); + + /* Note that we stay in the SERVICE_AUTO_RESTART state here, + * it will be canceled as part of the service_stop() call that + * is executed as part of JOB_RESTART. 
*/ + + return; + +fail: + log_unit_warning(UNIT(s), "Failed to schedule restart job: %s", bus_error_message(&error, r)); + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, false); +} + +static void service_enter_reload_by_notify(Service *s) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_start_usec)); + service_set_state(s, SERVICE_RELOAD); + + /* service_enter_reload_by_notify is never called during a reload, thus no loops are possible. */ + r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error); + if (r < 0) + log_unit_warning(UNIT(s), "Failed to schedule propagation of reload: %s", bus_error_message(&error, r)); +} + +static void service_enter_reload(Service *s) { + int r; + + assert(s); + + service_unwatch_control_pid(s); + s->reload_result = SERVICE_SUCCESS; + + s->control_command = s->exec_command[SERVICE_EXEC_RELOAD]; + if (s->control_command) { + s->control_command_id = SERVICE_EXEC_RELOAD; + + r = service_spawn(s, + s->control_command, + s->timeout_start_usec, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, + &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_RELOAD); + } else + service_enter_running(s, SERVICE_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'reload' task: %m"); + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); +} + +static void service_run_next_control(Service *s) { + usec_t timeout; + int r; + + assert(s); + assert(s->control_command); + assert(s->control_command->command_next); + + assert(s->control_command_id != SERVICE_EXEC_START); + + s->control_command = s->control_command->command_next; + service_unwatch_control_pid(s); + + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) + timeout = 
s->timeout_start_usec; + else + timeout = s->timeout_stop_usec; + + r = service_spawn(s, + s->control_command, + timeout, + EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL| + (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)| + (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0), + &s->control_pid); + if (r < 0) + goto fail; + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run next control task: %m"); + + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START_POST, SERVICE_STOP)) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + else if (s->state == SERVICE_STOP_POST) + service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); + else if (s->state == SERVICE_RELOAD) { + s->reload_result = SERVICE_FAILURE_RESOURCES; + service_enter_running(s, SERVICE_SUCCESS); + } else + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); +} + +static void service_run_next_main(Service *s) { + pid_t pid; + int r; + + assert(s); + assert(s->main_command); + assert(s->main_command->command_next); + assert(s->type == SERVICE_ONESHOT); + + s->main_command = s->main_command->command_next; + service_unwatch_main_pid(s); + + r = service_spawn(s, + s->main_command, + s->timeout_start_usec, + EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG, + &pid); + if (r < 0) + goto fail; + + service_set_main_pid(s, pid); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run next main task: %m"); + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); +} + +static int service_start(Unit *u) { + Service *s = SERVICE(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again 
later + * please! */ + if (IN_SET(s->state, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, SERVICE_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST)) + return 0; + + /* A service that will be restarted must be stopped first to + * trigger BindsTo and/or OnFailure dependencies. If a user + * does not want to wait for the holdoff time to elapse, the + * service should be manually restarted, not started. We + * simply return EAGAIN here, so that any start jobs stay + * queued, and assume that the auto restart timer will + * eventually trigger the restart. */ + if (s->state == SERVICE_AUTO_RESTART) + return -EAGAIN; + + assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED)); + + /* Make sure we don't enter a busy loop of some kind. */ + r = unit_test_start_limit(u); + if (r < 0) { + service_enter_dead(s, SERVICE_FAILURE_START_LIMIT_HIT, false); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + s->result = SERVICE_SUCCESS; + s->reload_result = SERVICE_SUCCESS; + s->main_pid_known = false; + s->main_pid_alien = false; + s->forbid_restart = false; + + s->status_text = mfree(s->status_text); + s->status_errno = 0; + + s->notify_state = NOTIFY_UNKNOWN; + + s->watchdog_original_usec = s->watchdog_usec; + s->watchdog_override_enable = false; + s->watchdog_override_usec = USEC_INFINITY; + + exec_command_reset_status_list_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX); + exec_status_reset(&s->main_exec_status); + + /* This is not an automatic restart? 
Flush the restart counter then */ + if (s->flush_n_restarts) { + s->n_restarts = 0; + s->flush_n_restarts = false; + } + + u->reset_accounting = true; + + service_enter_condition(s); + return 1; +} + +static int service_stop(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* Don't create restart jobs from manual stops. */ + s->forbid_restart = true; + + /* Already on it */ + if (IN_SET(s->state, + SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) + return 0; + + /* A restart will be scheduled or is in progress. */ + if (s->state == SERVICE_AUTO_RESTART) { + service_set_state(s, SERVICE_DEAD); + return 0; + } + + /* If there's already something running we go directly into + * kill mode. */ + if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RELOAD, SERVICE_STOP_WATCHDOG)) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS); + return 0; + } + + /* If we are currently cleaning, then abort it, brutally. 
*/ + if (s->state == SERVICE_CLEANING) { + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS); + return 0; + } + + assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED)); + + service_enter_stop(s, SERVICE_SUCCESS); + return 1; +} + +static int service_reload(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED)); + + service_enter_reload(s); + return 1; +} + +_pure_ static bool service_can_reload(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return !!s->exec_command[SERVICE_EXEC_RELOAD]; +} + +static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, ExecCommand *current) { + Service *s = SERVICE(u); + unsigned idx = 0; + ExecCommand *first, *c; + + assert(s); + assert(id >= 0); + assert(id < _SERVICE_EXEC_COMMAND_MAX); + + first = s->exec_command[id]; + + /* Figure out where we are in the list by walking back to the beginning */ + for (c = current; c != first; c = c->command_prev) + idx++; + + return idx; +} + +static int service_serialize_exec_command(Unit *u, FILE *f, ExecCommand *command) { + _cleanup_free_ char *args = NULL, *p = NULL; + size_t allocated = 0, length = 0; + Service *s = SERVICE(u); + const char *type, *key; + ServiceExecCommand id; + unsigned idx; + char **arg; + + assert(s); + assert(f); + + if (!command) + return 0; + + if (command == s->control_command) { + type = "control"; + id = s->control_command_id; + } else { + type = "main"; + id = SERVICE_EXEC_START; + } + + idx = service_exec_command_index(u, id, command); + + STRV_FOREACH(arg, command->argv) { + _cleanup_free_ char *e = NULL; + size_t n; + + e = cescape(*arg); + if (!e) + return log_oom(); + + n = strlen(e); + if (!GREEDY_REALLOC(args, allocated, length + 2 + n + 2)) + return log_oom(); + + if (length > 0) + args[length++] = ' '; + + args[length++] = '"'; + memcpy(args + length, e, n); + length += n; + args[length++] = '"'; + } + + if (!GREEDY_REALLOC(args, allocated, length + 
1)) + return log_oom(); + + args[length++] = 0; + + p = cescape(command->path); + if (!p) + return log_oom(); + + key = strjoina(type, "-command"); + (void) serialize_item_format(f, key, "%s %u %s %s", service_exec_command_to_string(id), idx, p, args); + + return 0; +} + +static int service_serialize(Unit *u, FILE *f, FDSet *fds) { + Service *s = SERVICE(u); + ServiceFDStore *fs; + int r; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", service_state_to_string(s->state)); + (void) serialize_item(f, "result", service_result_to_string(s->result)); + (void) serialize_item(f, "reload-result", service_result_to_string(s->reload_result)); + + if (s->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid); + + if (s->main_pid_known && s->main_pid > 0) + (void) serialize_item_format(f, "main-pid", PID_FMT, s->main_pid); + + (void) serialize_bool(f, "main-pid-known", s->main_pid_known); + (void) serialize_bool(f, "bus-name-good", s->bus_name_good); + (void) serialize_bool(f, "bus-name-owner", s->bus_name_owner); + + (void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts); + (void) serialize_bool(f, "flush-n-restarts", s->flush_n_restarts); + + r = serialize_item_escaped(f, "status-text", s->status_text); + if (r < 0) + return r; + + service_serialize_exec_command(u, f, s->control_command); + service_serialize_exec_command(u, f, s->main_command); + + r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd); + if (r < 0) + return r; + + if (s->exec_fd_event_source) { + r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source)); + if (r < 0) + return r; + + (void) serialize_bool(f, "exec-fd-hot", s->exec_fd_hot); + } + + if (UNIT_ISSET(s->accept_socket)) { + r = serialize_item(f, "accept-socket", 
UNIT_DEREF(s->accept_socket)->id); + if (r < 0) + return r; + } + + r = serialize_fd(f, fds, "socket-fd", s->socket_fd); + if (r < 0) + return r; + + LIST_FOREACH(fd_store, fs, s->fd_store) { + _cleanup_free_ char *c = NULL; + int copy; + + copy = fdset_put_dup(fds, fs->fd); + if (copy < 0) + return log_error_errno(copy, "Failed to copy file descriptor for serialization: %m"); + + c = cescape(fs->fdname); + if (!c) + return log_oom(); + + (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll); + } + + if (s->main_exec_status.pid > 0) { + (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid); + (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp); + (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp); + + if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { + (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code); + (void) serialize_item_format(f, "main-exec-status-status", "%i", s->main_exec_status.status); + } + } + + (void) serialize_dual_timestamp(f, "watchdog-timestamp", &s->watchdog_timestamp); + (void) serialize_bool(f, "forbid-restart", s->forbid_restart); + + if (s->watchdog_override_enable) + (void) serialize_item_format(f, "watchdog-override-usec", USEC_FMT, s->watchdog_override_usec); + + if (s->watchdog_original_usec != USEC_INFINITY) + (void) serialize_item_format(f, "watchdog-original-usec", USEC_FMT, s->watchdog_original_usec); + + return 0; +} + +static int service_deserialize_exec_command( + Unit *u, + const char *key, + const char *value) { + + Service *s = SERVICE(u); + int r; + unsigned idx = 0, i; + bool control, found = false; + ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID; + ExecCommand *command = NULL; + _cleanup_free_ char *path = NULL; + _cleanup_strv_free_ char **argv = NULL; + + enum ExecCommandState { + STATE_EXEC_COMMAND_TYPE, 
+ STATE_EXEC_COMMAND_INDEX, + STATE_EXEC_COMMAND_PATH, + STATE_EXEC_COMMAND_ARGS, + _STATE_EXEC_COMMAND_MAX, + _STATE_EXEC_COMMAND_INVALID = -1, + } state; + + assert(s); + assert(key); + assert(value); + + control = streq(key, "control-command"); + + state = STATE_EXEC_COMMAND_TYPE; + + for (;;) { + _cleanup_free_ char *arg = NULL; + + r = extract_first_word(&value, &arg, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE); + if (r < 0) + return r; + if (r == 0) + break; + + switch (state) { + case STATE_EXEC_COMMAND_TYPE: + id = service_exec_command_from_string(arg); + if (id < 0) + return -EINVAL; + + state = STATE_EXEC_COMMAND_INDEX; + break; + case STATE_EXEC_COMMAND_INDEX: + r = safe_atou(arg, &idx); + if (r < 0) + return -EINVAL; + + state = STATE_EXEC_COMMAND_PATH; + break; + case STATE_EXEC_COMMAND_PATH: + path = TAKE_PTR(arg); + state = STATE_EXEC_COMMAND_ARGS; + + if (!path_is_absolute(path)) + return -EINVAL; + break; + case STATE_EXEC_COMMAND_ARGS: + r = strv_extend(&argv, arg); + if (r < 0) + return -ENOMEM; + break; + default: + assert_not_reached("Unknown error at deserialization of exec command"); + break; + } + } + + if (state != STATE_EXEC_COMMAND_ARGS) + return -EINVAL; + + /* Let's check whether exec command on given offset matches data that we just deserialized */ + for (command = s->exec_command[id], i = 0; command; command = command->command_next, i++) { + if (i != idx) + continue; + + found = strv_equal(argv, command->argv) && streq(command->path, path); + break; + } + + if (!found) { + /* Command at the index we serialized is different, let's look for command that exactly + * matches but is on different index. If there is no such command we will not resume execution. 
*/ + for (command = s->exec_command[id]; command; command = command->command_next) + if (strv_equal(command->argv, argv) && streq(command->path, path)) + break; + } + + if (command && control) { + s->control_command = command; + s->control_command_id = id; + } else if (command) + s->main_command = command; + else + log_unit_warning(u, "Current command vanished from the unit file, execution of the command list won't be resumed."); + + return 0; +} + +static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Service *s = SERVICE(u); + int r; + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + ServiceState state; + + state = service_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + ServiceResult f; + + f = service_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SERVICE_SUCCESS) + s->result = f; + + } else if (streq(key, "reload-result")) { + ServiceResult f; + + f = service_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse reload result value: %s", value); + else if (f != SERVICE_SUCCESS) + s->reload_result = f; + + } else if (streq(key, "control-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + else + s->control_pid = pid; + } else if (streq(key, "main-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse main-pid value: %s", value); + else + (void) service_set_main_pid(s, pid); + } else if (streq(key, "main-pid-known")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse main-pid-known value: %s", value); + else + s->main_pid_known = b; + } else if (streq(key, "bus-name-good")) { + int b; + + b = 
parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse bus-name-good value: %s", value); + else + s->bus_name_good = b; + } else if (streq(key, "bus-name-owner")) { + r = free_and_strdup(&s->bus_name_owner, value); + if (r < 0) + log_unit_error_errno(u, r, "Unable to deserialize current bus owner %s: %m", value); + } else if (streq(key, "status-text")) { + char *t; + + r = cunescape(value, 0, &t); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to unescape status text '%s': %m", value); + else + free_and_replace(s->status_text, t); + + } else if (streq(key, "accept-socket")) { + Unit *socket; + + r = manager_load_unit(u->manager, value, NULL, NULL, &socket); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to load accept-socket unit '%s': %m", value); + else { + unit_ref_set(&s->accept_socket, u, socket); + SOCKET(socket)->n_connections++; + } + + } else if (streq(key, "socket-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse socket-fd value: %s", value); + else { + asynchronous_close(s->socket_fd); + s->socket_fd = fdset_remove(fds, fd); + } + } else if (streq(key, "fd-store-fd")) { + _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL; + int fd; + int do_poll; + + r = extract_first_word(&value, &fdv, NULL, 0); + if (r <= 0 || safe_atoi(fdv, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) { + log_unit_debug(u, "Failed to parse fd-store-fd value: %s", value); + return 0; + } + + r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE); + if (r <= 0) { + log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\": %m", value); + return 0; + } + + r = extract_first_word(&value, &fdp, NULL, 0); + if (r == 0) { + /* If the value is not present, we assume the default */ + do_poll = 1; + } else if (r < 0 || safe_atoi(fdp, &do_poll) < 0) { + log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\": %m", value); + return 0; + } 
+ + r = service_add_fd_store(s, fd, fdn, do_poll); + if (r < 0) + log_unit_error_errno(u, r, "Failed to add fd to store: %m"); + else + fdset_remove(fds, fd); + } else if (streq(key, "main-exec-status-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-pid value: %s", value); + else + s->main_exec_status.pid = pid; + } else if (streq(key, "main-exec-status-code")) { + int i; + + if (safe_atoi(value, &i) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-code value: %s", value); + else + s->main_exec_status.code = i; + } else if (streq(key, "main-exec-status-status")) { + int i; + + if (safe_atoi(value, &i) < 0) + log_unit_debug(u, "Failed to parse main-exec-status-status value: %s", value); + else + s->main_exec_status.status = i; + } else if (streq(key, "main-exec-status-start")) + deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp); + else if (streq(key, "main-exec-status-exit")) + deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp); + else if (streq(key, "watchdog-timestamp")) + deserialize_dual_timestamp(value, &s->watchdog_timestamp); + else if (streq(key, "forbid-restart")) { + int b; + + b = parse_boolean(value); + if (b < 0) + log_unit_debug(u, "Failed to parse forbid-restart value: %s", value); + else + s->forbid_restart = b; + } else if (streq(key, "stdin-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse stdin-fd value: %s", value); + else { + asynchronous_close(s->stdin_fd); + s->stdin_fd = fdset_remove(fds, fd); + s->exec_context.stdio_as_fds = true; + } + } else if (streq(key, "stdout-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse stdout-fd value: %s", value); + else { + asynchronous_close(s->stdout_fd); + s->stdout_fd = fdset_remove(fds, fd); + s->exec_context.stdio_as_fds = true; + } + } 
else if (streq(key, "stderr-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse stderr-fd value: %s", value); + else { + asynchronous_close(s->stderr_fd); + s->stderr_fd = fdset_remove(fds, fd); + s->exec_context.stdio_as_fds = true; + } + } else if (streq(key, "exec-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse exec-fd value: %s", value); + else { + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + fd = fdset_remove(fds, fd); + if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0) + safe_close(fd); + } + } else if (streq(key, "watchdog-override-usec")) { + if (deserialize_usec(value, &s->watchdog_override_usec) < 0) + log_unit_debug(u, "Failed to parse watchdog_override_usec value: %s", value); + else + s->watchdog_override_enable = true; + + } else if (streq(key, "watchdog-original-usec")) { + if (deserialize_usec(value, &s->watchdog_original_usec) < 0) + log_unit_debug(u, "Failed to parse watchdog_original_usec value: %s", value); + + } else if (STR_IN_SET(key, "main-command", "control-command")) { + r = service_deserialize_exec_command(u, key, value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized command \"%s\": %m", value); + + } else if (streq(key, "n-restarts")) { + r = safe_atou(value, &s->n_restarts); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value); + + } else if (streq(key, "flush-n-restarts")) { + r = parse_boolean(value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to parse serialized flush restart counter setting '%s': %m", value); + else + s->flush_n_restarts = r; + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState service_active_state(Unit *u) { + const UnitActiveState *table; + + assert(u); + 
+ table = SERVICE(u)->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; + + return table[SERVICE(u)->state]; +} + +static const char *service_sub_state_to_string(Unit *u) { + assert(u); + + return service_state_to_string(SERVICE(u)->state); +} + +static bool service_may_gc(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* Never clean up services that still have a process around, even if the service is formally dead. Note that + * unit_may_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they + * have moved outside of the cgroup. */ + + if (main_pid_good(s) > 0 || + control_pid_good(s) > 0) + return false; + + return true; +} + +static int service_retry_pid_file(Service *s) { + int r; + + assert(s->pid_file); + assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); + + r = service_load_pid_file(s, false); + if (r < 0) + return r; + + service_unwatch_pid_file(s); + + service_enter_running(s, SERVICE_SUCCESS); + return 0; +} + +static int service_watch_pid_file(Service *s) { + int r; + + log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path); + + r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io); + if (r < 0) + goto fail; + + /* the pidfile might have appeared just before we set the watch */ + log_unit_debug(UNIT(s), "Trying to read PID file %s in case it changed", s->pid_file_pathspec->path); + service_retry_pid_file(s); + + return 0; +fail: + log_unit_error_errno(UNIT(s), r, "Failed to set a watch for PID file %s: %m", s->pid_file_pathspec->path); + service_unwatch_pid_file(s); + return r; +} + +static int service_demand_pid_file(Service *s) { + PathSpec *ps; + + assert(s->pid_file); + assert(!s->pid_file_pathspec); + + ps = new0(PathSpec, 1); + if (!ps) + return -ENOMEM; + + ps->unit = UNIT(s); + ps->path = strdup(s->pid_file); + if (!ps->path) { + free(ps); + return -ENOMEM; + } + + path_simplify(ps->path, false); + + /* 
PATH_CHANGED would not be enough. There are daemons (sendmail) that + * keep their PID file open all the time. */ + ps->type = PATH_MODIFIED; + ps->inotify_fd = -1; + + s->pid_file_pathspec = ps; + + return service_watch_pid_file(s); +} + +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + PathSpec *p = userdata; + Service *s; + + assert(p); + + s = SERVICE(p->unit); + + assert(s); + assert(fd >= 0); + assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); + assert(s->pid_file_pathspec); + assert(path_spec_owns_inotify_fd(s->pid_file_pathspec, fd)); + + log_unit_debug(UNIT(s), "inotify event"); + + if (path_spec_fd_event(p, events) < 0) + goto fail; + + if (service_retry_pid_file(s) == 0) + return 0; + + if (service_watch_pid_file(s) < 0) + goto fail; + + return 0; + +fail: + service_unwatch_pid_file(s); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); + return 0; +} + +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + + log_unit_debug(UNIT(s), "got exec-fd event"); + + /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() + * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically + * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the + * parent. We need to be careful however, as there are other reasons that we might cause the child's side of + * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless + * the child signalled us first that it is about to call the execve(). It does so by sending us a simple + * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it + * sends a zero byte we'll ignore POLLHUP on the fd again. 
*/ + + for (;;) { + uint8_t x; + ssize_t n; + + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for */ + + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ + log_unit_debug(UNIT(s), "Got EOF on exec-fd"); + + s->exec_fd_hot = false; + + /* Nice! This is what we have been waiting for. Transition to next state. */ + if (s->type == SERVICE_EXEC && s->state == SERVICE_START) + service_enter_start_post(s); + } else + log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring."); + + return 0; + } + + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); + s->exec_fd_hot = x; + } + + return 0; +} + +static void service_notify_cgroup_empty_event(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + log_unit_debug(u, "Control group is empty."); + + switch (s->state) { + + /* Waiting for SIGCHLD is usually more interesting, + * because it includes return codes/signals. Which is + * why we ignore the cgroup events for most cases, + * except when we don't know pid which to expect the + * SIGCHLD for. */ + + case SERVICE_START: + if (s->type == SERVICE_NOTIFY && + main_pid_good(s) == 0 && + control_pid_good(s) == 0) { + /* No chance of getting a ready notification anymore */ + service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL); + break; + } + + _fallthrough_; + case SERVICE_START_POST: + if (s->pid_file_pathspec && + main_pid_good(s) == 0 && + control_pid_good(s) == 0) { + + /* Give up hoping for the daemon to write its PID file */ + log_unit_warning(u, "Daemon never wrote its PID file. 
Failing."); + + service_unwatch_pid_file(s); + if (s->state == SERVICE_START) + service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL); + else + service_enter_stop(s, SERVICE_FAILURE_PROTOCOL); + } + break; + + case SERVICE_RUNNING: + /* service_enter_running() will figure out what to do */ + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0) + service_enter_stop_post(s, SERVICE_SUCCESS); + + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0) + service_enter_dead(s, SERVICE_SUCCESS, true); + + break; + + /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean + * up the cgroup earlier and should do it now. */ + case SERVICE_DEAD: + case SERVICE_FAILED: + unit_prune_cgroup(u); + break; + + default: + ; + } +} + +static void service_notify_cgroup_oom_event(Unit *u) { + Service *s = SERVICE(u); + + log_unit_debug(u, "Process of control group was killed by the OOM killer."); + + if (s->oom_policy == OOM_CONTINUE) + return; + + switch (s->state) { + + case SERVICE_CONDITION: + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + case SERVICE_STOP: + if (s->oom_policy == OOM_STOP) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL); + else if (s->oom_policy == OOM_KILL) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + + break; + + case SERVICE_EXITED: + case SERVICE_RUNNING: + if (s->oom_policy == OOM_STOP) + service_enter_stop(s, SERVICE_FAILURE_OOM_KILL); + else if (s->oom_policy == OOM_KILL) + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + service_enter_signal(s, 
SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL); + break; + + case SERVICE_STOP_SIGKILL: + case SERVICE_FINAL_SIGKILL: + if (s->result == SERVICE_SUCCESS) + s->result = SERVICE_FAILURE_OOM_KILL; + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL); + break; + + default: + ; + } +} + +static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { + bool notify_dbus = true; + Service *s = SERVICE(u); + ServiceResult f; + ExitClean clean_mode; + + assert(s); + assert(pid >= 0); + + /* Oneshot services and non-SERVICE_EXEC_START commands should not be + * considered daemons as they are typically not long running. */ + if (s->type == SERVICE_ONESHOT || (s->control_pid == pid && s->control_command_id != SERVICE_EXEC_START)) + clean_mode = EXIT_CLEAN_COMMAND; + else + clean_mode = EXIT_CLEAN_DAEMON; + + if (is_clean_exit(code, status, clean_mode, &s->success_status)) + f = SERVICE_SUCCESS; + else if (code == CLD_EXITED) + f = SERVICE_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SERVICE_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SERVICE_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown code"); + + if (s->main_pid == pid) { + /* Forking services may occasionally move to a new PID. + * As long as they update the PID file before exiting the old + * PID, they're fine. 
*/ + if (service_load_pid_file(s, false) > 0) + return; + + s->main_pid = 0; + exec_status_exit(&s->main_exec_status, &s->exec_context, pid, code, status); + + if (s->main_command) { + /* If this is not a forking service than the + * main process got started and hence we copy + * the exit status so that it is recorded both + * as main and as control process exit + * status */ + + s->main_command->exec_status = s->main_exec_status; + + if (s->main_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } else if (s->exec_command[SERVICE_EXEC_START]) { + + /* If this is a forked process, then we should + * ignore the return value if this was + * configured for the starter process */ + + if (s->exec_command[SERVICE_EXEC_START]->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } + + unit_log_process_exit( + u, + "Main process", + service_exec_command_to_string(SERVICE_EXEC_START), + f == SERVICE_SUCCESS, + code, status); + + if (s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->main_command && + s->main_command->command_next && + s->type == SERVICE_ONESHOT && + f == SERVICE_SUCCESS) { + + /* There is another command to * + * execute, so let's do that. */ + + log_unit_debug(u, "Running next main command for state %s.", service_state_to_string(s->state)); + service_run_next_main(s); + + } else { + + /* The service exited, so the service is officially + * gone. 
*/ + s->main_command = NULL; + + switch (s->state) { + + case SERVICE_START_POST: + case SERVICE_RELOAD: + case SERVICE_STOP: + /* Need to wait until the operation is + * done */ + break; + + case SERVICE_START: + if (s->type == SERVICE_ONESHOT) { + /* This was our main goal, so let's go on */ + if (f == SERVICE_SUCCESS) + service_enter_start_post(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } else if (s->type == SERVICE_NOTIFY) { + /* Only enter running through a notification, so that the + * SERVICE_START state signifies that no ready notification + * has been received */ + if (f != SERVICE_SUCCESS) + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + else if (!s->remain_after_exit || s->notify_access == NOTIFY_MAIN) + /* The service has never been and will never be active */ + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL); + break; + } + + _fallthrough_; + case SERVICE_RUNNING: + service_enter_running(s, f); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (control_pid_good(s) <= 0) + service_enter_stop_post(s, f); + + /* If there is still a control process, wait for that first */ + break; + + case SERVICE_STOP_POST: + + if (control_pid_good(s) <= 0) + service_enter_signal(s, SERVICE_FINAL_SIGTERM, f); + + break; + + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + + if (control_pid_good(s) <= 0) + service_enter_dead(s, f, true); + break; + + default: + assert_not_reached("Uh, main process died at wrong time."); + } + } + + } else if (s->control_pid == pid) { + s->control_pid = 0; + + /* ExecCondition= calls that exit with (0, 254] should invoke skip-like behavior instead of failing */ + if (f == SERVICE_FAILURE_EXIT_CODE && s->state == SERVICE_CONDITION && status < 255) + f = SERVICE_SKIP_CONDITION; + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, 
status); + + if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SERVICE_SUCCESS; + } + + unit_log_process_exit( + u, + "Control process", + service_exec_command_to_string(s->control_command_id), + f == SERVICE_SUCCESS, + code, status); + + if (s->state != SERVICE_RELOAD && s->result == SERVICE_SUCCESS) + s->result = f; + + if (s->control_command && + s->control_command->command_next && + f == SERVICE_SUCCESS) { + + /* There is another command to * + * execute, so let's do that. */ + + log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state)); + service_run_next_control(s); + + } else { + /* No further commands for this step, so let's + * figure out what to do next */ + + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + log_unit_debug(u, "Got final SIGCHLD for state %s.", service_state_to_string(s->state)); + + switch (s->state) { + + case SERVICE_CONDITION: + if (f == SERVICE_SUCCESS) + service_enter_start_pre(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_START_PRE: + if (f == SERVICE_SUCCESS) + service_enter_start(s); + else + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_START: + if (s->type != SERVICE_FORKING) + /* Maybe spurious event due to a reload that changed the type? */ + break; + + if (f != SERVICE_SUCCESS) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } + + if (s->pid_file) { + bool has_start_post; + int r; + + /* Let's try to load the pid file here if we can. + * The PID file might actually be created by a START_POST + * script. In that case don't worry if the loading fails. 
*/ + + has_start_post = s->exec_command[SERVICE_EXEC_START_POST]; + r = service_load_pid_file(s, !has_start_post); + if (!has_start_post && r < 0) { + r = service_demand_pid_file(s); + if (r < 0 || cgroup_good(s) == 0) + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL); + break; + } + } else + service_search_main_pid(s); + + service_enter_start_post(s); + break; + + case SERVICE_START_POST: + if (f != SERVICE_SUCCESS) { + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + } + + if (s->pid_file) { + int r; + + r = service_load_pid_file(s, true); + if (r < 0) { + r = service_demand_pid_file(s); + if (r < 0 || cgroup_good(s) == 0) + service_enter_stop(s, SERVICE_FAILURE_PROTOCOL); + break; + } + } else + service_search_main_pid(s); + + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_RELOAD: + if (f == SERVICE_SUCCESS) + if (service_load_pid_file(s, true) < 0) + service_search_main_pid(s); + + s->reload_result = f; + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP: + service_enter_signal(s, SERVICE_STOP_SIGTERM, f); + break; + + case SERVICE_STOP_WATCHDOG: + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + if (main_pid_good(s) <= 0) + service_enter_stop_post(s, f); + + /* If there is still a service process around, wait until + * that one quit, too */ + break; + + case SERVICE_STOP_POST: + if (main_pid_good(s) <= 0) + service_enter_signal(s, SERVICE_FINAL_SIGTERM, f); + break; + + case SERVICE_FINAL_WATCHDOG: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0) + service_enter_dead(s, f, true); + break; + + case SERVICE_CLEANING: + + if (s->clean_result == SERVICE_SUCCESS) + s->clean_result = f; + + service_enter_dead(s, SERVICE_SUCCESS, false); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + } + } else /* Neither control nor main PID? 
If so, don't notify about anything */ + notify_dbus = false; + + /* Notify clients about changed exit status */ + if (notify_dbus) + unit_add_to_dbus_queue(u); + + /* We watch the main/control process otherwise we can't retrieve the unit they + * belong to with cgroupv1. But if they are not our direct child, we won't get a + * SIGCHLD for them. Therefore we need to look for others to watch so we can + * detect when the cgroup becomes empty. Note that the control process is always + * our child so it's pointless to watch all other processes. */ + if (!control_pid_good(s)) + if (!s->main_pid_known || s->main_pid_alien) + (void) unit_enqueue_rewatch_pids(u); +} + +static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + assert(source == s->timer_event_source); + + switch (s->state) { + + case SERVICE_CONDITION: + case SERVICE_START_PRE: + case SERVICE_START: + case SERVICE_START_POST: + switch (s->timeout_start_failure_mode) { + + case SERVICE_TIMEOUT_TERMINATE: + log_unit_warning(UNIT(s), "%s operation timed out. Terminating.", service_state_to_string(s->state)); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_ABORT: + log_unit_warning(UNIT(s), "%s operation timed out. Aborting.", service_state_to_string(s->state)); + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_KILL: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "%s operation timed out. Killing.", service_state_to_string(s->state)); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "%s operation timed out. 
Skipping SIGKILL.", service_state_to_string(s->state)); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + break; + + default: + assert_not_reached("unknown timeout mode"); + } + break; + + case SERVICE_RUNNING: + log_unit_warning(UNIT(s), "Service reached runtime time limit. Stopping."); + service_enter_stop(s, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_RELOAD: + log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process."); + service_kill_control_process(s); + s->reload_result = SERVICE_FAILURE_TIMEOUT; + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP: + switch (s->timeout_stop_failure_mode) { + + case SERVICE_TIMEOUT_TERMINATE: + log_unit_warning(UNIT(s), "Stopping timed out. Terminating."); + service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_ABORT: + log_unit_warning(UNIT(s), "Stopping timed out. Aborting."); + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_KILL: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + break; + + default: + assert_not_reached("unknown timeout mode"); + } + break; + + case SERVICE_STOP_WATCHDOG: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + break; + + case SERVICE_STOP_SIGTERM: + if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. 
Aborting."); + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + } else if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Killing."); + service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Skipping SIGKILL."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + } + + break; + + case SERVICE_STOP_SIGKILL: + /* Uh, we sent a SIGKILL and it is still not gone? + * Must be something we cannot kill, so let's just be + * weirded out and continue */ + + log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring."); + service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_STOP_POST: + switch (s->timeout_stop_failure_mode) { + + case SERVICE_TIMEOUT_TERMINATE: + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Terminating."); + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_ABORT: + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Aborting."); + service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + break; + + case SERVICE_TIMEOUT_KILL: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'stop-post' timed out. Skipping SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + break; + + default: + assert_not_reached("unknown timeout mode"); + } + break; + + case SERVICE_FINAL_WATCHDOG: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Skipping SIGKILL. 
Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + break; + + case SERVICE_FINAL_SIGTERM: + if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) { + log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Aborting."); + service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT); + } else if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Killing."); + service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Skipping SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false); + } + + break; + + case SERVICE_FINAL_SIGKILL: + log_unit_warning(UNIT(s), "Processes still around after final SIGKILL. Entering failed mode."); + service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, true); + break; + + case SERVICE_AUTO_RESTART: + if (s->restart_usec > 0) { + char buf_restart[FORMAT_TIMESPAN_MAX]; + log_unit_debug(UNIT(s), + "Service RestartSec=%s expired, scheduling restart.", + format_timespan(buf_restart, sizeof buf_restart, s->restart_usec, USEC_PER_SEC)); + } else + log_unit_debug(UNIT(s), + "Service has no hold-off time (RestartSec=0), scheduling restart."); + + service_enter_restart(s); + break; + + case SERVICE_CLEANING: + log_unit_warning(UNIT(s), "Cleaning timed out. 
killing."); + + if (s->clean_result == SERVICE_SUCCESS) + s->clean_result = SERVICE_FAILURE_TIMEOUT; + + service_enter_signal(s, SERVICE_FINAL_SIGKILL, 0); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) { + Service *s = SERVICE(userdata); + char t[FORMAT_TIMESPAN_MAX]; + usec_t watchdog_usec; + + assert(s); + assert(source == s->watchdog_event_source); + + watchdog_usec = service_get_watchdog_usec(s); + + if (UNIT(s)->manager->service_watchdogs) { + log_unit_error(UNIT(s), "Watchdog timeout (limit %s)!", + format_timespan(t, sizeof(t), watchdog_usec, 1)); + + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); + } else + log_unit_warning(UNIT(s), "Watchdog disabled! Ignoring watchdog timeout (limit %s)!", + format_timespan(t, sizeof(t), watchdog_usec, 1)); + + return 0; +} + +static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) { + assert(s); + + if (s->notify_access == NOTIFY_NONE) { + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid); + return false; + } + + if (s->notify_access == NOTIFY_MAIN && pid != s->main_pid) { + if (s->main_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid); + else + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid); + + return false; + } + + if (s->notify_access == NOTIFY_EXEC && pid != s->main_pid && pid != s->control_pid) { + if (s->main_pid != 0 && s->control_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT, + pid, s->main_pid, s->control_pid); + else if (s->main_pid != 0) + 
log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid); + else if (s->control_pid != 0) + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid); + else + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid); + + return false; + } + + return true; +} + +static void service_force_watchdog(Service *s) { + if (!UNIT(s)->manager->service_watchdogs) + return; + + log_unit_error(UNIT(s), "Watchdog request (last status: %s)!", + s->status_text ? s->status_text : "<unset>"); + + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); +} + +static void service_notify_message( + Unit *u, + const struct ucred *ucred, + char * const *tags, + FDSet *fds) { + + Service *s = SERVICE(u); + bool notify_dbus = false; + const char *e; + char * const *i; + int r; + + assert(u); + assert(ucred); + + if (!service_notify_message_authorized(SERVICE(u), ucred->pid, fds)) + return; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *cc = NULL; + + cc = strv_join(tags, ", "); + log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, isempty(cc) ? "n/a" : cc); + } + + /* Interpret MAINPID= */ + e = strv_find_startswith(tags, "MAINPID="); + if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) { + pid_t new_main_pid; + + if (parse_pid(e, &new_main_pid) < 0) + log_unit_warning(u, "Failed to parse MAINPID= field in notification message, ignoring: %s", e); + else if (!s->main_pid_known || new_main_pid != s->main_pid) { + + r = service_is_suitable_main_pid(s, new_main_pid, LOG_WARNING); + if (r == 0) { + /* The new main PID is a bit suspicious, which is OK if the sender is privileged. 
*/ + + if (ucred->uid == 0) { + log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid); + r = 1; + } else + log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid); + } + if (r > 0) { + service_set_main_pid(s, new_main_pid); + + r = unit_watch_pid(UNIT(s), new_main_pid, false); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to watch new main PID "PID_FMT" for service: %m", new_main_pid); + + notify_dbus = true; + } + } + } + + /* Interpret READY=/STOPPING=/RELOADING=. Last one wins. */ + STRV_FOREACH_BACKWARDS(i, tags) { + + if (streq(*i, "READY=1")) { + s->notify_state = NOTIFY_READY; + + /* Type=notify services inform us about completed + * initialization with READY=1 */ + if (s->type == SERVICE_NOTIFY && s->state == SERVICE_START) + service_enter_start_post(s); + + /* Sending READY=1 while we are reloading informs us + * that the reloading is complete */ + if (s->state == SERVICE_RELOAD && s->control_pid == 0) + service_enter_running(s, SERVICE_SUCCESS); + + notify_dbus = true; + break; + + } else if (streq(*i, "RELOADING=1")) { + s->notify_state = NOTIFY_RELOADING; + + if (s->state == SERVICE_RUNNING) + service_enter_reload_by_notify(s); + + notify_dbus = true; + break; + + } else if (streq(*i, "STOPPING=1")) { + s->notify_state = NOTIFY_STOPPING; + + if (s->state == SERVICE_RUNNING) + service_enter_stop_by_notify(s); + + notify_dbus = true; + break; + } + } + + /* Interpret STATUS= */ + e = strv_find_startswith(tags, "STATUS="); + if (e) { + _cleanup_free_ char *t = NULL; + + if (!isempty(e)) { + /* Note that this size limit check is mostly paranoia: since the datagram size we are willing + * to process is already limited to NOTIFY_BUFFER_MAX, this limit here should never be hit. 
*/ + if (strlen(e) > STATUS_TEXT_MAX) + log_unit_warning(u, "Status message overly long (%zu > %u), ignoring.", strlen(e), STATUS_TEXT_MAX); + else if (!utf8_is_valid(e)) + log_unit_warning(u, "Status message in notification message is not UTF-8 clean, ignoring."); + else { + t = strdup(e); + if (!t) + log_oom(); + } + } + + if (!streq_ptr(s->status_text, t)) { + free_and_replace(s->status_text, t); + notify_dbus = true; + } + } + + /* Interpret ERRNO= */ + e = strv_find_startswith(tags, "ERRNO="); + if (e) { + int status_errno; + + status_errno = parse_errno(e); + if (status_errno < 0) + log_unit_warning_errno(u, status_errno, + "Failed to parse ERRNO= field value '%s' in notification message: %m", e); + else if (s->status_errno != status_errno) { + s->status_errno = status_errno; + notify_dbus = true; + } + } + + /* Interpret EXTEND_TIMEOUT= */ + e = strv_find_startswith(tags, "EXTEND_TIMEOUT_USEC="); + if (e) { + usec_t extend_timeout_usec; + if (safe_atou64(e, &extend_timeout_usec) < 0) + log_unit_warning(u, "Failed to parse EXTEND_TIMEOUT_USEC=%s", e); + else + service_extend_timeout(s, extend_timeout_usec); + } + + /* Interpret WATCHDOG= */ + e = strv_find_startswith(tags, "WATCHDOG="); + if (e) { + if (streq(e, "1")) + service_reset_watchdog(s); + else if (streq(e, "trigger")) + service_force_watchdog(s); + else + log_unit_warning(u, "Passed WATCHDOG= field is invalid, ignoring."); + } + + e = strv_find_startswith(tags, "WATCHDOG_USEC="); + if (e) { + usec_t watchdog_override_usec; + if (safe_atou64(e, &watchdog_override_usec) < 0) + log_unit_warning(u, "Failed to parse WATCHDOG_USEC=%s", e); + else + service_override_watchdog_timeout(s, watchdog_override_usec); + } + + /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases, + * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing + * fds, but optional when pushing in new fds, for compatibility reasons. 
*/ + if (strv_find(tags, "FDSTOREREMOVE=1")) { + const char *name; + + name = strv_find_startswith(tags, "FDNAME="); + if (!name || !fdname_is_valid(name)) + log_unit_warning(u, "FDSTOREREMOVE=1 requested, but no valid file descriptor name passed, ignoring."); + else + service_remove_fd_store(s, name); + + } else if (strv_find(tags, "FDSTORE=1")) { + const char *name; + + name = strv_find_startswith(tags, "FDNAME="); + if (name && !fdname_is_valid(name)) { + log_unit_warning(u, "Passed FDNAME= name is invalid, ignoring."); + name = NULL; + } + + (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0")); + } + + /* Notify clients about changed status or main pid */ + if (notify_dbus) + unit_add_to_dbus_queue(u); +} + +static int service_get_timeout(Unit *u, usec_t *timeout) { + Service *s = SERVICE(u); + uint64_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +static void service_bus_name_owner_change(Unit *u, const char *new_owner) { + + Service *s = SERVICE(u); + int r; + + assert(s); + + if (new_owner) + log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner); + else + log_unit_debug(u, "D-Bus name %s now not owned by anyone.", s->bus_name); + + s->bus_name_good = new_owner; + + /* Track the current owner, so we can reconstruct changes after a daemon reload */ + r = free_and_strdup(&s->bus_name_owner, new_owner); + if (r < 0) { + log_unit_error_errno(u, r, "Unable to set new bus name owner %s: %m", new_owner); + return; + } + + if (s->type == SERVICE_DBUS) { + + /* service_enter_running() will figure out what to + * do */ + if (s->state == SERVICE_RUNNING) + service_enter_running(s, SERVICE_SUCCESS); + else if (s->state == SERVICE_START && new_owner) + service_enter_start_post(s); + + } else if (new_owner && + s->main_pid <= 0 && + IN_SET(s->state, + 
SERVICE_START, + SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD)) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + /* Try to acquire PID from bus service */ + + r = sd_bus_get_name_creds(u->manager->api_bus, s->bus_name, SD_BUS_CREDS_PID, &creds); + if (r >= 0) + r = sd_bus_creds_get_pid(creds, &pid); + if (r >= 0) { + log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pid); + + service_set_main_pid(s, pid); + unit_watch_pid(UNIT(s), pid, false); + } + } +} + +int service_set_socket_fd(Service *s, int fd, Socket *sock, bool selinux_context_net) { + _cleanup_free_ char *peer = NULL; + int r; + + assert(s); + assert(fd >= 0); + + /* This is called by the socket code when instantiating a new service for a stream socket and the socket needs + * to be configured. We take ownership of the passed fd on success. */ + + if (UNIT(s)->load_state != UNIT_LOADED) + return -EINVAL; + + if (s->socket_fd >= 0) + return -EBUSY; + + if (s->state != SERVICE_DEAD) + return -EAGAIN; + + if (getpeername_pretty(fd, true, &peer) >= 0) { + + if (UNIT(s)->description) { + _cleanup_free_ char *a; + + a = strjoin(UNIT(s)->description, " (", peer, ")"); + if (!a) + return -ENOMEM; + + r = unit_set_description(UNIT(s), a); + } else + r = unit_set_description(UNIT(s), peer); + + if (r < 0) + return r; + } + + r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + + s->socket_fd = fd; + s->socket_fd_selinux_context_net = selinux_context_net; + + unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock)); + return 0; +} + +static void service_reset_failed(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + if (s->state == SERVICE_FAILED) + service_set_state(s, SERVICE_DEAD); + + s->result = SERVICE_SUCCESS; + s->reload_result = SERVICE_SUCCESS; + s->clean_result = SERVICE_SUCCESS; + s->n_restarts = 0; + s->flush_n_restarts = false; +} + +static 
int service_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + Service *s = SERVICE(u); + + assert(s); + + return unit_kill_common(u, who, signo, s->main_pid, s->control_pid, error); +} + +static int service_main_pid(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return s->main_pid; +} + +static int service_control_pid(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + return s->control_pid; +} + +static bool service_needs_console(Unit *u) { + Service *s = SERVICE(u); + + assert(s); + + /* We provide our own implementation of this here, instead of relying of the generic implementation + * unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. */ + + if (!exec_context_may_touch_console(&s->exec_context)) + return false; + + return IN_SET(s->state, + SERVICE_CONDITION, + SERVICE_START_PRE, + SERVICE_START, + SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, + SERVICE_STOP, + SERVICE_STOP_WATCHDOG, + SERVICE_STOP_SIGTERM, + SERVICE_STOP_SIGKILL, + SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, + SERVICE_FINAL_SIGTERM, + SERVICE_FINAL_SIGKILL); +} + +static int service_exit_status(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + if (s->main_exec_status.pid <= 0 || + !dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) + return -ENODATA; + + if (s->main_exec_status.code != CLD_EXITED) + return -EBADE; + + return s->main_exec_status.status; +} + +static int service_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Service *s = SERVICE(u); + int r; + + assert(s); + assert(mask != 0); + + if (s->state != SERVICE_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + service_unwatch_control_pid(s); + s->clean_result = SERVICE_SUCCESS; + s->control_command = NULL; + s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; + + r = 
service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->exec_context.timeout_clean_usec)); + if (r < 0) + goto fail; + + r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); + if (r < 0) + goto fail; + + service_set_state(s, SERVICE_CLEANING); + + return 0; + +fail: + log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m"); + s->clean_result = SERVICE_FAILURE_RESOURCES; + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + return r; +} + +static int service_can_clean(Unit *u, ExecCleanMask *ret) { + Service *s = SERVICE(u); + + assert(s); + + return exec_context_get_clean_mask(&s->exec_context, ret); +} + +static const char *service_finished_job(Unit *u, JobType t, JobResult result) { + if (t == JOB_START && result == JOB_DONE) { + Service *s = SERVICE(u); + + if (s->type == SERVICE_ONESHOT) + return "Finished %s."; + } + + /* Fall back to generic */ + return NULL; +} + +static const char* const service_restart_table[_SERVICE_RESTART_MAX] = { + [SERVICE_RESTART_NO] = "no", + [SERVICE_RESTART_ON_SUCCESS] = "on-success", + [SERVICE_RESTART_ON_FAILURE] = "on-failure", + [SERVICE_RESTART_ON_ABNORMAL] = "on-abnormal", + [SERVICE_RESTART_ON_WATCHDOG] = "on-watchdog", + [SERVICE_RESTART_ON_ABORT] = "on-abort", + [SERVICE_RESTART_ALWAYS] = "always", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart); + +static const char* const service_type_table[_SERVICE_TYPE_MAX] = { + [SERVICE_SIMPLE] = "simple", + [SERVICE_FORKING] = "forking", + [SERVICE_ONESHOT] = "oneshot", + [SERVICE_DBUS] = "dbus", + [SERVICE_NOTIFY] = "notify", + [SERVICE_IDLE] = "idle", + [SERVICE_EXEC] = "exec", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType); + +static const char* const service_exec_command_table[_SERVICE_EXEC_COMMAND_MAX] = { + [SERVICE_EXEC_CONDITION] = "ExecCondition", + [SERVICE_EXEC_START_PRE] = "ExecStartPre", + [SERVICE_EXEC_START] = "ExecStart", + [SERVICE_EXEC_START_POST] = "ExecStartPost", + [SERVICE_EXEC_RELOAD] = 
"ExecReload", + [SERVICE_EXEC_STOP] = "ExecStop", + [SERVICE_EXEC_STOP_POST] = "ExecStopPost", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_exec_command, ServiceExecCommand); + +static const char* const service_exec_ex_command_table[_SERVICE_EXEC_COMMAND_MAX] = { + [SERVICE_EXEC_CONDITION] = "ExecConditionEx", + [SERVICE_EXEC_START_PRE] = "ExecStartPreEx", + [SERVICE_EXEC_START] = "ExecStartEx", + [SERVICE_EXEC_START_POST] = "ExecStartPostEx", + [SERVICE_EXEC_RELOAD] = "ExecReloadEx", + [SERVICE_EXEC_STOP] = "ExecStopEx", + [SERVICE_EXEC_STOP_POST] = "ExecStopPostEx", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_exec_ex_command, ServiceExecCommand); + +static const char* const notify_state_table[_NOTIFY_STATE_MAX] = { + [NOTIFY_UNKNOWN] = "unknown", + [NOTIFY_READY] = "ready", + [NOTIFY_RELOADING] = "reloading", + [NOTIFY_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP(notify_state, NotifyState); + +static const char* const service_result_table[_SERVICE_RESULT_MAX] = { + [SERVICE_SUCCESS] = "success", + [SERVICE_FAILURE_RESOURCES] = "resources", + [SERVICE_FAILURE_PROTOCOL] = "protocol", + [SERVICE_FAILURE_TIMEOUT] = "timeout", + [SERVICE_FAILURE_EXIT_CODE] = "exit-code", + [SERVICE_FAILURE_SIGNAL] = "signal", + [SERVICE_FAILURE_CORE_DUMP] = "core-dump", + [SERVICE_FAILURE_WATCHDOG] = "watchdog", + [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [SERVICE_FAILURE_OOM_KILL] = "oom-kill", + [SERVICE_SKIP_CONDITION] = "exec-condition", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult); + +static const char* const service_timeout_failure_mode_table[_SERVICE_TIMEOUT_FAILURE_MODE_MAX] = { + [SERVICE_TIMEOUT_TERMINATE] = "terminate", + [SERVICE_TIMEOUT_ABORT] = "abort", + [SERVICE_TIMEOUT_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(service_timeout_failure_mode, ServiceTimeoutFailureMode); + +const UnitVTable service_vtable = { + .object_size = sizeof(Service), + .exec_context_offset = offsetof(Service, exec_context), + 
.cgroup_context_offset = offsetof(Service, cgroup_context), + .kill_context_offset = offsetof(Service, kill_context), + .exec_runtime_offset = offsetof(Service, exec_runtime), + .dynamic_creds_offset = offsetof(Service, dynamic_creds), + + .sections = + "Unit\0" + "Service\0" + "Install\0", + .private_section = "Service", + + .can_transient = true, + .can_delegate = true, + .can_fail = true, + .can_set_managed_oom = true, + + .init = service_init, + .done = service_done, + .load = service_load, + .release_resources = service_release_resources, + + .coldplug = service_coldplug, + + .dump = service_dump, + + .start = service_start, + .stop = service_stop, + .reload = service_reload, + + .can_reload = service_can_reload, + + .kill = service_kill, + .clean = service_clean, + .can_clean = service_can_clean, + + .freeze = unit_freeze_vtable_common, + .thaw = unit_thaw_vtable_common, + + .serialize = service_serialize, + .deserialize_item = service_deserialize_item, + + .active_state = service_active_state, + .sub_state_to_string = service_sub_state_to_string, + + .will_restart = service_will_restart, + + .may_gc = service_may_gc, + + .sigchld_event = service_sigchld_event, + + .reset_failed = service_reset_failed, + + .notify_cgroup_empty = service_notify_cgroup_empty_event, + .notify_cgroup_oom = service_notify_cgroup_oom_event, + .notify_message = service_notify_message, + + .main_pid = service_main_pid, + .control_pid = service_control_pid, + + .bus_name_owner_change = service_bus_name_owner_change, + + .bus_set_property = bus_service_set_property, + .bus_commit_properties = bus_service_commit_properties, + + .get_timeout = service_get_timeout, + .needs_console = service_needs_console, + .exit_status = service_exit_status, + + .status_message_formats = { + .starting_stopping = { + [0] = "Starting %s...", + [1] = "Stopping %s...", + }, + .finished_start_job = { + [JOB_FAILED] = "Failed to start %s.", + [JOB_SKIPPED] = "Skipped %s.", + }, + .finished_stop_job = { + 
[JOB_DONE] = "Stopped %s.", + [JOB_FAILED] = "Stopped (with error) %s.", + }, + .finished_job = service_finished_job, + }, +}; diff --git a/src/core/service.h b/src/core/service.h new file mode 100644 index 0000000..11c3d3f --- /dev/null +++ b/src/core/service.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Service Service; +typedef struct ServiceFDStore ServiceFDStore; + +#include "exit-status.h" +#include "kill.h" +#include "path.h" +#include "ratelimit.h" +#include "socket.h" +#include "unit.h" + +typedef enum ServiceRestart { + SERVICE_RESTART_NO, + SERVICE_RESTART_ON_SUCCESS, + SERVICE_RESTART_ON_FAILURE, + SERVICE_RESTART_ON_ABNORMAL, + SERVICE_RESTART_ON_WATCHDOG, + SERVICE_RESTART_ON_ABORT, + SERVICE_RESTART_ALWAYS, + _SERVICE_RESTART_MAX, + _SERVICE_RESTART_INVALID = -1 +} ServiceRestart; + +typedef enum ServiceType { + SERVICE_SIMPLE, /* we fork and go on right-away (i.e. modern socket activated daemons) */ + SERVICE_FORKING, /* forks by itself (i.e. traditional daemons) */ + SERVICE_ONESHOT, /* we fork and wait until the program finishes (i.e. programs like fsck which run and need to finish before we continue) */ + SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */ + SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */ + SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. 
/* Per-unit data of a service unit. The generic Unit object is embedded as the first member. */
struct Service {
        Unit meta;

        ServiceType type;
        ServiceRestart restart;
        ExitStatusSet restart_prevent_status;
        ExitStatusSet restart_force_status;
        ExitStatusSet success_status;

        /* If set we'll read the main daemon PID from this file */
        char *pid_file;

        usec_t restart_usec;
        usec_t timeout_start_usec;
        usec_t timeout_stop_usec;
        /* Only honoured if timeout_abort_set is true; otherwise service_timeout_abort_usec() falls back
         * to timeout_stop_usec. */
        usec_t timeout_abort_usec;
        bool timeout_abort_set;
        usec_t runtime_max_usec;
        ServiceTimeoutFailureMode timeout_start_failure_mode;
        ServiceTimeoutFailureMode timeout_stop_failure_mode;

        /* service_get_watchdog_usec() returns watchdog_override_usec if watchdog_override_enable is
         * set, and watchdog_original_usec otherwise. */
        dual_timestamp watchdog_timestamp;
        usec_t watchdog_usec;    /* the requested watchdog timeout in the unit file */
        usec_t watchdog_original_usec; /* the watchdog timeout that was in effect when the unit was started, i.e. the timeout the forked off processes currently see */
        usec_t watchdog_override_usec; /* the watchdog timeout requested by the service itself through sd_notify() */
        bool watchdog_override_enable;
        sd_event_source *watchdog_event_source;

        /* One command list per ServiceExecCommand value */
        ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX];

        ExecContext exec_context;
        KillContext kill_context;
        CGroupContext cgroup_context;

        ServiceState state, deserialized_state;

        /* The exit status of the real main process */
        ExecStatus main_exec_status;

        /* The currently executed control process */
        ExecCommand *control_command;

        /* The currently executed main process, which may be NULL if
         * the main process got started via forking mode and not by
         * us */
        ExecCommand *main_command;

        /* The ID of the control command currently being executed */
        ServiceExecCommand control_command_id;

        /* Runtime data of the execution context */
        ExecRuntime *exec_runtime;
        DynamicCreds dynamic_creds;

        pid_t main_pid, control_pid;
        int socket_fd;
        SocketPeer *peer;
        bool socket_fd_selinux_context_net;

        bool permissions_start_only;
        bool root_directory_start_only;
        bool remain_after_exit;
        bool guess_main_pid;

        /* If we shut down, remember why */
        ServiceResult result;
        ServiceResult reload_result;
        ServiceResult clean_result;

        bool main_pid_known:1;
        bool main_pid_alien:1;
        bool bus_name_good:1;
        bool forbid_restart:1;
        /* Keep restart intention between UNIT_FAILED and UNIT_ACTIVATING */
        bool will_auto_restart:1;
        bool start_timeout_defined:1;
        bool exec_fd_hot:1;

        char *bus_name;
        char *bus_name_owner; /* unique name of the current owner */

        char *status_text;
        int status_errno;

        UnitRef accept_socket;

        sd_event_source *timer_event_source;
        PathSpec *pid_file_pathspec;

        NotifyAccess notify_access;
        NotifyState notify_state;

        sd_event_source *exec_fd_event_source;

        /* Linked list of ServiceFDStore entries (see LIST_FIELDS() in that struct) */
        ServiceFDStore *fd_store;
        size_t n_fd_store;
        unsigned n_fd_store_max;
        unsigned n_keep_fd_store;

        char *usb_function_descriptors;
        char *usb_function_strings;

        int stdin_fd;
        int stdout_fd;
        int stderr_fd;

        unsigned n_restarts;
        bool flush_n_restarts;

        OOMPolicy oom_policy;
};
service_result_from_string(const char *s) _pure_; + +const char* service_timeout_failure_mode_to_string(ServiceTimeoutFailureMode i) _const_; +ServiceTimeoutFailureMode service_timeout_failure_mode_from_string(const char *s) _pure_; + +DEFINE_CAST(SERVICE, Service); + +#define STATUS_TEXT_MAX (16U*1024U) diff --git a/src/core/show-status.c b/src/core/show-status.c new file mode 100644 index 0000000..a74423c --- /dev/null +++ b/src/core/show-status.c @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "parse-util.h" +#include "show-status.h" +#include "string-table.h" +#include "string-util.h" +#include "terminal-util.h" +#include "util.h" + +static const char* const show_status_table[_SHOW_STATUS_MAX] = { + [SHOW_STATUS_NO] = "no", + [SHOW_STATUS_ERROR] = "error", + [SHOW_STATUS_AUTO] = "auto", + [SHOW_STATUS_TEMPORARY] = "temporary", + [SHOW_STATUS_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(show_status, ShowStatus, SHOW_STATUS_YES); + +int parse_show_status(const char *v, ShowStatus *ret) { + ShowStatus s; + + assert(ret); + + s = show_status_from_string(v); + if (s < 0 || s == SHOW_STATUS_TEMPORARY) + return -EINVAL; + + *ret = s; + return 0; +} + +int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) { + static const char status_indent[] = " "; /* "[" STATUS "] " */ + _cleanup_free_ char *s = NULL; + _cleanup_close_ int fd = -1; + struct iovec iovec[7] = {}; + int n = 0; + static bool prev_ephemeral; + + assert(format); + + /* This is independent of logging, as status messages are + * optional and go exclusively to the console. */ + + if (vasprintf(&s, format, ap) < 0) + return log_oom(); + + /* Before you ask: yes, on purpose we open/close the console for each status line we write individually. 
This + * is a good strategy to avoid PID 1 getting killed by the kernel's SAK concept (it doesn't fix this entirely, + * but minimizes the time window the kernel might end up killing PID 1 due to SAK). It also makes things easier + * for us so that we don't have to recover from hangups and suchlike triggered on the console. */ + + fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return fd; + + if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) { + char *e; + size_t emax, sl; + int c; + + c = fd_columns(fd); + if (c <= 0) + c = 80; + + sl = status ? sizeof(status_indent)-1 : 0; + + emax = c - sl - 1; + if (emax < 3) + emax = 3; + + e = ellipsize(s, emax, 50); + if (e) + free_and_replace(s, e); + } + + if (prev_ephemeral) + iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE); + + if (status) { + if (!isempty(status)) { + iovec[n++] = IOVEC_MAKE_STRING("["); + iovec[n++] = IOVEC_MAKE_STRING(status); + iovec[n++] = IOVEC_MAKE_STRING("] "); + } else + iovec[n++] = IOVEC_MAKE_STRING(status_indent); + } + + iovec[n++] = IOVEC_MAKE_STRING(s); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL)) + iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE); + prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL) ; + + if (writev(fd, iovec, n) < 0) + return -errno; + + return 0; +} + +int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) 
{ + va_list ap; + int r; + + assert(format); + + va_start(ap, format); + r = status_vprintf(status, flags, format, ap); + va_end(ap); + + return r; +} + +static const char* const status_unit_format_table[_STATUS_UNIT_FORMAT_MAX] = { + [STATUS_UNIT_FORMAT_NAME] = "name", + [STATUS_UNIT_FORMAT_DESCRIPTION] = "description", +}; + +DEFINE_STRING_TABLE_LOOKUP(status_unit_format, StatusUnitFormat); diff --git a/src/core/show-status.h b/src/core/show-status.h new file mode 100644 index 0000000..c37ccd9 --- /dev/null +++ b/src/core/show-status.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "macro.h" + +/* Manager status */ + +typedef enum ShowStatus { + SHOW_STATUS_NO, /* printing of status is disabled */ + SHOW_STATUS_ERROR, /* only print errors */ + SHOW_STATUS_AUTO, /* disabled but may flip to _TEMPORARY */ + SHOW_STATUS_TEMPORARY, /* enabled temporarily, may flip back to _AUTO */ + SHOW_STATUS_YES, /* printing of status is enabled */ + _SHOW_STATUS_MAX, + _SHOW_STATUS_INVALID = -1, +} ShowStatus; + +typedef enum ShowStatusFlags { + SHOW_STATUS_ELLIPSIZE = 1 << 0, + SHOW_STATUS_EPHEMERAL = 1 << 1, +} ShowStatusFlags; + +typedef enum StatusUnitFormat { + STATUS_UNIT_FORMAT_NAME, + STATUS_UNIT_FORMAT_DESCRIPTION, + _STATUS_UNIT_FORMAT_MAX, + _STATUS_UNIT_FORMAT_INVALID = -1, +} StatusUnitFormat; + +static inline bool show_status_on(ShowStatus s) { + return IN_SET(s, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES); +} +ShowStatus show_status_from_string(const char *v) _const_; +const char* show_status_to_string(ShowStatus s) _pure_; +int parse_show_status(const char *v, ShowStatus *ret); + +StatusUnitFormat status_unit_format_from_string(const char *v) _const_; +const char* status_unit_format_to_string(StatusUnitFormat s) _pure_; + +int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) _printf_(3,0); +int status_printf(const char *status, ShowStatusFlags flags, const 
char *format, ...) _printf_(3,4); diff --git a/src/core/slice.c b/src/core/slice.c new file mode 100644 index 0000000..ee5c259 --- /dev/null +++ b/src/core/slice.c @@ -0,0 +1,475 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> + +#include "alloc-util.h" +#include "dbus-slice.h" +#include "dbus-unit.h" +#include "fd-util.h" +#include "log.h" +#include "serialize.h" +#include "slice.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = { + [SLICE_DEAD] = UNIT_INACTIVE, + [SLICE_ACTIVE] = UNIT_ACTIVE +}; + +static void slice_init(Unit *u) { + assert(u); + assert(u->load_state == UNIT_STUB); + + u->ignore_on_isolate = true; +} + +static void slice_set_state(Slice *t, SliceState state) { + SliceState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != old_state) + log_debug("%s changed %s -> %s", + UNIT(t)->id, + slice_state_to_string(old_state), + slice_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int slice_add_parent_slice(Slice *s) { + Unit *u = UNIT(s), *parent; + _cleanup_free_ char *a = NULL; + int r; + + assert(s); + + if (UNIT_ISSET(u->slice)) + return 0; + + r = slice_build_parent_slice(u->id, &a); + if (r <= 0) /* 0 means root slice */ + return r; + + r = manager_load_unit(u->manager, a, NULL, NULL, &parent); + if (r < 0) + return r; + + unit_ref_set(&u->slice, u, parent); + return 0; +} + +static int slice_add_default_dependencies(Slice *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + /* Make sure slices are unloaded on shutdown */ + r = unit_add_two_dependencies_by_name( + UNIT(s), + UNIT_BEFORE, UNIT_CONFLICTS, + SPECIAL_SHUTDOWN_TARGET, true, 
UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return 0; +} + +static int slice_verify(Slice *s) { + _cleanup_free_ char *parent = NULL; + int r; + + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + if (!slice_name_is_valid(UNIT(s)->id)) { + log_unit_error(UNIT(s), "Slice name %s is not valid. Refusing.", UNIT(s)->id); + return -ENOEXEC; + } + + r = slice_build_parent_slice(UNIT(s)->id, &parent); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to determine parent slice: %m"); + + if (parent ? !unit_has_name(UNIT_DEREF(UNIT(s)->slice), parent) : UNIT_ISSET(UNIT(s)->slice)) { + log_unit_error(UNIT(s), "Located outside of parent slice. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int slice_load_root_slice(Unit *u) { + assert(u); + + if (!unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + u->perpetual = true; + + /* The root slice is a bit special. For example it is always running and cannot be terminated. Because of its + * special semantics we synthesize it here, instead of relying on the unit file on disk. */ + + u->default_dependencies = false; + + if (!u->description) + u->description = strdup("Root Slice"); + if (!u->documentation) + u->documentation = strv_new("man:systemd.special(7)"); + + return 1; +} + +static int slice_load_system_slice(Unit *u) { + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return 0; + if (!unit_has_name(u, SPECIAL_SYSTEM_SLICE)) + return 0; + + u->perpetual = true; + + /* The system slice is a bit special. For example it is always running and cannot be terminated. Because of its + * special semantics we synthesize it here, instead of relying on the unit file on disk. 
*/ + + u->default_dependencies = false; + + if (!u->description) + u->description = strdup("System Slice"); + if (!u->documentation) + u->documentation = strv_new("man:systemd.special(7)"); + + return 1; +} + +static int slice_load(Unit *u) { + Slice *s = SLICE(u); + int r; + + assert(s); + assert(u->load_state == UNIT_STUB); + + r = slice_load_root_slice(u); + if (r < 0) + return r; + r = slice_load_system_slice(u); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin(u, false); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = unit_patch_contexts(u); + if (r < 0) + return r; + + r = slice_add_parent_slice(s); + if (r < 0) + return r; + + r = slice_add_default_dependencies(s); + if (r < 0) + return r; + + return slice_verify(s); +} + +static int slice_coldplug(Unit *u) { + Slice *t = SLICE(u); + + assert(t); + assert(t->state == SLICE_DEAD); + + if (t->deserialized_state != t->state) + slice_set_state(t, t->deserialized_state); + + return 0; +} + +static void slice_dump(Unit *u, FILE *f, const char *prefix) { + Slice *t = SLICE(u); + + assert(t); + assert(f); + + fprintf(f, + "%sSlice State: %s\n", + prefix, slice_state_to_string(t->state)); + + cgroup_context_dump(UNIT(t), f, prefix); +} + +static int slice_start(Unit *u) { + Slice *t = SLICE(u); + int r; + + assert(t); + assert(t->state == SLICE_DEAD); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + (void) unit_realize_cgroup(u); + (void) unit_reset_accounting(u); + + slice_set_state(t, SLICE_ACTIVE); + return 1; +} + +static int slice_stop(Unit *u) { + Slice *t = SLICE(u); + + assert(t); + assert(t->state == SLICE_ACTIVE); + + /* We do not need to destroy the cgroup explicitly, + * unit_notify() will do that for us anyway. 
*/ + + slice_set_state(t, SLICE_DEAD); + return 1; +} + +static int slice_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, -1, error); +} + +static int slice_serialize(Unit *u, FILE *f, FDSet *fds) { + Slice *s = SLICE(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", slice_state_to_string(s->state)); + + return 0; +} + +static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Slice *s = SLICE(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + SliceState state; + + state = slice_state_from_string(value); + if (state < 0) + log_debug("Failed to parse state value %s", value); + else + s->deserialized_state = state; + + } else + log_debug("Unknown serialization key '%s'", key); + + return 0; +} + +_pure_ static UnitActiveState slice_active_state(Unit *u) { + assert(u); + + return state_translation_table[SLICE(u)->state]; +} + +_pure_ static const char *slice_sub_state_to_string(Unit *u) { + assert(u); + + return slice_state_to_string(SLICE(u)->state); +} + +static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) { + Unit *u; + int r; + + assert(m); + assert(name); + + u = manager_get_unit(m, name); + if (!u) { + r = unit_new_for_name(m, sizeof(Slice), name, &u); + if (r < 0) + return log_error_errno(r, "Failed to allocate the special %s unit: %m", name); + } + + u->perpetual = true; + SLICE(u)->deserialized_state = SLICE_ACTIVE; + + unit_add_to_load_queue(u); + unit_add_to_dbus_queue(u); + + if (ret) + *ret = u; + + return 0; +} + +static void slice_enumerate_perpetual(Manager *m) { + Unit *u; + int r; + + assert(m); + + r = slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &u); + if (r >= 0 && manager_owns_host_root_cgroup(m)) { + Slice *s = SLICE(u); + + /* If we are managing the root cgroup then this means our root slice covers the whole system, which + * means the kernel will 
track CPU/tasks/memory for us anyway, and it is all available in /proc. Let's + * hence turn accounting on here, so that our APIs to query this data are available. */ + + s->cgroup_context.cpu_accounting = true; + s->cgroup_context.tasks_accounting = true; + s->cgroup_context.memory_accounting = true; + } + + if (MANAGER_IS_SYSTEM(m)) + (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL); +} + +static bool slice_freezer_action_supported_by_children(Unit *s) { + Unit *member; + void *v; + + assert(s); + + HASHMAP_FOREACH_KEY(v, member, s->dependencies[UNIT_BEFORE]) { + int r; + + if (UNIT_DEREF(member->slice) != s) + continue; + + if (member->type == UNIT_SLICE) { + r = slice_freezer_action_supported_by_children(member); + if (!r) + return r; + } + + if (!UNIT_VTABLE(member)->freeze) + return false; + } + + return true; +} + +static int slice_freezer_action(Unit *s, FreezerAction action) { + Unit *member; + void *v; + int r; + + assert(s); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + if (!slice_freezer_action_supported_by_children(s)) { + log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice"); + return 0; + } + + HASHMAP_FOREACH_KEY(v, member, s->dependencies[UNIT_BEFORE]) { + if (UNIT_DEREF(member->slice) != s) + continue; + + if (action == FREEZER_FREEZE) + r = UNIT_VTABLE(member)->freeze(member); + else + r = UNIT_VTABLE(member)->thaw(member); + + if (r < 0) + return r; + } + + r = unit_cgroup_freezer_action(s, action); + if (r < 0) + return r; + + return 1; +} + +static int slice_freeze(Unit *s) { + assert(s); + + return slice_freezer_action(s, FREEZER_FREEZE); +} + +static int slice_thaw(Unit *s) { + assert(s); + + return slice_freezer_action(s, FREEZER_THAW); +} + +static bool slice_can_freeze(Unit *s) { + assert(s); + + return slice_freezer_action_supported_by_children(s); +} + +const UnitVTable slice_vtable = { + .object_size = sizeof(Slice), + .cgroup_context_offset = offsetof(Slice, 
/* Per-unit data of a slice unit. */
struct Slice {
        Unit meta; /* the generic unit object, embedded as first member */

        /* state is the current state and is changed through slice_set_state() in slice.c.
         * deserialized_state is filled in by slice_deserialize_item() and applied by slice_coldplug(). */
        SliceState state, deserialized_state;

        /* slice_vtable locates this member through offsetof(Slice, cgroup_context) */
        CGroupContext cgroup_context;
};
+#if ENABLE_SMACK + +static int fdopen_unlocked_at(int dfd, const char *dir, const char *name, int *status, FILE **ret_file) { + int fd, r; + FILE *f; + + fd = openat(dfd, name, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (*status == 0) + *status = -errno; + + return log_warning_errno(errno, "Failed to open \"%s/%s\": %m", dir, name); + } + + r = fdopen_unlocked(fd, "r", &f); + if (r < 0) { + if (*status == 0) + *status = r; + + safe_close(fd); + return log_error_errno(r, "Failed to open \"%s/%s\": %m", dir, name); + } + + *ret_file = f; + return 0; +} + +static int write_access2_rules(const char *srcdir) { + _cleanup_close_ int load2_fd = -1, change_fd = -1; + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *entry; + int dfd = -1, r = 0; + + load2_fd = open("/sys/fs/smackfs/load2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (load2_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/load2': %m"); + return -errno; /* negative error */ + } + + change_fd = open("/sys/fs/smackfs/change-rule", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (change_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/change-rule': %m"); + return -errno; /* negative error */ + } + + /* write rules to load2 or change-rule from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + _cleanup_fclose_ FILE *policy = NULL; + + dirent_ensure_type(dir, entry); + if (!dirent_is_file(entry)) + continue; + + if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0) + continue; + + /* load2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL, *sbj = NULL, *obj = NULL, *acc1 = NULL, *acc2 = NULL; + int q; + + q = 
read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name); + if (q == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, buf[0])) + continue; + + /* if 3 args -> load rule : subject object access1 */ + /* if 4 args -> change rule : subject object access1 access2 */ + if (sscanf(buf, "%ms %ms %ms %ms", &sbj, &obj, &acc1, &acc2) < 3) { + log_error_errno(errno, "Failed to parse rule '%s' in '%s', ignoring.", buf, entry->d_name); + continue; + } + + if (write(isempty(acc2) ? load2_fd : change_fd, buf, strlen(buf)) < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to write '%s' to '%s' in '%s': %m", + buf, isempty(acc2) ? "/sys/fs/smackfs/load2" : "/sys/fs/smackfs/change-rule", entry->d_name); + } + } + } + + return r; +} + +static int write_cipso2_rules(const char *srcdir) { + _cleanup_close_ int cipso2_fd = -1; + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *entry; + int dfd = -1, r = 0; + + cipso2_fd = open("/sys/fs/smackfs/cipso2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (cipso2_fd < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/cipso2': %m"); + return -errno; /* negative error */ + } + + /* write rules to cipso2 from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + _cleanup_fclose_ FILE *policy = NULL; + + dirent_ensure_type(dir, entry); + if (!dirent_is_file(entry)) + continue; + + if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0) + continue; + + /* cipso2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to 
read line from '%s': %m", entry->d_name); + if (q == 0) + break; + + if (isempty(buf) || strchr(COMMENTS, buf[0])) + continue; + + if (write(cipso2_fd, buf, strlen(buf)) < 0) { + if (r == 0) + r = -errno; + log_error_errno(errno, "Failed to write '%s' to '/sys/fs/smackfs/cipso2' in '%s': %m", + buf, entry->d_name); + break; + } + } + } + + return r; +} + +static int write_netlabel_rules(const char *srcdir) { + _cleanup_fclose_ FILE *dst = NULL; + _cleanup_closedir_ DIR *dir = NULL; + struct dirent *entry; + int dfd = -1, r = 0; + + dst = fopen("/sys/fs/smackfs/netlabel", "we"); + if (!dst) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open /sys/fs/smackfs/netlabel: %m"); + return -errno; /* negative error */ + } + + /* write rules to dst from every file in the directory */ + dir = opendir(srcdir); + if (!dir) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to opendir %s: %m", srcdir); + return errno; /* positive on purpose */ + } + + dfd = dirfd(dir); + assert(dfd >= 0); + + FOREACH_DIRENT(entry, dir, return 0) { + _cleanup_fclose_ FILE *policy = NULL; + + if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0) + continue; + + /* load2 write rules in the kernel require a line buffered stream */ + for (;;) { + _cleanup_free_ char *buf = NULL; + int q; + + q = read_line(policy, NAME_MAX, &buf); + if (q < 0) + return log_error_errno(q, "Failed to read line from %s: %m", entry->d_name); + if (q == 0) + break; + + if (!fputs(buf, dst)) { + if (r == 0) + r = -EINVAL; + log_error_errno(errno, "Failed to write line to /sys/fs/smackfs/netlabel: %m"); + break; + } + q = fflush_and_check(dst); + if (q < 0) { + if (r == 0) + r = q; + log_error_errno(q, "Failed to flush writes to /sys/fs/smackfs/netlabel: %m"); + break; + } + } + } + + return r; +} + +static int write_onlycap_list(void) { + _cleanup_close_ int onlycap_fd = -1; + _cleanup_free_ char *list = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t len = 0, allocated = 0; + int 
/* Loads the Smack configuration below /etc/smack into the kernel: access rules, optionally the run
 * label, CIPSO rules, network host rules and finally the onlycap list.
 *
 * The loaders called here return 0 on success, -ENOENT if the smackfs file they write to does not
 * exist and a positive ENOENT if their source directory or file does not exist; the switch
 * statements below tell these cases apart.
 *
 * *loaded_policy is set to true only if the function runs through to the end; it is left untouched on
 * every early return. The return value is 0 except when writing the onlycap list fails, in which
 * case that error is returned. Without ENABLE_SMACK the function does nothing and returns 0. */
int mac_smack_setup(bool *loaded_policy) {

#if ENABLE_SMACK

        int r;

        assert(loaded_policy);

        /* Access rules come first. Anything but success here ends the setup without an error. */
        r = write_access2_rules("/etc/smack/accesses.d/");
        switch(r) {
        case -ENOENT:
                log_debug("Smack is not enabled in the kernel.");
                return 0;
        case ENOENT:
                log_debug("Smack access rules directory '/etc/smack/accesses.d/' not found");
                return 0;
        case 0:
                log_info("Successfully loaded Smack policies.");
                break;
        default:
                log_warning_errno(r, "Failed to load Smack access rules, ignoring: %m");
                return 0;
        }

#ifdef SMACK_RUN_LABEL
        /* Each of these writes is best effort: a failure is logged and the setup continues */
        r = write_string_file("/proc/self/attr/current", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                log_warning_errno(r, "Failed to set SMACK label \"" SMACK_RUN_LABEL "\" on self: %m");
        r = write_string_file("/sys/fs/smackfs/ambient", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                log_warning_errno(r, "Failed to set SMACK ambient label \"" SMACK_RUN_LABEL "\": %m");
        r = write_string_file("/sys/fs/smackfs/netlabel",
                              "0.0.0.0/0 " SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                log_warning_errno(r, "Failed to set SMACK netlabel rule \"0.0.0.0/0 " SMACK_RUN_LABEL "\": %m");
        r = write_string_file("/sys/fs/smackfs/netlabel", "127.0.0.1 -CIPSO", WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                log_warning_errno(r, "Failed to set SMACK netlabel rule \"127.0.0.1 -CIPSO\": %m");
#endif

        /* From here on only a missing smackfs file ends the setup early; a missing source directory
         * or a load failure is logged and the next step still runs. */
        r = write_cipso2_rules("/etc/smack/cipso.d/");
        switch(r) {
        case -ENOENT:
                log_debug("Smack/CIPSO is not enabled in the kernel.");
                return 0;
        case ENOENT:
                log_debug("Smack/CIPSO access rules directory '/etc/smack/cipso.d/' not found");
                break;
        case 0:
                log_info("Successfully loaded Smack/CIPSO policies.");
                break;
        default:
                log_warning_errno(r, "Failed to load Smack/CIPSO access rules, ignoring: %m");
                break;
        }

        r = write_netlabel_rules("/etc/smack/netlabel.d/");
        switch(r) {
        case -ENOENT:
                log_debug("Smack/CIPSO is not enabled in the kernel.");
                return 0;
        case ENOENT:
                log_debug("Smack network host rules directory '/etc/smack/netlabel.d/' not found");
                break;
        case 0:
                log_info("Successfully loaded Smack network host rules.");
                break;
        default:
                log_warning_errno(r, "Failed to load Smack network host rules: %m, ignoring.");
                break;
        }

        /* The onlycap list is the only step whose failure is passed on to the caller */
        r = write_onlycap_list();
        switch(r) {
        case -ENOENT:
                log_debug("Smack is not enabled in the kernel.");
                break;
        case ENOENT:
                log_debug("Smack onlycap list file '/etc/smack/onlycap' not found");
                break;
        case 0:
                log_info("Successfully wrote Smack onlycap list.");
                break;
        default:
                log_emergency_errno(r, "Failed to write Smack onlycap list: %m");
                return r;
        }

        *loaded_policy = true;

#endif

        return 0;
}
/* An entry of a Socket's peers_by_address set. */
struct SocketPeer {
        unsigned n_ref; /* presumably a reference counter; the code managing it is not in view here */

        /* The Socket this entry belongs to. socket_done() resets it to NULL for every entry it takes
         * out of peers_by_address, so it may be NULL. */
        Socket *socket;
        union sockaddr_union peer; /* presumably the address of the remote side; confirm */
        socklen_t peer_salen;      /* presumably the number of valid bytes in peer; confirm */
};
UNIT_DEACTIVATING, + [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_STOP_POST] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_FAILED] = UNIT_FAILED, + [SOCKET_CLEANING] = UNIT_MAINTENANCE, +}; + +static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static void flush_ports(Socket *s); + +static void socket_init(Unit *u) { + Socket *s = SOCKET(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + s->backlog = SOMAXCONN; + s->timeout_usec = u->manager->default_timeout_start_usec; + s->directory_mode = 0755; + s->socket_mode = 0666; + + s->max_connections = 64; + + s->priority = -1; + s->ip_tos = -1; + s->ip_ttl = -1; + s->mark = -1; + + s->exec_context.std_output = u->manager->default_std_output; + s->exec_context.std_error = u->manager->default_std_error; + + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + s->trigger_limit.interval = USEC_INFINITY; + s->trigger_limit.burst = (unsigned) -1; +} + +static void socket_unwatch_control_pid(Socket *s) { + assert(s); + + if (s->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->control_pid); + s->control_pid = 0; +} + +static void socket_cleanup_fd_list(SocketPort *p) { + assert(p); + + close_many(p->auxiliary_fds, p->n_auxiliary_fds); + p->auxiliary_fds = mfree(p->auxiliary_fds); + p->n_auxiliary_fds = 0; +} + +void socket_free_ports(Socket *s) { + SocketPort *p; + + assert(s); + + while ((p = s->ports)) { + LIST_REMOVE(port, s->ports, p); + + sd_event_source_unref(p->event_source); + + socket_cleanup_fd_list(p); + safe_close(p->fd); + free(p->path); + free(p); + } +} + +static void socket_done(Unit *u) { + Socket *s = SOCKET(u); + SocketPeer *p; + + assert(s); + + socket_free_ports(s); + + while ((p = set_steal_first(s->peers_by_address))) + p->socket = NULL; + + 
s->peers_by_address = set_free(s->peers_by_address); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, false); + exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); + s->control_command = NULL; + + dynamic_creds_unref(&s->dynamic_creds); + + socket_unwatch_control_pid(s); + + unit_ref_unset(&s->service); + + s->tcp_congestion = mfree(s->tcp_congestion); + s->bind_to_device = mfree(s->bind_to_device); + + s->smack = mfree(s->smack); + s->smack_ip_in = mfree(s->smack_ip_in); + s->smack_ip_out = mfree(s->smack_ip_out); + + strv_free(s->symlinks); + + s->user = mfree(s->user); + s->group = mfree(s->group); + + s->fdname = mfree(s->fdname); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); +} + +static int socket_arm_timer(Socket *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + socket_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "socket-timer"); + + return 0; +} + +static bool have_non_accept_socket(Socket *s) { + SocketPort *p; + + assert(s); + + if (!s->accept) + return true; + + LIST_FOREACH(port, p, s->ports) { + + if (p->type != SOCKET_SOCKET) + return true; + + if (!socket_address_can_accept(&p->address)) + return true; + } + + return false; +} + +static int socket_add_mount_dependencies(Socket *s) { + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + const char *path = NULL; + + if (p->type == SOCKET_SOCKET) + path = socket_address_get_path(&p->address); + else if (IN_SET(p->type, SOCKET_FIFO, SOCKET_SPECIAL, SOCKET_USB_FUNCTION)) + path = p->path; + + if (!path) + continue; + 
+ r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + return 0; +} + +static int socket_add_device_dependencies(Socket *s) { + char *t; + + assert(s); + + if (!s->bind_to_device || streq(s->bind_to_device, "lo")) + return 0; + + t = strjoina("/sys/subsystem/net/devices/", s->bind_to_device); + return unit_add_node_dependency(UNIT(s), t, UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE); +} + +static int socket_add_default_dependencies(Socket *s) { + int r; + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SOCKETS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + } + + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +_pure_ static bool socket_has_exec(Socket *s) { + unsigned i; + assert(s); + + for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++) + if (s->exec_command[i]) + return true; + + return false; +} + +static int socket_add_extras(Socket *s) { + Unit *u = UNIT(s); + int r; + + assert(s); + + /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit + * in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept() + * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly + * and reliably. This is different for Accept=no, where the spawned service has to take the incoming traffic + * off the queues, which it might not necessarily do. Moreover, while Accept=no services are supposed to + * process whatever is queued in one go, and thus should normally never have to be started frequently. 
This is + * different for Accept=yes where each connection is processed by a new service instance, and thus frequent + * service starts are typical. */ + + if (s->trigger_limit.interval == USEC_INFINITY) + s->trigger_limit.interval = 2 * USEC_PER_SEC; + + if (s->trigger_limit.burst == (unsigned) -1) { + if (s->accept) + s->trigger_limit.burst = 200; + else + s->trigger_limit.burst = 20; + } + + if (have_non_accept_socket(s)) { + + if (!UNIT_DEREF(s->service)) { + Unit *x; + + r = unit_load_related_unit(u, ".service", &x); + if (r < 0) + return r; + + unit_ref_set(&s->service, u, x); + } + + r = unit_add_two_dependencies(u, UNIT_BEFORE, UNIT_TRIGGERS, UNIT_DEREF(s->service), true, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + } + + r = socket_add_mount_dependencies(s); + if (r < 0) + return r; + + r = socket_add_device_dependencies(s); + if (r < 0) + return r; + + r = unit_patch_contexts(u); + if (r < 0) + return r; + + if (socket_has_exec(s)) { + r = unit_add_exec_dependencies(u, &s->exec_context); + if (r < 0) + return r; + } + + r = unit_set_default_slice(u); + if (r < 0) + return r; + + r = socket_add_default_dependencies(s); + if (r < 0) + return r; + + return 0; +} + +static const char *socket_find_symlink_target(Socket *s) { + const char *found = NULL; + SocketPort *p; + + LIST_FOREACH(port, p, s->ports) { + const char *f = NULL; + + switch (p->type) { + + case SOCKET_FIFO: + f = p->path; + break; + + case SOCKET_SOCKET: + f = socket_address_get_path(&p->address); + break; + + default: + break; + } + + if (f) { + if (found) + return NULL; + + found = f; + } + } + + return found; +} + +static int socket_verify(Socket *s) { + assert(s); + assert(UNIT(s)->load_state == UNIT_LOADED); + + if (!s->ports) { + log_unit_error(UNIT(s), "Unit has no Listen setting (ListenStream=, ListenDatagram=, ListenFIFO=, ...). 
Refusing."); + return -ENOEXEC; + } + + if (s->accept && have_non_accept_socket(s)) { + log_unit_error(UNIT(s), "Unit configured for accepting sockets, but sockets are non-accepting. Refusing."); + return -ENOEXEC; + } + + if (s->accept && s->max_connections <= 0) { + log_unit_error(UNIT(s), "MaxConnection= setting too small. Refusing."); + return -ENOEXEC; + } + + if (s->accept && UNIT_DEREF(s->service)) { + log_unit_error(UNIT(s), "Explicit service configuration for accepting socket units not supported. Refusing."); + return -ENOEXEC; + } + + if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP) { + log_unit_error(UNIT(s), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing."); + return -ENOEXEC; + } + + if (!strv_isempty(s->symlinks) && !socket_find_symlink_target(s)) { + log_unit_error(UNIT(s), "Unit has symlinks set but none or more than one node in the file system. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) { + assert(s); + + if (s->peer.sa.sa_family == AF_INET) + siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state); + else if (s->peer.sa.sa_family == AF_INET6) + siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state); + else if (s->peer.sa.sa_family == AF_VSOCK) + siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state); + else + assert_not_reached("Unknown address family."); +} + +static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) { + int r; + + r = CMP(x->peer.sa.sa_family, y->peer.sa.sa_family); + if (r != 0) + return r; + + switch(x->peer.sa.sa_family) { + case AF_INET: + return memcmp(&x->peer.in.sin_addr, &y->peer.in.sin_addr, sizeof(x->peer.in.sin_addr)); + case AF_INET6: + return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr)); + case AF_VSOCK: + return CMP(x->peer.vm.svm_cid, 
y->peer.vm.svm_cid); + } + assert_not_reached("Black sheep in the family!"); +} + +DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func); + +static int socket_load(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(u); + assert(u->load_state == UNIT_STUB); + + r = set_ensure_allocated(&s->peers_by_address, &peer_address_hash_ops); + if (r < 0) + return r; + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = socket_add_extras(s); + if (r < 0) + return r; + + return socket_verify(s); +} + +static SocketPeer *socket_peer_new(void) { + SocketPeer *p; + + p = new0(SocketPeer, 1); + if (!p) + return NULL; + + p->n_ref = 1; + + return p; +} + +static SocketPeer *socket_peer_free(SocketPeer *p) { + assert(p); + + if (p->socket) + set_remove(p->socket->peers_by_address, p); + + return mfree(p); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free); + +int socket_acquire_peer(Socket *s, int fd, SocketPeer **p) { + _cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL; + SocketPeer sa = {}, *i; + socklen_t salen = sizeof(sa.peer); + int r; + + assert(fd >= 0); + assert(s); + + if (getpeername(fd, &sa.peer.sa, &salen) < 0) + return log_unit_error_errno(UNIT(s), errno, "getpeername failed: %m"); + + if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + *p = NULL; + return 0; + } + + i = set_get(s->peers_by_address, &sa); + if (i) { + *p = socket_peer_ref(i); + return 1; + } + + remote = socket_peer_new(); + if (!remote) + return log_oom(); + + remote->peer = sa.peer; + remote->peer_salen = salen; + + r = set_put(s->peers_by_address, remote); + if (r < 0) + return r; + + remote->socket = s; + + *p = TAKE_PTR(remote); + + return 1; +} + +_const_ static const char* listen_lookup(int family, int type) { + + if (family == AF_NETLINK) + return 
"ListenNetlink"; + + if (type == SOCK_STREAM) + return "ListenStream"; + else if (type == SOCK_DGRAM) + return "ListenDatagram"; + else if (type == SOCK_SEQPACKET) + return "ListenSequentialPacket"; + + assert_not_reached("Unknown socket type"); + return NULL; +} + +static void socket_dump(Unit *u, FILE *f, const char *prefix) { + char time_string[FORMAT_TIMESPAN_MAX]; + SocketExecCommand c; + Socket *s = SOCKET(u); + SocketPort *p; + const char *prefix2, *str; + + assert(s); + assert(f); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%sSocket State: %s\n" + "%sResult: %s\n" + "%sClean Result: %s\n" + "%sBindIPv6Only: %s\n" + "%sBacklog: %u\n" + "%sSocketMode: %04o\n" + "%sDirectoryMode: %04o\n" + "%sKeepAlive: %s\n" + "%sNoDelay: %s\n" + "%sFreeBind: %s\n" + "%sTransparent: %s\n" + "%sBroadcast: %s\n" + "%sPassCredentials: %s\n" + "%sPassSecurity: %s\n" + "%sPassPacketInfo: %s\n" + "%sTCPCongestion: %s\n" + "%sRemoveOnStop: %s\n" + "%sWritable: %s\n" + "%sFileDescriptorName: %s\n" + "%sSELinuxContextFromNet: %s\n", + prefix, socket_state_to_string(s->state), + prefix, socket_result_to_string(s->result), + prefix, socket_result_to_string(s->clean_result), + prefix, socket_address_bind_ipv6_only_to_string(s->bind_ipv6_only), + prefix, s->backlog, + prefix, s->socket_mode, + prefix, s->directory_mode, + prefix, yes_no(s->keep_alive), + prefix, yes_no(s->no_delay), + prefix, yes_no(s->free_bind), + prefix, yes_no(s->transparent), + prefix, yes_no(s->broadcast), + prefix, yes_no(s->pass_cred), + prefix, yes_no(s->pass_sec), + prefix, yes_no(s->pass_pktinfo), + prefix, strna(s->tcp_congestion), + prefix, yes_no(s->remove_on_stop), + prefix, yes_no(s->writable), + prefix, socket_fdname(s), + prefix, yes_no(s->selinux_context_from_net)); + + if (s->timestamping != SOCKET_TIMESTAMPING_OFF) + fprintf(f, + "%sTimestamping: %s\n", + prefix, socket_timestamping_to_string(s->timestamping)); + + if (s->control_pid > 0) + fprintf(f, + 
"%sControl PID: "PID_FMT"\n", + prefix, s->control_pid); + + if (s->bind_to_device) + fprintf(f, + "%sBindToDevice: %s\n", + prefix, s->bind_to_device); + + if (s->accept) + fprintf(f, + "%sAccepted: %u\n" + "%sNConnections: %u\n" + "%sMaxConnections: %u\n" + "%sMaxConnectionsPerSource: %u\n", + prefix, s->n_accepted, + prefix, s->n_connections, + prefix, s->max_connections, + prefix, s->max_connections_per_source); + else + fprintf(f, + "%sFlushPending: %s\n", + prefix, yes_no(s->flush_pending)); + + + if (s->priority >= 0) + fprintf(f, + "%sPriority: %i\n", + prefix, s->priority); + + if (s->receive_buffer > 0) + fprintf(f, + "%sReceiveBuffer: %zu\n", + prefix, s->receive_buffer); + + if (s->send_buffer > 0) + fprintf(f, + "%sSendBuffer: %zu\n", + prefix, s->send_buffer); + + if (s->ip_tos >= 0) + fprintf(f, + "%sIPTOS: %i\n", + prefix, s->ip_tos); + + if (s->ip_ttl >= 0) + fprintf(f, + "%sIPTTL: %i\n", + prefix, s->ip_ttl); + + if (s->pipe_size > 0) + fprintf(f, + "%sPipeSize: %zu\n", + prefix, s->pipe_size); + + if (s->mark >= 0) + fprintf(f, + "%sMark: %i\n", + prefix, s->mark); + + if (s->mq_maxmsg > 0) + fprintf(f, + "%sMessageQueueMaxMessages: %li\n", + prefix, s->mq_maxmsg); + + if (s->mq_msgsize > 0) + fprintf(f, + "%sMessageQueueMessageSize: %li\n", + prefix, s->mq_msgsize); + + if (s->reuse_port) + fprintf(f, + "%sReusePort: %s\n", + prefix, yes_no(s->reuse_port)); + + if (s->smack) + fprintf(f, + "%sSmackLabel: %s\n", + prefix, s->smack); + + if (s->smack_ip_in) + fprintf(f, + "%sSmackLabelIPIn: %s\n", + prefix, s->smack_ip_in); + + if (s->smack_ip_out) + fprintf(f, + "%sSmackLabelIPOut: %s\n", + prefix, s->smack_ip_out); + + if (!isempty(s->user) || !isempty(s->group)) + fprintf(f, + "%sSocketUser: %s\n" + "%sSocketGroup: %s\n", + prefix, strna(s->user), + prefix, strna(s->group)); + + if (s->keep_alive_time > 0) + fprintf(f, + "%sKeepAliveTimeSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->keep_alive_time, USEC_PER_SEC)); + 
+ if (s->keep_alive_interval > 0) + fprintf(f, + "%sKeepAliveIntervalSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->keep_alive_interval, USEC_PER_SEC)); + + if (s->keep_alive_cnt > 0) + fprintf(f, + "%sKeepAliveProbes: %u\n", + prefix, s->keep_alive_cnt); + + if (s->defer_accept > 0) + fprintf(f, + "%sDeferAcceptSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->defer_accept, USEC_PER_SEC)); + + LIST_FOREACH(port, p, s->ports) { + + switch (p->type) { + case SOCKET_SOCKET: { + _cleanup_free_ char *k = NULL; + const char *t; + int r; + + r = socket_address_print(&p->address, &k); + if (r < 0) + t = strerror_safe(r); + else + t = k; + + fprintf(f, "%s%s: %s\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type), t); + break; + } + case SOCKET_SPECIAL: + fprintf(f, "%sListenSpecial: %s\n", prefix, p->path); + break; + case SOCKET_USB_FUNCTION: + fprintf(f, "%sListenUSBFunction: %s\n", prefix, p->path); + break; + case SOCKET_MQUEUE: + fprintf(f, "%sListenMessageQueue: %s\n", prefix, p->path); + break; + default: + fprintf(f, "%sListenFIFO: %s\n", prefix, p->path); + } + } + + fprintf(f, + "%sTriggerLimitIntervalSec: %s\n" + "%sTriggerLimitBurst: %u\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->trigger_limit.interval, USEC_PER_SEC), + prefix, s->trigger_limit.burst); + + str = ip_protocol_to_name(s->socket_protocol); + if (str) + fprintf(f, "%sSocketProtocol: %s\n", prefix, str); + + if (!strv_isempty(s->symlinks)) { + char **q; + + fprintf(f, "%sSymlinks:", prefix); + STRV_FOREACH(q, s->symlinks) + fprintf(f, " %s", *q); + + fprintf(f, "\n"); + } + + fprintf(f, + "%sTimeoutSec: %s\n", + prefix, format_timespan(time_string, FORMAT_TIMESPAN_MAX, s->timeout_usec, USEC_PER_SEC)); + + exec_context_dump(&s->exec_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); + + for (c = 0; c < _SOCKET_EXEC_COMMAND_MAX; c++) { + if (!s->exec_command[c]) + continue; + + 
fprintf(f, "%s-> %s:\n", + prefix, socket_exec_command_to_string(c)); + + exec_command_dump_list(s->exec_command[c], f, prefix2); + } + + cgroup_context_dump(UNIT(s), f, prefix); +} + +static int instance_from_socket(int fd, unsigned nr, char **instance) { + socklen_t l; + char *r; + union sockaddr_union local, remote; + + assert(fd >= 0); + assert(instance); + + l = sizeof(local); + if (getsockname(fd, &local.sa, &l) < 0) + return -errno; + + l = sizeof(remote); + if (getpeername(fd, &remote.sa, &l) < 0) + return -errno; + + switch (local.sa.sa_family) { + + case AF_INET: { + uint32_t + a = be32toh(local.in.sin_addr.s_addr), + b = be32toh(remote.in.sin_addr.s_addr); + + if (asprintf(&r, + "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u", + nr, + a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF, + be16toh(local.in.sin_port), + b >> 24, (b >> 16) & 0xFF, (b >> 8) & 0xFF, b & 0xFF, + be16toh(remote.in.sin_port)) < 0) + return -ENOMEM; + + break; + } + + case AF_INET6: { + static const unsigned char ipv4_prefix[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF + }; + + if (memcmp(&local.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0 && + memcmp(&remote.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) { + const uint8_t + *a = local.in6.sin6_addr.s6_addr+12, + *b = remote.in6.sin6_addr.s6_addr+12; + + if (asprintf(&r, + "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u", + nr, + a[0], a[1], a[2], a[3], + be16toh(local.in6.sin6_port), + b[0], b[1], b[2], b[3], + be16toh(remote.in6.sin6_port)) < 0) + return -ENOMEM; + } else { + char a[INET6_ADDRSTRLEN], b[INET6_ADDRSTRLEN]; + + if (asprintf(&r, + "%u-%s:%u-%s:%u", + nr, + inet_ntop(AF_INET6, &local.in6.sin6_addr, a, sizeof(a)), + be16toh(local.in6.sin6_port), + inet_ntop(AF_INET6, &remote.in6.sin6_addr, b, sizeof(b)), + be16toh(remote.in6.sin6_port)) < 0) + return -ENOMEM; + } + + break; + } + + case AF_UNIX: { + struct ucred ucred; + int k; + + k = getpeercred(fd, &ucred); + if (k >= 0) { + if (asprintf(&r, + "%u-"PID_FMT"-"UID_FMT, 
+ nr, ucred.pid, ucred.uid) < 0) + return -ENOMEM; + } else if (k == -ENODATA) { + /* This handles the case where somebody is + * connecting from another pid/uid namespace + * (e.g. from outside of our container). */ + if (asprintf(&r, + "%u-unknown", + nr) < 0) + return -ENOMEM; + } else + return k; + + break; + } + + case AF_VSOCK: + if (asprintf(&r, + "%u-%u:%u-%u:%u", + nr, + local.vm.svm_cid, local.vm.svm_port, + remote.vm.svm_cid, remote.vm.svm_port) < 0) + return -ENOMEM; + + break; + + default: + assert_not_reached("Unhandled socket type."); + } + + *instance = r; + return 0; +} + +static void socket_close_fds(Socket *s) { + SocketPort *p; + char **i; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + bool was_open; + + was_open = p->fd >= 0; + + p->event_source = sd_event_source_unref(p->event_source); + p->fd = safe_close(p->fd); + socket_cleanup_fd_list(p); + + /* One little note: we should normally not delete any sockets in the file system here! After all some + * other process we spawned might still have a reference of this fd and wants to continue to use + * it. Therefore we normally delete sockets in the file system before we create a new one, not after we + * stopped using one! That all said, if the user explicitly requested this, we'll delete them here + * anyway, but only then. 
*/ + + if (!was_open || !s->remove_on_stop) + continue; + + switch (p->type) { + + case SOCKET_FIFO: + (void) unlink(p->path); + break; + + case SOCKET_MQUEUE: + (void) mq_unlink(p->path); + break; + + case SOCKET_SOCKET: + (void) socket_address_unlink(&p->address); + break; + + default: + break; + } + } + + if (s->remove_on_stop) + STRV_FOREACH(i, s->symlinks) + (void) unlink(*i); +} + +static void socket_apply_socket_options(Socket *s, SocketPort *p, int fd) { + int r; + + assert(s); + assert(p); + assert(fd >= 0); + + if (s->keep_alive) { + r = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_KEEPALIVE failed: %m"); + } + + if (s->keep_alive_time > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPIDLE, s->keep_alive_time / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPIDLE failed: %m"); + } + + if (s->keep_alive_interval > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPINTVL, s->keep_alive_interval / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPINTVL failed: %m"); + } + + if (s->keep_alive_cnt > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_KEEPCNT, s->keep_alive_cnt); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_KEEPCNT failed: %m"); + } + + if (s->defer_accept > 0) { + r = setsockopt_int(fd, SOL_TCP, TCP_DEFER_ACCEPT, s->defer_accept / USEC_PER_SEC); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_DEFER_ACCEPT failed: %m"); + } + + if (s->no_delay) { + if (s->socket_protocol == IPPROTO_SCTP) { + r = setsockopt_int(fd, SOL_SCTP, SCTP_NODELAY, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SCTP_NODELAY failed: %m"); + } else { + r = setsockopt_int(fd, SOL_TCP, TCP_NODELAY, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "TCP_NODELAY failed: %m"); + } + } + + if (s->broadcast) { + r = setsockopt_int(fd, SOL_SOCKET, SO_BROADCAST, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_BROADCAST failed: %m"); + } + + 
if (s->pass_cred) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PASSCRED failed: %m"); + } + + if (s->pass_sec) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PASSSEC failed: %m"); + } + + if (s->pass_pktinfo) { + r = socket_set_recvpktinfo(fd, socket_address_family(&p->address), true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to enable packet info socket option: %m"); + } + + if (s->timestamping != SOCKET_TIMESTAMPING_OFF) { + r = setsockopt_int(fd, SOL_SOCKET, + s->timestamping == SOCKET_TIMESTAMPING_NS ? SO_TIMESTAMPNS : SO_TIMESTAMP, + true); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to enable timestamping socket option, ignoring: %m"); + } + + if (s->priority >= 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_PRIORITY, s->priority); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_PRIORITY failed: %m"); + } + + if (s->receive_buffer > 0) { + r = fd_set_rcvbuf(fd, s->receive_buffer, false); + if (r < 0) + log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "SO_RCVBUF/SO_RCVBUFFORCE failed: %m"); + } + + if (s->send_buffer > 0) { + r = fd_set_sndbuf(fd, s->send_buffer, false); + if (r < 0) + log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? 
LOG_DEBUG : LOG_WARNING, r, + "SO_SNDBUF/SO_SNDBUFFORCE failed: %m"); + } + + if (s->mark >= 0) { + r = setsockopt_int(fd, SOL_SOCKET, SO_MARK, s->mark); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "SO_MARK failed: %m"); + } + + if (s->ip_tos >= 0) { + r = setsockopt_int(fd, IPPROTO_IP, IP_TOS, s->ip_tos); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "IP_TOS failed: %m"); + } + + if (s->ip_ttl >= 0) { + r = socket_set_ttl(fd, socket_address_family(&p->address), s->ip_ttl); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "IP_TTL/IPV6_UNICAST_HOPS failed: %m"); + } + + if (s->tcp_congestion) + if (setsockopt(fd, SOL_TCP, TCP_CONGESTION, s->tcp_congestion, strlen(s->tcp_congestion)+1) < 0) + log_unit_warning_errno(UNIT(s), errno, "TCP_CONGESTION failed: %m"); + + if (s->smack_ip_in) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_IPIN, s->smack_ip_in); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_in_fd: %m"); + } + + if (s->smack_ip_out) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_IPOUT, s->smack_ip_out); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_out_fd: %m"); + } +} + +static void socket_apply_fifo_options(Socket *s, int fd) { + int r; + + assert(s); + assert(fd >= 0); + + if (s->pipe_size > 0) + if (fcntl(fd, F_SETPIPE_SZ, s->pipe_size) < 0) + log_unit_warning_errno(UNIT(s), errno, "Setting pipe size failed, ignoring: %m"); + + if (s->smack) { + r = mac_smack_apply_fd(fd, SMACK_ATTR_ACCESS, s->smack); + if (r < 0) + log_unit_error_errno(UNIT(s), r, "SMACK relabelling failed, ignoring: %m"); + } +} + +static int fifo_address_create( + const char *path, + mode_t directory_mode, + mode_t socket_mode) { + + _cleanup_close_ int fd = -1; + mode_t old_mask; + struct stat st; + int r; + + assert(path); + + (void) mkdir_parents_label(path, directory_mode); + + r = mac_selinux_create_file_prepare(path, S_IFIFO); + if (r < 0) + return r; + + /* Enforce the right access mode for the fifo */ + old_mask = umask(~socket_mode); 
+ + /* Include the original umask in our mask */ + (void) umask(~socket_mode | old_mask); + + r = mkfifo(path, socket_mode); + (void) umask(old_mask); + + if (r < 0 && errno != EEXIST) { + r = -errno; + goto fail; + } + + fd = open(path, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK | O_NOFOLLOW); + if (fd < 0) { + r = -errno; + goto fail; + } + + mac_selinux_create_file_clear(); + + if (fstat(fd, &st) < 0) { + r = -errno; + goto fail; + } + + if (!S_ISFIFO(st.st_mode) || + (st.st_mode & 0777) != (socket_mode & ~old_mask) || + st.st_uid != getuid() || + st.st_gid != getgid()) { + r = -EEXIST; + goto fail; + } + + return TAKE_FD(fd); + +fail: + mac_selinux_create_file_clear(); + return r; +} + +static int special_address_create(const char *path, bool writable) { + _cleanup_close_ int fd = -1; + struct stat st; + + assert(path); + + fd = open(path, (writable ? O_RDWR : O_RDONLY)|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check whether this is a /proc, /sys or /dev file or char device */ + if (!S_ISREG(st.st_mode) && !S_ISCHR(st.st_mode)) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int usbffs_address_create(const char *path) { + _cleanup_close_ int fd = -1; + struct stat st; + + assert(path); + + fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check whether this is a regular file (ffs endpoint) */ + if (!S_ISREG(st.st_mode)) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int mq_address_create( + const char *path, + mode_t mq_mode, + long maxmsg, + long msgsize) { + + _cleanup_close_ int fd = -1; + struct stat st; + mode_t old_mask; + struct mq_attr _attr, *attr = NULL; + + assert(path); + + if (maxmsg > 0 && msgsize > 0) { + _attr = (struct mq_attr) { + .mq_flags = O_NONBLOCK, + .mq_maxmsg = maxmsg, + .mq_msgsize = msgsize, + }; + attr = &_attr; + } + + /* Enforce the 
right access mode for the mq */ + old_mask = umask(~mq_mode); + + /* Include the original umask in our mask */ + (void) umask(~mq_mode | old_mask); + fd = mq_open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_CREAT, mq_mode, attr); + (void) umask(old_mask); + + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if ((st.st_mode & 0777) != (mq_mode & ~old_mask) || + st.st_uid != getuid() || + st.st_gid != getgid()) + return -EEXIST; + + return TAKE_FD(fd); +} + +static int socket_symlink(Socket *s) { + const char *p; + char **i; + int r; + + assert(s); + + p = socket_find_symlink_target(s); + if (!p) + return 0; + + STRV_FOREACH(i, s->symlinks) { + (void) mkdir_parents_label(*i, s->directory_mode); + + r = symlink_idempotent(p, *i, false); + + if (r == -EEXIST && s->remove_on_stop) { + /* If there's already something where we want to create the symlink, and the destructive + * RemoveOnStop= mode is set, then we might as well try to remove what already exists and try + * again. */ + + if (unlink(*i) >= 0) + r = symlink_idempotent(p, *i, false); + } + + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to create symlink %s → %s, ignoring: %m", p, *i); + } + + return 0; +} + +static int usbffs_write_descs(int fd, Service *s) { + int r; + + if (!s->usb_function_descriptors || !s->usb_function_strings) + return -EINVAL; + + r = copy_file_fd(s->usb_function_descriptors, fd, 0); + if (r < 0) + return r; + + return copy_file_fd(s->usb_function_strings, fd, 0); +} + +static int usbffs_select_ep(const struct dirent *d) { + return d->d_name[0] != '.' 
&& !streq(d->d_name, "ep0"); +} + +static int usbffs_dispatch_eps(SocketPort *p) { + _cleanup_free_ struct dirent **ent = NULL; + size_t n, k, i; + int r; + + r = scandir(p->path, &ent, usbffs_select_ep, alphasort); + if (r < 0) + return -errno; + + n = (size_t) r; + p->auxiliary_fds = new(int, n); + if (!p->auxiliary_fds) { + r = -ENOMEM; + goto clear; + } + + p->n_auxiliary_fds = n; + + k = 0; + for (i = 0; i < n; ++i) { + _cleanup_free_ char *ep = NULL; + + ep = path_make_absolute(ent[i]->d_name, p->path); + if (!ep) { + r = -ENOMEM; + goto fail; + } + + path_simplify(ep, false); + + r = usbffs_address_create(ep); + if (r < 0) + goto fail; + + p->auxiliary_fds[k++] = r; + } + + r = 0; + goto clear; + +fail: + close_many(p->auxiliary_fds, k); + p->auxiliary_fds = mfree(p->auxiliary_fds); + p->n_auxiliary_fds = 0; + +clear: + for (i = 0; i < n; ++i) + free(ent[i]); + + return r; +} + +int socket_load_service_unit(Socket *s, int cfd, Unit **ret) { + /* Figure out what the unit that will be used to handle the connections on the socket looks like. + * + * If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake + * instance name. + */ + + if (UNIT_ISSET(s->service)) { + *ret = UNIT_DEREF(s->service); + return 0; + } + + if (!s->accept) + return -ENODATA; + + /* Build the instance name and load the unit */ + _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL; + int r; + + r = unit_name_to_prefix(UNIT(s)->id, &prefix); + if (r < 0) + return r; + + if (cfd >= 0) { + r = instance_from_socket(cfd, s->n_accepted, &instance); + if (ERRNO_IS_DISCONNECT(r)) + /* ENOTCONN is legitimate if TCP RST was received. Other socket families might return + * different errors. This connection is over, but the socket unit lives on. 
*/ + return log_unit_debug_errno(UNIT(s), r, + "Got %s on incoming socket, assuming aborted connection attempt, ignoring.", + errno_to_name(r)); + if (r < 0) + return r; + } + + /* For accepting sockets, we don't know how the instance will be called until we get a connection and + * can figure out what the peer name is. So let's use "internal" as the instance to make it clear + * that this is not an actual peer name. We use "unknown" when we cannot figure out the peer. */ + r = unit_name_build(prefix, instance ?: "internal", ".service", &name); + if (r < 0) + return r; + + return manager_load_unit(UNIT(s)->manager, name, NULL, NULL, ret); +} + +static int socket_determine_selinux_label(Socket *s, char **ret) { + int r; + + assert(s); + assert(ret); + + if (s->selinux_context_from_net) { + /* If this is requested, get the label from the network label */ + + r = mac_selinux_get_our_label(ret); + if (r == -EOPNOTSUPP) + goto no_label; + + } else { + /* Otherwise, get it from the executable we are about to start. 
*/ + + Unit *service; + ExecCommand *c; + _cleanup_free_ char *path = NULL; + + r = socket_load_service_unit(s, -1, &service); + if (r == -ENODATA) + goto no_label; + if (r < 0) + return r; + + c = SERVICE(service)->exec_command[SERVICE_EXEC_START]; + if (!c) + goto no_label; + + r = chase_symlinks(c->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL); + if (r < 0) + goto no_label; + + r = mac_selinux_get_create_label_from_exe(path, ret); + if (IN_SET(r, -EPERM, -EOPNOTSUPP)) + goto no_label; + } + + return r; + +no_label: + *ret = NULL; + return 0; +} + +static int socket_address_listen_do( + Socket *s, + const SocketAddress *address, + const char *label) { + + assert(s); + assert(address); + + return socket_address_listen( + address, + SOCK_CLOEXEC|SOCK_NONBLOCK, + s->backlog, + s->bind_ipv6_only, + s->bind_to_device, + s->reuse_port, + s->free_bind, + s->transparent, + s->directory_mode, + s->socket_mode, + label); +} + +#define log_address_error_errno(u, address, error, fmt) \ + ({ \ + _cleanup_free_ char *_t = NULL; \ + \ + (void) socket_address_print(address, &_t); \ + log_unit_error_errno(u, error, fmt, strna(_t)); \ + }) + +static int fork_needed(const SocketAddress *address, const ExecContext *context) { + int r; + + assert(address); + assert(context); + + /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler. 
*/ + + if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ + return true; + } + + return context->private_network || context->network_namespace_path; +} + +static int socket_address_listen_in_cgroup( + Socket *s, + const SocketAddress *address, + const char *label) { + + _cleanup_close_pair_ int pair[2] = { -1, -1 }; + int fd, r; + pid_t pid; + + assert(s); + assert(address); + + /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the + * socket's cgroup and network namespace in which the socket is actually created. This way we ensure + * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and + * such. */ + + r = fork_needed(address, &s->exec_context); + if (r < 0) + return r; + if (r == 0) { + /* Shortcut things... 
*/ + fd = socket_address_listen_do(s, address, label); + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + + return fd; + } + + r = unit_setup_exec_runtime(UNIT(s)); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m"); + + if (s->exec_context.network_namespace_path && + s->exec_runtime && + s->exec_runtime->netns_storage_socket[0] >= 0) { + r = open_netns_path(s->exec_runtime->netns_storage_socket, s->exec_context.network_namespace_path); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path); + } + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), "(sd-listen)", &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + if ((s->exec_context.private_network || s->exec_context.network_namespace_path) && + s->exec_runtime && + s->exec_runtime->netns_storage_socket[0] >= 0) { + + if (ns_type_supported(NAMESPACE_NET)) { + r = setup_netns(s->exec_runtime->netns_storage_socket); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m"); + _exit(EXIT_NETWORK); + } + } else if (s->exec_context.network_namespace_path) { + log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported."); + _exit(EXIT_NETWORK); + } else + log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); + } + + fd = socket_address_listen_do(s, address, label); + if (fd < 0) { + log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + _exit(EXIT_FAILURE); + } + + r = 
send_one_fd(pair[1], fd, 0); + if (r < 0) { + log_address_error_errno(UNIT(s), address, r, "Failed to send listening socket (%s) to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + fd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_check("(sd-listen)", pid, WAIT_LOG_ABNORMAL); + if (r < 0) { + safe_close(fd); + return r; + } + + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m"); + + return fd; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Socket *, socket_close_fds); + +static int socket_open_fds(Socket *_s) { + _cleanup_(socket_close_fdsp) Socket *s = _s; + _cleanup_(mac_selinux_freep) char *label = NULL; + bool know_label = false; + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + + if (p->fd >= 0) + continue; + + switch (p->type) { + + case SOCKET_SOCKET: + + if (!know_label) { + /* Figure out the label, if we don't it know yet. We do it once for the first + * socket where we need this and remember it for the rest. 
*/ + + r = socket_determine_selinux_label(s, &label); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to determine SELinux label: %m"); + + know_label = true; + } + + /* Apply the socket protocol */ + switch (p->address.type) { + + case SOCK_STREAM: + case SOCK_SEQPACKET: + if (s->socket_protocol == IPPROTO_SCTP) + p->address.protocol = s->socket_protocol; + break; + + case SOCK_DGRAM: + if (s->socket_protocol == IPPROTO_UDPLITE) + p->address.protocol = s->socket_protocol; + break; + } + + p->fd = socket_address_listen_in_cgroup(s, &p->address, label); + if (p->fd < 0) + return p->fd; + + socket_apply_socket_options(s, p, p->fd); + socket_symlink(s); + break; + + case SOCKET_SPECIAL: + + p->fd = special_address_create(p->path, s->writable); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open special file %s: %m", p->path); + break; + + case SOCKET_FIFO: + + p->fd = fifo_address_create( + p->path, + s->directory_mode, + s->socket_mode); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open FIFO %s: %m", p->path); + + socket_apply_fifo_options(s, p->fd); + socket_symlink(s); + break; + + case SOCKET_MQUEUE: + + p->fd = mq_address_create( + p->path, + s->socket_mode, + s->mq_maxmsg, + s->mq_msgsize); + if (p->fd < 0) + return log_unit_error_errno(UNIT(s), p->fd, "Failed to open message queue %s: %m", p->path); + break; + + case SOCKET_USB_FUNCTION: { + _cleanup_free_ char *ep = NULL; + + ep = path_make_absolute("ep0", p->path); + if (!ep) + return -ENOMEM; + + p->fd = usbffs_address_create(ep); + if (p->fd < 0) + return p->fd; + + r = usbffs_write_descs(p->fd, SERVICE(UNIT_DEREF(s->service))); + if (r < 0) + return r; + + r = usbffs_dispatch_eps(p); + if (r < 0) + return r; + + break; + } + default: + assert_not_reached("Unknown port type"); + } + } + + s = NULL; + return 0; +} + +static void socket_unwatch_fds(Socket *s) { + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { 
+ if (p->fd < 0) + continue; + + if (!p->event_source) + continue; + + r = sd_event_source_set_enabled(p->event_source, SD_EVENT_OFF); + if (r < 0) + log_unit_debug_errno(UNIT(s), r, "Failed to disable event source: %m"); + } +} + +static int socket_watch_fds(Socket *s) { + SocketPort *p; + int r; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + if (p->event_source) { + r = sd_event_source_set_enabled(p->event_source, SD_EVENT_ON); + if (r < 0) + goto fail; + } else { + r = sd_event_add_io(UNIT(s)->manager->event, &p->event_source, p->fd, EPOLLIN, socket_dispatch_io, p); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(p->event_source, "socket-port-io"); + } + } + + return 0; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to watch listening fds: %m"); + socket_unwatch_fds(s); + return r; +} + +enum { + SOCKET_OPEN_NONE, + SOCKET_OPEN_SOME, + SOCKET_OPEN_ALL, +}; + +static int socket_check_open(Socket *s) { + bool have_open = false, have_closed = false; + SocketPort *p; + + assert(s); + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + have_closed = true; + else + have_open = true; + + if (have_open && have_closed) + return SOCKET_OPEN_SOME; + } + + if (have_open) + return SOCKET_OPEN_ALL; + + return SOCKET_OPEN_NONE; +} + +static void socket_set_state(Socket *s, SocketState state) { + SocketState old_state; + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!IN_SET(state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING)) { + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + socket_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + } + + if (state 
!= SOCKET_LISTENING) + socket_unwatch_fds(s); + + if (!IN_SET(state, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_LISTENING, + SOCKET_RUNNING, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_CLEANING)) + socket_close_fds(s); + + if (state != old_state) + log_unit_debug(UNIT(s), "Changed %s -> %s", socket_state_to_string(old_state), socket_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int socket_coldplug(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(s->state == SOCKET_DEAD); + + if (s->deserialized_state == s->state) + return 0; + + if (s->control_pid > 0 && + pid_is_unwaited(s->control_pid) && + IN_SET(s->deserialized_state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING)) { + + r = unit_watch_pid(UNIT(s), s->control_pid, false); + if (r < 0) + return r; + + r = socket_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec)); + if (r < 0) + return r; + } + + if (IN_SET(s->deserialized_state, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_LISTENING, + SOCKET_RUNNING)) { + + /* Originally, we used to simply reopen all sockets here that we didn't have file descriptors + * for. However, this is problematic, as we won't traverse through the SOCKET_START_CHOWN state for + * them, and thus the UID/GID wouldn't be right. Hence, instead simply check if we have all fds open, + * and if there's a mismatch, warn loudly. */ + + r = socket_check_open(s); + if (r == SOCKET_OPEN_NONE) + log_unit_warning(UNIT(s), + "Socket unit configuration has changed while unit has been running, " + "no open socket file descriptor left. 
" + "The socket unit is not functional until restarted."); + else if (r == SOCKET_OPEN_SOME) + log_unit_warning(UNIT(s), + "Socket unit configuration has changed while unit has been running, " + "and some socket file descriptors have not been opened yet. " + "The socket unit is not fully functional until restarted."); + } + + if (s->deserialized_state == SOCKET_LISTENING) { + r = socket_watch_fds(s); + if (r < 0) + return r; + } + + if (!IN_SET(s->deserialized_state, SOCKET_DEAD, SOCKET_FAILED, SOCKET_CLEANING)) { + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + socket_set_state(s, s->deserialized_state); + return 0; +} + +static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(s)); + if (r < 0) + return r; + + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + return r; + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + return r; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->dynamic_creds, + &pid); + if (r < 0) + return r; + + r = unit_watch_pid(UNIT(s), pid, true); + if (r < 0) + return r; + + *_pid = pid; + + return 0; +} + +static int socket_chown(Socket *s, pid_t *_pid) { + pid_t pid; + int r; + + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + /* We have to resolve the user names out-of-process, hence + * let's fork here. It's messy, but well, what can we do? 
*/ + + r = unit_fork_helper_process(UNIT(s), "(sd-chown)", &pid); + if (r < 0) + return r; + if (r == 0) { + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + SocketPort *p; + + /* Child */ + + if (!isempty(s->user)) { + const char *user = s->user; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user); + _exit(EXIT_USER); + } + } + + if (!isempty(s->group)) { + const char *group = s->group; + + r = get_group_creds(&group, &gid, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group); + _exit(EXIT_GROUP); + } + } + + LIST_FOREACH(port, p, s->ports) { + const char *path = NULL; + + if (p->type == SOCKET_SOCKET) + path = socket_address_get_path(&p->address); + else if (p->type == SOCKET_FIFO) + path = p->path; + + if (!path) + continue; + + if (chown(path, uid, gid) < 0) { + log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m"); + _exit(EXIT_CHOWN); + } + } + + _exit(EXIT_SUCCESS); + } + + r = unit_watch_pid(UNIT(s), pid, true); + if (r < 0) + goto fail; + + *_pid = pid; + return 0; + +fail: + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + return r; +} + +static void socket_enter_dead(Socket *s, SocketResult f) { + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + if (s->result == SOCKET_SUCCESS) + unit_log_success(UNIT(s)); + else + unit_log_failure(UNIT(s), socket_result_to_string(s->result)); + + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop); + + socket_set_state(s, s->result != SOCKET_SUCCESS ? 
SOCKET_FAILED : SOCKET_DEAD); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, true); + + unit_destroy_runtime_data(UNIT(s), &s->exec_context); + + unit_unref_uid_gid(UNIT(s), true); + + dynamic_creds_destroy(&s->dynamic_creds); +} + +static void socket_enter_signal(Socket *s, SocketState state, SocketResult f); + +static void socket_enter_stop_post(Socket *s, SocketResult f) { + int r; + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_STOP_POST; + s->control_command = s->exec_command[SOCKET_EXEC_STOP_POST]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + socket_set_state(s, SOCKET_STOP_POST); + } else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-post' task: %m"); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES); +} + +static int state_to_kill_operation(Socket *s, SocketState state) { + if (state == SOCKET_STOP_PRE_SIGTERM && unit_has_job_type(UNIT(s), JOB_RESTART)) + return KILL_RESTART; + + if (state == SOCKET_FINAL_SIGTERM) + return KILL_TERMINATE; + + return KILL_KILL; +} + +static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) { + int r; + + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + r = unit_kill_context( + UNIT(s), + &s->kill_context, + state_to_kill_operation(s, state), + -1, + s->control_pid, + false); + if (r < 0) + goto fail; + + if (r > 0) { + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + socket_set_state(s, state); + } else if (state == SOCKET_STOP_PRE_SIGTERM) + socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_SUCCESS); + else if (state == SOCKET_STOP_PRE_SIGKILL) + socket_enter_stop_post(s, SOCKET_SUCCESS); + else if (state == SOCKET_FINAL_SIGTERM) + 
socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS); + else + socket_enter_dead(s, SOCKET_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + + if (IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_STOP_PRE_SIGKILL)) + socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES); + else + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_stop_pre(Socket *s, SocketResult f) { + int r; + assert(s); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_STOP_PRE; + s->control_command = s->exec_command[SOCKET_EXEC_STOP_PRE]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + socket_set_state(s, SOCKET_STOP_PRE); + } else + socket_enter_stop_post(s, SOCKET_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-pre' task: %m"); + socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_listening(Socket *s) { + int r; + assert(s); + + if (!s->accept && s->flush_pending) { + log_unit_debug(UNIT(s), "Flushing socket before listening."); + flush_ports(s); + } + + r = socket_watch_fds(s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to watch sockets: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_LISTENING); + return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_post(Socket *s) { + int r; + assert(s); + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_START_POST; + s->control_command = s->exec_command[SOCKET_EXEC_START_POST]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-post' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_POST); + } else + socket_enter_listening(s); + + 
return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_chown(Socket *s) { + int r; + + assert(s); + + r = socket_open_fds(s); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to listen on sockets: %m"); + goto fail; + } + + if (!isempty(s->user) || !isempty(s->group)) { + + socket_unwatch_control_pid(s); + s->control_command_id = SOCKET_EXEC_START_CHOWN; + s->control_command = NULL; + + r = socket_chown(s, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to fork 'start-chown' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_CHOWN); + } else + socket_enter_start_post(s); + + return; + +fail: + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_enter_start_pre(Socket *s) { + int r; + assert(s); + + socket_unwatch_control_pid(s); + + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start); + + s->control_command_id = SOCKET_EXEC_START_PRE; + s->control_command = s->exec_command[SOCKET_EXEC_START_PRE]; + + if (s->control_command) { + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) { + log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-pre' task: %m"); + goto fail; + } + + socket_set_state(s, SOCKET_START_PRE); + } else + socket_enter_start_chown(s); + + return; + +fail: + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); +} + +static void flush_ports(Socket *s) { + SocketPort *p; + + /* Flush all incoming traffic, regardless if actual bytes or new connections, so that this socket isn't busy + * anymore */ + + LIST_FOREACH(port, p, s->ports) { + if (p->fd < 0) + continue; + + (void) flush_accept(p->fd); + (void) flush_fd(p->fd); + } +} + +static void socket_enter_running(Socket *s, int cfd_in) { + /* Note that this call takes possession of the connection fd passed. It either has to assign it + * somewhere or close it. 
*/ + _cleanup_close_ int cfd = cfd_in; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + /* We don't take connections anymore if we are supposed to shut down anyway */ + if (unit_stop_pending(UNIT(s))) { + + log_unit_debug(UNIT(s), "Suppressing connection request since unit stop is scheduled."); + + if (cfd >= 0) + goto refuse; + + flush_ports(s); + return; + } + + if (!ratelimit_below(&s->trigger_limit)) { + log_unit_warning(UNIT(s), "Trigger limit hit, refusing further activation."); + socket_enter_stop_pre(s, SOCKET_FAILURE_TRIGGER_LIMIT_HIT); + goto refuse; + } + + if (cfd < 0) { + bool pending = false; + Unit *other; + void *v; + + /* If there's already a start pending don't bother to + * do anything */ + HASHMAP_FOREACH_KEY(v, other, UNIT(s)->dependencies[UNIT_TRIGGERS]) + if (unit_active_or_pending(other)) { + pending = true; + break; + } + + if (!pending) { + if (!UNIT_ISSET(s->service)) { + r = log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), + "Service to activate vanished, refusing activation."); + goto fail; + } + + r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT_DEREF(s->service), JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto fail; + } + + socket_set_state(s, SOCKET_RUNNING); + } else { + _cleanup_(socket_peer_unrefp) SocketPeer *p = NULL; + Unit *service; + + if (s->n_connections >= s->max_connections) { + log_unit_warning(UNIT(s), "Too many incoming connections (%u), dropping connection.", + s->n_connections); + goto refuse; + } + + if (s->max_connections_per_source > 0) { + r = socket_acquire_peer(s, cfd, &p); + if (ERRNO_IS_DISCONNECT(r)) + return; + if (r < 0) /* We didn't have enough resources to acquire peer information, let's fail. 
*/ + goto fail; + if (r > 0 && p->n_ref > s->max_connections_per_source) { + _cleanup_free_ char *t = NULL; + + (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t); + + log_unit_warning(UNIT(s), + "Too many incoming connections (%u) from source %s, dropping connection.", + p->n_ref, strnull(t)); + goto refuse; + } + } + + r = socket_load_service_unit(s, cfd, &service); + if (ERRNO_IS_DISCONNECT(r)) + return; + if (r < 0) + goto fail; + + r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service, + false, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + goto fail; + + s->n_accepted++; + + r = service_set_socket_fd(SERVICE(service), cfd, s, s->selinux_context_from_net); + if (ERRNO_IS_DISCONNECT(r)) + return; + if (r < 0) + goto fail; + + TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */ + s->n_connections++; + + SERVICE(service)->peer = TAKE_PTR(p); /* Pass ownership of the peer reference */ + + r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) { + /* We failed to activate the new service, but it still exists. Let's make sure the + * service closes and forgets the connection fd again, immediately. */ + service_close_socket_fd(SERVICE(service)); + goto fail; + } + + /* Notify clients about changed counters */ + unit_add_to_dbus_queue(UNIT(s)); + } + + TAKE_FD(cfd); + return; + +refuse: + s->n_refused++; + return; + +fail: + if (ERRNO_IS_RESOURCE(r)) + log_unit_warning(UNIT(s), "Failed to queue service startup job: %s", + bus_error_message(&error, r)); + else + log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s", + cfd >= 0 ? 
"template" : "non-template", + bus_error_message(&error, r)); + + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); +} + +static void socket_run_next(Socket *s) { + int r; + + assert(s); + assert(s->control_command); + assert(s->control_command->command_next); + + socket_unwatch_control_pid(s); + + s->control_command = s->control_command->command_next; + + r = socket_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run next task: %m"); + + if (s->state == SOCKET_START_POST) + socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); + else if (s->state == SOCKET_STOP_POST) + socket_enter_dead(s, SOCKET_FAILURE_RESOURCES); + else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES); +} + +static int socket_start(Unit *u) { + Socket *s = SOCKET(u); + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later + * please! */ + if (IN_SET(s->state, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING)) + return -EAGAIN; + + /* Already on it! 
*/ + if (IN_SET(s->state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST)) + return 0; + + /* Cannot run this without the service being around */ + if (UNIT_ISSET(s->service)) { + Service *service; + + service = SERVICE(UNIT_DEREF(s->service)); + + if (UNIT(service)->load_state != UNIT_LOADED) { + log_unit_error(u, "Socket service %s not loaded, refusing.", UNIT(service)->id); + return -ENOENT; + } + + /* If the service is already active we cannot start the + * socket */ + if (!IN_SET(service->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) { + log_unit_error(u, "Socket service %s already active, refusing.", UNIT(service)->id); + return -EBUSY; + } + } + + assert(IN_SET(s->state, SOCKET_DEAD, SOCKET_FAILED)); + + r = unit_test_start_limit(u); + if (r < 0) { + socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + s->result = SOCKET_SUCCESS; + exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); + + u->reset_accounting = true; + + socket_enter_start_pre(s); + return 1; +} + +static int socket_stop(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + /* Already on it */ + if (IN_SET(s->state, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL)) + return 0; + + /* If there's already something running we go directly into + * kill mode. */ + if (IN_SET(s->state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST)) { + socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_SUCCESS); + return -EAGAIN; + } + + /* If we are currently cleaning, then abort it, brutally. 
*/ + if (s->state == SOCKET_CLEANING) { + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS); + return 0; + } + + assert(IN_SET(s->state, SOCKET_LISTENING, SOCKET_RUNNING)); + + socket_enter_stop_pre(s, SOCKET_SUCCESS); + return 1; +} + +static int socket_serialize(Unit *u, FILE *f, FDSet *fds) { + Socket *s = SOCKET(u); + SocketPort *p; + int r; + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", socket_state_to_string(s->state)); + (void) serialize_item(f, "result", socket_result_to_string(s->result)); + (void) serialize_item_format(f, "n-accepted", "%u", s->n_accepted); + (void) serialize_item_format(f, "n-refused", "%u", s->n_refused); + + if (s->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid); + + if (s->control_command_id >= 0) + (void) serialize_item(f, "control-command", socket_exec_command_to_string(s->control_command_id)); + + LIST_FOREACH(port, p, s->ports) { + int copy; + + if (p->fd < 0) + continue; + + copy = fdset_put_dup(fds, p->fd); + if (copy < 0) + return log_unit_warning_errno(u, copy, "Failed to serialize socket fd: %m"); + + if (p->type == SOCKET_SOCKET) { + _cleanup_free_ char *t = NULL; + + r = socket_address_print(&p->address, &t); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to format socket address: %m"); + + if (socket_address_family(&p->address) == AF_NETLINK) + (void) serialize_item_format(f, "netlink", "%i %s", copy, t); + else + (void) serialize_item_format(f, "socket", "%i %i %s", copy, p->address.type, t); + } else if (p->type == SOCKET_SPECIAL) + (void) serialize_item_format(f, "special", "%i %s", copy, p->path); + else if (p->type == SOCKET_MQUEUE) + (void) serialize_item_format(f, "mqueue", "%i %s", copy, p->path); + else if (p->type == SOCKET_USB_FUNCTION) + (void) serialize_item_format(f, "ffs", "%i %s", copy, p->path); + else { + assert(p->type == SOCKET_FIFO); + (void) serialize_item_format(f, "fifo", "%i %s", copy, p->path); + } + 
} + + return 0; +} + +static void socket_port_take_fd(SocketPort *p, FDSet *fds, int fd) { + assert(p); + + safe_close(p->fd); + p->fd = fdset_remove(fds, fd); +} + +static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Socket *s = SOCKET(u); + + assert(u); + assert(key); + assert(value); + + if (streq(key, "state")) { + SocketState state; + + state = socket_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + SocketResult f; + + f = socket_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SOCKET_SUCCESS) + s->result = f; + + } else if (streq(key, "n-accepted")) { + unsigned k; + + if (safe_atou(value, &k) < 0) + log_unit_debug(u, "Failed to parse n-accepted value: %s", value); + else + s->n_accepted += k; + } else if (streq(key, "n-refused")) { + unsigned k; + + if (safe_atou(value, &k) < 0) + log_unit_debug(u, "Failed to parse n-refused value: %s", value); + else + s->n_refused += k; + } else if (streq(key, "control-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + else + s->control_pid = pid; + } else if (streq(key, "control-command")) { + SocketExecCommand id; + + id = socket_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + s->control_command_id = id; + s->control_command = s->exec_command[id]; + } + } else if (streq(key, "fifo")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse fifo value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_FIFO && + path_equal_or_files_same(p->path, value+skip, 0)) { + socket_port_take_fd(p, 
fds, fd); + break; + } + + } else if (streq(key, "special")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse special value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_SPECIAL && + path_equal_or_files_same(p->path, value+skip, 0)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "mqueue")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse mqueue value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_MQUEUE && + streq(p->path, value+skip)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "socket")) { + int fd, type, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %i %n", &fd, &type, &skip) < 2 || fd < 0 || type < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse socket value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (socket_address_is(&p->address, value+skip, type)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "netlink")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse socket value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (socket_address_is_netlink(&p->address, value+skip)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } else if (streq(key, "ffs")) { + int fd, skip = 0; + SocketPort *p; + + if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse ffs value: %s", value); + else + LIST_FOREACH(port, p, s->ports) + if (p->type == SOCKET_USB_FUNCTION && + path_equal_or_files_same(p->path, value+skip, 0)) { + socket_port_take_fd(p, fds, fd); + break; + } + + } 
else + log_unit_debug(UNIT(s), "Unknown serialization key: %s", key); + + return 0; +} + +static void socket_distribute_fds(Unit *u, FDSet *fds) { + Socket *s = SOCKET(u); + SocketPort *p; + + assert(u); + + LIST_FOREACH(port, p, s->ports) { + int fd; + + if (p->type != SOCKET_SOCKET) + continue; + + if (p->fd >= 0) + continue; + + FDSET_FOREACH(fd, fds) { + if (socket_address_matches_fd(&p->address, fd)) { + p->fd = fdset_remove(fds, fd); + s->deserialized_state = SOCKET_LISTENING; + break; + } + } + } +} + +_pure_ static UnitActiveState socket_active_state(Unit *u) { + assert(u); + + return state_translation_table[SOCKET(u)->state]; +} + +_pure_ static const char *socket_sub_state_to_string(Unit *u) { + assert(u); + + return socket_state_to_string(SOCKET(u)->state); +} + +const char* socket_port_type_to_string(SocketPort *p) { + + assert(p); + + switch (p->type) { + + case SOCKET_SOCKET: + + switch (p->address.type) { + + case SOCK_STREAM: + return "Stream"; + + case SOCK_DGRAM: + return "Datagram"; + + case SOCK_SEQPACKET: + return "SequentialPacket"; + + case SOCK_RAW: + if (socket_address_family(&p->address) == AF_NETLINK) + return "Netlink"; + + _fallthrough_; + default: + return NULL; + } + + case SOCKET_SPECIAL: + return "Special"; + + case SOCKET_MQUEUE: + return "MessageQueue"; + + case SOCKET_FIFO: + return "FIFO"; + + case SOCKET_USB_FUNCTION: + return "USBFunction"; + + default: + return NULL; + } +} + +SocketType socket_port_type_from_string(const char *s) { + assert(s); + + if (STR_IN_SET(s, "Stream", "Datagram", "SequentialPacket", "Netlink")) + return SOCKET_SOCKET; + else if (streq(s, "Special")) + return SOCKET_SPECIAL; + else if (streq(s, "MessageQueue")) + return SOCKET_MQUEUE; + else if (streq(s, "FIFO")) + return SOCKET_FIFO; + else if (streq(s, "USBFunction")) + return SOCKET_USB_FUNCTION; + else + return _SOCKET_TYPE_INVALID; +} + +_pure_ static bool socket_may_gc(Unit *u) { + Socket *s = SOCKET(u); + + assert(u); + + return 
s->n_connections == 0; +} + +static int socket_accept_do(Socket *s, int fd) { + int cfd; + + assert(s); + assert(fd >= 0); + + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (cfd < 0) + /* Convert transient network errors into clean and well-defined EAGAIN */ + return ERRNO_IS_ACCEPT_AGAIN(errno) ? -EAGAIN : -errno; + + return cfd; +} + +static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) { + _cleanup_close_pair_ int pair[2] = { -1, -1 }; + int cfd, r; + pid_t pid; + + assert(s); + assert(p); + assert(fd >= 0); + + /* Similar to socket_address_listen_in_cgroup(), but for accept() rather than socket(): make sure that any + * connection socket is also properly associated with the cgroup. */ + + if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6)) + goto shortcut; + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == BPF_FIREWALL_UNSUPPORTED) + goto shortcut; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), "(sd-accept)", &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + cfd = socket_accept_do(s, fd); + if (cfd == -EAGAIN) /* spurious accept() */ + _exit(EXIT_SUCCESS); + if (cfd < 0) { + log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], cfd, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + cfd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_check("(sd-accept)", pid, WAIT_LOG_ABNORMAL); + if (r < 0) { + 
safe_close(cfd); + return r; + } + + /* If we received no fd, we got EIO here. If this happens with a process exit code of EXIT_SUCCESS + * this is a spurious accept(), let's convert that back to EAGAIN here. */ + if (cfd == -EIO) + return -EAGAIN; + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m"); + + return cfd; + +shortcut: + cfd = socket_accept_do(s, fd); + if (cfd == -EAGAIN) /* spurious accept(), skip it silently */ + return -EAGAIN; + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + + return cfd; +} + +static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + SocketPort *p = userdata; + int cfd = -1; + + assert(p); + assert(fd >= 0); + + if (p->socket->state != SOCKET_LISTENING) + return 0; + + log_unit_debug(UNIT(p->socket), "Incoming traffic"); + + if (revents != EPOLLIN) { + if (revents & EPOLLHUP) + log_unit_error(UNIT(p->socket), "Got POLLHUP on a listening socket. 
The service probably invoked shutdown() on it, and should better not do that."); + else + log_unit_error(UNIT(p->socket), "Got unexpected poll event (0x%x) on socket.", revents); + goto fail; + } + + if (p->socket->accept && + p->type == SOCKET_SOCKET && + socket_address_can_accept(&p->address)) { + + cfd = socket_accept_in_cgroup(p->socket, p, fd); + if (cfd == -EAGAIN) /* Spurious accept() */ + return 0; + if (cfd < 0) + goto fail; + + socket_apply_socket_options(p->socket, p, cfd); + } + + socket_enter_running(p->socket, cfd); + return 0; + +fail: + socket_enter_stop_pre(p->socket, SOCKET_FAILURE_RESOURCES); + return 0; +} + +static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Socket *s = SOCKET(u); + SocketResult f; + + assert(s); + assert(pid >= 0); + + if (pid != s->control_pid) + return; + + s->control_pid = 0; + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = SOCKET_SUCCESS; + else if (code == CLD_EXITED) + f = SOCKET_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SOCKET_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SOCKET_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown sigchld code"); + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE) + f = SOCKET_SUCCESS; + } + + unit_log_process_exit( + u, + "Control process", + socket_exec_command_to_string(s->control_command_id), + f == SOCKET_SUCCESS, + code, status); + + if (s->result == SOCKET_SUCCESS) + s->result = f; + + if (s->control_command && + s->control_command->command_next && + f == SOCKET_SUCCESS) { + + log_unit_debug(u, "Running next command for state %s", socket_state_to_string(s->state)); + socket_run_next(s); + } else { + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + /* No further commands for this step, so let's figure + * out what to do next */ + + 
log_unit_debug(u, "Got final SIGCHLD for state %s", socket_state_to_string(s->state)); + + switch (s->state) { + + case SOCKET_START_PRE: + if (f == SOCKET_SUCCESS) + socket_enter_start_chown(s); + else + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, f); + break; + + case SOCKET_START_CHOWN: + if (f == SOCKET_SUCCESS) + socket_enter_start_post(s); + else + socket_enter_stop_pre(s, f); + break; + + case SOCKET_START_POST: + if (f == SOCKET_SUCCESS) + socket_enter_listening(s); + else + socket_enter_stop_pre(s, f); + break; + + case SOCKET_STOP_PRE: + case SOCKET_STOP_PRE_SIGTERM: + case SOCKET_STOP_PRE_SIGKILL: + socket_enter_stop_post(s, f); + break; + + case SOCKET_STOP_POST: + case SOCKET_FINAL_SIGTERM: + case SOCKET_FINAL_SIGKILL: + socket_enter_dead(s, f); + break; + + case SOCKET_CLEANING: + + if (s->clean_result == SOCKET_SUCCESS) + s->clean_result = f; + + socket_enter_dead(s, SOCKET_SUCCESS); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Socket *s = SOCKET(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SOCKET_START_PRE: + log_unit_warning(UNIT(s), "Starting timed out. Terminating."); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_START_CHOWN: + case SOCKET_START_POST: + log_unit_warning(UNIT(s), "Starting timed out. Stopping."); + socket_enter_stop_pre(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_PRE: + log_unit_warning(UNIT(s), "Stopping timed out. Terminating."); + socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_PRE_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out. 
Killing."); + socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL. Ignoring."); + socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT); + } + break; + + case SOCKET_STOP_PRE_SIGKILL: + log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring."); + socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_STOP_POST: + log_unit_warning(UNIT(s), "Stopping timed out (2). Terminating."); + socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_FINAL_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Stopping timed out (2). Killing."); + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Stopping timed out (2). Skipping SIGKILL. Ignoring."); + socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT); + } + break; + + case SOCKET_FINAL_SIGKILL: + log_unit_warning(UNIT(s), "Still around after SIGKILL (2). Entering failed mode."); + socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT); + break; + + case SOCKET_CLEANING: + log_unit_warning(UNIT(s), "Cleaning timed out. 
killing."); + + if (s->clean_result == SOCKET_SUCCESS) + s->clean_result = SOCKET_FAILURE_TIMEOUT; + + socket_enter_signal(s, SOCKET_FINAL_SIGKILL, 0); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +int socket_collect_fds(Socket *s, int **fds) { + size_t k = 0, n = 0; + SocketPort *p; + int *rfds; + + assert(s); + assert(fds); + + /* Called from the service code for requesting our fds */ + + LIST_FOREACH(port, p, s->ports) { + if (p->fd >= 0) + n++; + n += p->n_auxiliary_fds; + } + + if (n <= 0) { + *fds = NULL; + return 0; + } + + rfds = new(int, n); + if (!rfds) + return -ENOMEM; + + LIST_FOREACH(port, p, s->ports) { + size_t i; + + if (p->fd >= 0) + rfds[k++] = p->fd; + for (i = 0; i < p->n_auxiliary_fds; ++i) + rfds[k++] = p->auxiliary_fds[i]; + } + + assert(k == n); + + *fds = rfds; + return (int) n; +} + +static void socket_reset_failed(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + if (s->state == SOCKET_FAILED) + socket_set_state(s, SOCKET_DEAD); + + s->result = SOCKET_SUCCESS; + s->clean_result = SOCKET_SUCCESS; +} + +void socket_connection_unref(Socket *s) { + assert(s); + + /* The service is dead. Yay! + * + * This is strictly for one-instance-per-connection + * services. 
*/ + + assert(s->n_connections > 0); + s->n_connections--; + + log_unit_debug(UNIT(s), "One connection closed, %u left.", s->n_connections); +} + +static void socket_trigger_notify(Unit *u, Unit *other) { + Socket *s = SOCKET(u); + + assert(u); + assert(other); + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + assert(other->type == UNIT_SERVICE); + + /* Don't propagate state changes from the service if we are already down */ + if (!IN_SET(s->state, SOCKET_RUNNING, SOCKET_LISTENING)) + return; + + /* We don't care for the service state if we are in Accept=yes mode */ + if (s->accept) + return; + + /* Propagate start limit hit state */ + if (other->start_limit_hit) { + socket_enter_stop_pre(s, SOCKET_FAILURE_SERVICE_START_LIMIT_HIT); + return; + } + + /* Don't propagate anything if there's still a job queued */ + if (other->job) + return; + + if (IN_SET(SERVICE(other)->state, + SERVICE_DEAD, SERVICE_FAILED, + SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_AUTO_RESTART)) + socket_enter_listening(s); + + if (SERVICE(other)->state == SERVICE_RUNNING) + socket_set_state(s, SOCKET_RUNNING); +} + +static int socket_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, SOCKET(u)->control_pid, error); +} + +static int socket_get_timeout(Unit *u, usec_t *timeout) { + Socket *s = SOCKET(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + return 0; + + *timeout = t; + return 1; +} + +char *socket_fdname(Socket *s) { + assert(s); + + /* Returns the name to use for $LISTEN_NAMES. If the user + * didn't specify anything specifically, use the socket unit's + * name as fallback. 
*/ + + return s->fdname ?: UNIT(s)->id; +} + +static int socket_control_pid(Unit *u) { + Socket *s = SOCKET(u); + + assert(s); + + return s->control_pid; +} + +static int socket_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Socket *s = SOCKET(u); + int r; + + assert(s); + assert(mask != 0); + + if (s->state != SOCKET_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + socket_unwatch_control_pid(s); + s->clean_result = SOCKET_SUCCESS; + s->control_command = NULL; + s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID; + + r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->exec_context.timeout_clean_usec)); + if (r < 0) + goto fail; + + r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); + if (r < 0) + goto fail; + + socket_set_state(s, SOCKET_CLEANING); + + return 0; + +fail: + log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m"); + s->clean_result = SOCKET_FAILURE_RESOURCES; + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + return r; +} + +static int socket_can_clean(Unit *u, ExecCleanMask *ret) { + Socket *s = SOCKET(u); + + assert(s); + + return exec_context_get_clean_mask(&s->exec_context, ret); +} + +static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = { + [SOCKET_EXEC_START_PRE] = "ExecStartPre", + [SOCKET_EXEC_START_CHOWN] = "ExecStartChown", + [SOCKET_EXEC_START_POST] = "ExecStartPost", + [SOCKET_EXEC_STOP_PRE] = "ExecStopPre", + [SOCKET_EXEC_STOP_POST] = "ExecStopPost" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand); + +static const char* const socket_result_table[_SOCKET_RESULT_MAX] = { + [SOCKET_SUCCESS] = "success", + [SOCKET_FAILURE_RESOURCES] = "resources", + [SOCKET_FAILURE_TIMEOUT] = "timeout", + [SOCKET_FAILURE_EXIT_CODE] = "exit-code", + [SOCKET_FAILURE_SIGNAL] = "signal", + 
[SOCKET_FAILURE_CORE_DUMP] = "core-dump", + [SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit", + [SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit", + [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult); + +static const char* const socket_timestamping_table[_SOCKET_TIMESTAMPING_MAX] = { + [SOCKET_TIMESTAMPING_OFF] = "off", + [SOCKET_TIMESTAMPING_US] = "us", + [SOCKET_TIMESTAMPING_NS] = "ns", +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_timestamping, SocketTimestamping); + +SocketTimestamping socket_timestamping_from_string_harder(const char *p) { + SocketTimestamping t; + int r; + + if (!p) + return _SOCKET_TIMESTAMPING_INVALID; + + t = socket_timestamping_from_string(p); + if (t >= 0) + return t; + + /* Let's alternatively support the various other aliases parse_time() accepts for ns and µs here, + * too. */ + if (streq(p, "nsec")) + return SOCKET_TIMESTAMPING_NS; + if (STR_IN_SET(p, "usec", "µs")) + return SOCKET_TIMESTAMPING_US; + + r = parse_boolean(p); + if (r < 0) + return _SOCKET_TIMESTAMPING_INVALID; + + return r ? 
SOCKET_TIMESTAMPING_NS : SOCKET_TIMESTAMPING_OFF; /* If boolean yes, default to ns accuracy */ +} + +const UnitVTable socket_vtable = { + .object_size = sizeof(Socket), + .exec_context_offset = offsetof(Socket, exec_context), + .cgroup_context_offset = offsetof(Socket, cgroup_context), + .kill_context_offset = offsetof(Socket, kill_context), + .exec_runtime_offset = offsetof(Socket, exec_runtime), + .dynamic_creds_offset = offsetof(Socket, dynamic_creds), + + .sections = + "Unit\0" + "Socket\0" + "Install\0", + .private_section = "Socket", + + .can_transient = true, + .can_trigger = true, + .can_fail = true, + + .init = socket_init, + .done = socket_done, + .load = socket_load, + + .coldplug = socket_coldplug, + + .dump = socket_dump, + + .start = socket_start, + .stop = socket_stop, + + .kill = socket_kill, + .clean = socket_clean, + .can_clean = socket_can_clean, + + .get_timeout = socket_get_timeout, + + .serialize = socket_serialize, + .deserialize_item = socket_deserialize_item, + .distribute_fds = socket_distribute_fds, + + .active_state = socket_active_state, + .sub_state_to_string = socket_sub_state_to_string, + + .will_restart = unit_will_restart_default, + + .may_gc = socket_may_gc, + + .sigchld_event = socket_sigchld_event, + + .trigger_notify = socket_trigger_notify, + + .reset_failed = socket_reset_failed, + + .control_pid = socket_control_pid, + + .bus_set_property = bus_socket_set_property, + .bus_commit_properties = bus_socket_commit_properties, + + .status_message_formats = { + /*.starting_stopping = { + [0] = "Starting socket %s...", + [1] = "Stopping socket %s...", + },*/ + .finished_start_job = { + [JOB_DONE] = "Listening on %s.", + [JOB_FAILED] = "Failed to listen on %s.", + [JOB_TIMEOUT] = "Timed out starting %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Closed %s.", + [JOB_FAILED] = "Failed stopping %s.", + [JOB_TIMEOUT] = "Timed out stopping %s.", + }, + }, +}; diff --git a/src/core/socket.h b/src/core/socket.h new file mode 100644 
index 0000000..ebe85c2 --- /dev/null +++ b/src/core/socket.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Socket Socket; +typedef struct SocketPeer SocketPeer; + +#include "mount.h" +#include "service.h" +#include "socket-util.h" +#include "unit.h" + +typedef enum SocketExecCommand { + SOCKET_EXEC_START_PRE, + SOCKET_EXEC_START_CHOWN, + SOCKET_EXEC_START_POST, + SOCKET_EXEC_STOP_PRE, + SOCKET_EXEC_STOP_POST, + _SOCKET_EXEC_COMMAND_MAX, + _SOCKET_EXEC_COMMAND_INVALID = -1 +} SocketExecCommand; + +typedef enum SocketType { + SOCKET_SOCKET, + SOCKET_FIFO, + SOCKET_SPECIAL, + SOCKET_MQUEUE, + SOCKET_USB_FUNCTION, + _SOCKET_TYPE_MAX, + _SOCKET_TYPE_INVALID = -1 +} SocketType; + +typedef enum SocketResult { + SOCKET_SUCCESS, + SOCKET_FAILURE_RESOURCES, + SOCKET_FAILURE_TIMEOUT, + SOCKET_FAILURE_EXIT_CODE, + SOCKET_FAILURE_SIGNAL, + SOCKET_FAILURE_CORE_DUMP, + SOCKET_FAILURE_START_LIMIT_HIT, + SOCKET_FAILURE_TRIGGER_LIMIT_HIT, + SOCKET_FAILURE_SERVICE_START_LIMIT_HIT, + _SOCKET_RESULT_MAX, + _SOCKET_RESULT_INVALID = -1 +} SocketResult; + +typedef struct SocketPort { + Socket *socket; + + SocketType type; + int fd; + int *auxiliary_fds; + size_t n_auxiliary_fds; + + SocketAddress address; + char *path; + sd_event_source *event_source; + + LIST_FIELDS(struct SocketPort, port); +} SocketPort; + +typedef enum SocketTimestamping { + SOCKET_TIMESTAMPING_OFF, + SOCKET_TIMESTAMPING_US, /* SO_TIMESTAMP */ + SOCKET_TIMESTAMPING_NS, /* SO_TIMESTAMPNS */ + _SOCKET_TIMESTAMPING_MAX, + _SOCKET_TIMESTAMPING_INVALID = -1, +} SocketTimestamping; + +struct Socket { + Unit meta; + + LIST_HEAD(SocketPort, ports); + + Set *peers_by_address; + + unsigned n_accepted; + unsigned n_connections; + unsigned n_refused; + unsigned max_connections; + unsigned max_connections_per_source; + + unsigned backlog; + unsigned keep_alive_cnt; + usec_t timeout_usec; + usec_t keep_alive_time; + usec_t keep_alive_interval; + usec_t defer_accept; + + 
ExecCommand* exec_command[_SOCKET_EXEC_COMMAND_MAX]; + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + /* For Accept=no sockets refers to the one service we'll + * activate. For Accept=yes sockets is either NULL, or filled + * to refer to the next service we spawn. */ + UnitRef service; + + SocketState state, deserialized_state; + + sd_event_source *timer_event_source; + + ExecCommand* control_command; + SocketExecCommand control_command_id; + pid_t control_pid; + + mode_t directory_mode; + mode_t socket_mode; + + SocketResult result; + SocketResult clean_result; + + char **symlinks; + + bool accept; + bool remove_on_stop; + bool writable; + bool flush_pending; + + int socket_protocol; + + /* Socket options */ + bool keep_alive; + bool no_delay; + bool free_bind; + bool transparent; + bool broadcast; + bool pass_cred; + bool pass_sec; + bool pass_pktinfo; + SocketTimestamping timestamping; + + /* Only for INET6 sockets: issue IPV6_V6ONLY sockopt */ + SocketAddressBindIPv6Only bind_ipv6_only; + + int priority; + int mark; + size_t receive_buffer; + size_t send_buffer; + int ip_tos; + int ip_ttl; + size_t pipe_size; + char *bind_to_device; + char *tcp_congestion; + bool reuse_port; + long mq_maxmsg; + long mq_msgsize; + + char *smack; + char *smack_ip_in; + char *smack_ip_out; + + bool selinux_context_from_net; + + char *user, *group; + + char *fdname; + + RateLimit trigger_limit; +}; + +SocketPeer *socket_peer_ref(SocketPeer *p); +SocketPeer *socket_peer_unref(SocketPeer *p); +int socket_acquire_peer(Socket *s, int fd, SocketPeer **p); + +DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref); + +/* Called from the service code when collecting fds */ +int socket_collect_fds(Socket *s, int **fds); + +/* Called from the service code when a per-connection service ended */ +void socket_connection_unref(Socket *s); + +void socket_free_ports(Socket *s); + +int 
socket_load_service_unit(Socket *s, int cfd, Unit **ret); + +char *socket_fdname(Socket *s); + +extern const UnitVTable socket_vtable; + +const char* socket_exec_command_to_string(SocketExecCommand i) _const_; +SocketExecCommand socket_exec_command_from_string(const char *s) _pure_; + +const char* socket_result_to_string(SocketResult i) _const_; +SocketResult socket_result_from_string(const char *s) _pure_; + +const char* socket_port_type_to_string(SocketPort *p) _pure_; +SocketType socket_port_type_from_string(const char *p) _pure_; + +const char* socket_timestamping_to_string(SocketTimestamping p) _const_; +SocketTimestamping socket_timestamping_from_string(const char *p) _pure_; +SocketTimestamping socket_timestamping_from_string_harder(const char *p) _pure_; + +DEFINE_CAST(SOCKET, Socket); diff --git a/src/core/swap.c b/src/core/swap.c new file mode 100644 index 0000000..76e491a --- /dev/null +++ b/src/core/swap.c @@ -0,0 +1,1694 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <sys/epoll.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "sd-device.h" + +#include "alloc-util.h" +#include "dbus-swap.h" +#include "dbus-unit.h" +#include "device-private.h" +#include "device-util.h" +#include "device.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "fstab-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "swap.h" +#include "unit-name.h" +#include "unit.h" +#include "virt.h" + +static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = { + [SWAP_DEAD] = UNIT_INACTIVE, + [SWAP_ACTIVATING] = UNIT_ACTIVATING, + [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE, + [SWAP_ACTIVE] = UNIT_ACTIVE, + [SWAP_DEACTIVATING] = UNIT_DEACTIVATING, + [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING, + [SWAP_DEACTIVATING_SIGKILL] = 
UNIT_DEACTIVATING, + [SWAP_FAILED] = UNIT_FAILED, + [SWAP_CLEANING] = UNIT_MAINTENANCE, +}; + +static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); +static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int swap_process_proc_swaps(Manager *m); + +static bool SWAP_STATE_WITH_PROCESS(SwapState state) { + return IN_SET(state, + SWAP_ACTIVATING, + SWAP_ACTIVATING_DONE, + SWAP_DEACTIVATING, + SWAP_DEACTIVATING_SIGTERM, + SWAP_DEACTIVATING_SIGKILL, + SWAP_CLEANING); +} + +_pure_ static UnitActiveState swap_active_state(Unit *u) { + assert(u); + + return state_translation_table[SWAP(u)->state]; +} + +_pure_ static const char *swap_sub_state_to_string(Unit *u) { + assert(u); + + return swap_state_to_string(SWAP(u)->state); +} + +_pure_ static bool swap_may_gc(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + if (s->from_proc_swaps) + return false; + + return true; +} + +_pure_ static bool swap_is_extrinsic(Unit *u) { + assert(SWAP(u)); + + return MANAGER_IS_USER(u->manager); +} + +static void swap_unset_proc_swaps(Swap *s) { + assert(s); + + if (!s->from_proc_swaps) + return; + + s->parameters_proc_swaps.what = mfree(s->parameters_proc_swaps.what); + s->from_proc_swaps = false; +} + +static int swap_set_devnode(Swap *s, const char *devnode) { + Hashmap *swaps; + Swap *first; + int r; + + assert(s); + + r = hashmap_ensure_allocated(&UNIT(s)->manager->swaps_by_devnode, &path_hash_ops); + if (r < 0) + return r; + + swaps = UNIT(s)->manager->swaps_by_devnode; + + if (s->devnode) { + first = hashmap_get(swaps, s->devnode); + + LIST_REMOVE(same_devnode, first, s); + if (first) + hashmap_replace(swaps, first->devnode, first); + else + hashmap_remove(swaps, s->devnode); + + s->devnode = mfree(s->devnode); + } + + if (devnode) { + s->devnode = strdup(devnode); + if (!s->devnode) + return -ENOMEM; + + first = hashmap_get(swaps, s->devnode); + LIST_PREPEND(same_devnode, first, s); + + return 
hashmap_replace(swaps, first->devnode, first); + } + + return 0; +} + +static void swap_init(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + assert(UNIT(s)->load_state == UNIT_STUB); + + s->timeout_usec = u->manager->default_timeout_start_usec; + + s->exec_context.std_output = u->manager->default_std_output; + s->exec_context.std_error = u->manager->default_std_error; + + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + + u->ignore_on_isolate = true; +} + +static void swap_unwatch_control_pid(Swap *s) { + assert(s); + + if (s->control_pid <= 0) + return; + + unit_unwatch_pid(UNIT(s), s->control_pid); + s->control_pid = 0; +} + +static void swap_done(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + swap_unset_proc_swaps(s); + swap_set_devnode(s, NULL); + + s->what = mfree(s->what); + s->parameters_fragment.what = mfree(s->parameters_fragment.what); + s->parameters_fragment.options = mfree(s->parameters_fragment.options); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, false); + exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); + s->control_command = NULL; + + dynamic_creds_unref(&s->dynamic_creds); + + swap_unwatch_control_pid(s); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); +} + +static int swap_arm_timer(Swap *s, usec_t usec) { + int r; + + assert(s); + + if (s->timer_event_source) { + r = sd_event_source_set_time(s->timer_event_source, usec); + if (r < 0) + return r; + + return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT); + } + + if (usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time( + UNIT(s)->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + usec, 0, + swap_dispatch_timer, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timer_event_source, "swap-timer"); + + return 0; +} + +static SwapParameters* swap_get_parameters(Swap *s) { + assert(s); + + if (s->from_proc_swaps) + return &s->parameters_proc_swaps; + + if 
(s->from_fragment) + return &s->parameters_fragment; + + return NULL; +} + +static int swap_add_device_dependencies(Swap *s) { + UnitDependencyMask mask; + SwapParameters *p; + int r; + + assert(s); + + if (!s->what) + return 0; + + p = swap_get_parameters(s); + if (!p || !p->what) + return 0; + + mask = s->from_proc_swaps ? UNIT_DEPENDENCY_PROC_SWAP : UNIT_DEPENDENCY_FILE; + + if (is_device_path(p->what)) { + r = unit_add_node_dependency(UNIT(s), p->what, UNIT_REQUIRES, mask); + if (r < 0) + return r; + + return unit_add_blockdev_dependency(UNIT(s), p->what, mask); + } + + /* File based swap devices need to be ordered after systemd-remount-fs.service, since they might need + * a writable file system. */ + return unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, mask); +} + +static int swap_add_default_dependencies(Swap *s) { + int r; + + assert(s); + + if (!UNIT(s)->default_dependencies) + return 0; + + if (!MANAGER_IS_SYSTEM(UNIT(s)->manager)) + return 0; + + if (detect_container() > 0) + return 0; + + /* swap units generated for the swap dev links are missing the + * ordering dep against the swap target. */ + r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SWAP_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int swap_verify(Swap *s) { + _cleanup_free_ char *e = NULL; + int r; + + assert(UNIT(s)->load_state == UNIT_LOADED); + + r = unit_name_from_path(s->what, ".swap", &e); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to generate unit name from path: %m"); + + if (!unit_has_name(UNIT(s), e)) { + log_unit_error(UNIT(s), "Value of What= and unit name do not match, not loading."); + return -ENOEXEC; + } + + if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP) { + log_unit_error(UNIT(s), "Unit has PAM enabled. 
Kill mode must be set to 'control-group'. Refusing to load."); + return -ENOEXEC; + } + + return 0; +} + +static int swap_load_devnode(Swap *s) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + struct stat st; + const char *p; + int r; + + assert(s); + + if (stat(s->what, &st) < 0 || !S_ISBLK(st.st_mode)) + return 0; + + r = device_new_from_stat_rdev(&d, &st); + if (r < 0) { + log_unit_full_errno(UNIT(s), r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "Failed to allocate device for swap %s: %m", s->what); + return 0; + } + + if (sd_device_get_devname(d, &p) < 0) + return 0; + + return swap_set_devnode(s, p); +} + +static int swap_add_extras(Swap *s) { + int r; + + assert(s); + + if (UNIT(s)->fragment_path) + s->from_fragment = true; + + if (!s->what) { + if (s->parameters_fragment.what) + s->what = strdup(s->parameters_fragment.what); + else if (s->parameters_proc_swaps.what) + s->what = strdup(s->parameters_proc_swaps.what); + else { + r = unit_name_to_path(UNIT(s)->id, &s->what); + if (r < 0) + return r; + } + + if (!s->what) + return -ENOMEM; + } + + path_simplify(s->what, false); + + if (!UNIT(s)->description) { + r = unit_set_description(UNIT(s), s->what); + if (r < 0) + return r; + } + + r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT); + if (r < 0) + return r; + + r = swap_add_device_dependencies(s); + if (r < 0) + return r; + + r = swap_load_devnode(s); + if (r < 0) + return r; + + r = unit_patch_contexts(UNIT(s)); + if (r < 0) + return r; + + r = unit_add_exec_dependencies(UNIT(s), &s->exec_context); + if (r < 0) + return r; + + r = unit_set_default_slice(UNIT(s)); + if (r < 0) + return r; + + r = swap_add_default_dependencies(s); + if (r < 0) + return r; + + return 0; +} + +static int swap_load(Unit *u) { + Swap *s = SWAP(u); + int r, q = 0; + + assert(s); + assert(u->load_state == UNIT_STUB); + + /* Load a .swap file */ + bool fragment_optional = s->from_proc_swaps; + r = unit_load_fragment_and_dropin(u, !fragment_optional); + + 
/* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is + * already active. */ + if (u->load_state == UNIT_LOADED || s->from_proc_swaps) + q = swap_add_extras(s); + + if (r < 0) + return r; + if (q < 0) + return q; + if (u->load_state != UNIT_LOADED) + return 0; + + return swap_verify(s); +} + +static int swap_setup_unit( + Manager *m, + const char *what, + const char *what_proc_swaps, + int priority, + bool set_flags) { + + _cleanup_free_ char *e = NULL; + bool delete = false; + Unit *u = NULL; + int r; + SwapParameters *p; + + assert(m); + assert(what); + assert(what_proc_swaps); + + r = unit_name_from_path(what, ".swap", &e); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m"); + + u = manager_get_unit(m, e); + if (u && + SWAP(u)->from_proc_swaps && + !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps)) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "Swap %s appeared twice with different device paths %s and %s", + e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps); + + if (!u) { + delete = true; + + r = unit_new_for_name(m, sizeof(Swap), e, &u); + if (r < 0) + goto fail; + + SWAP(u)->what = strdup(what); + if (!SWAP(u)->what) { + r = -ENOMEM; + goto fail; + } + + unit_add_to_load_queue(u); + } else + delete = false; + + p = &SWAP(u)->parameters_proc_swaps; + + if (!p->what) { + p->what = strdup(what_proc_swaps); + if (!p->what) { + r = -ENOMEM; + goto fail; + } + } + + /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be + * loaded. After all we can load it now, from the data in /proc/swaps. 
*/ + if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + u->load_state = UNIT_LOADED; + u->load_error = 0; + } + + if (set_flags) { + SWAP(u)->is_active = true; + SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps; + } + + SWAP(u)->from_proc_swaps = true; + + p->priority = priority; + p->priority_set = true; + + unit_add_to_dbus_queue(u); + return 0; + +fail: + log_unit_warning_errno(u, r, "Failed to load swap unit: %m"); + + if (delete) + unit_free(u); + + return r; +} + +static int swap_process_new(Manager *m, const char *device, int prio, bool set_flags) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + const char *dn, *devlink; + struct stat st, st_link; + int r; + + assert(m); + + r = swap_setup_unit(m, device, device, prio, set_flags); + if (r < 0) + return r; + + /* If this is a block device, then let's add duplicates for + * all other names of this block device */ + if (stat(device, &st) < 0 || !S_ISBLK(st.st_mode)) + return 0; + + r = device_new_from_stat_rdev(&d, &st); + if (r < 0) { + log_full_errno(r == -ENOENT ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to allocate device for swap %s: %m", device); + return 0; + } + + /* Add the main device node */ + if (sd_device_get_devname(d, &dn) >= 0 && !streq(dn, device)) + swap_setup_unit(m, dn, device, prio, set_flags); + + /* Add additional units for all symlinks */ + FOREACH_DEVICE_DEVLINK(d, devlink) { + + /* Don't bother with the /dev/block links */ + if (streq(devlink, device)) + continue; + + if (path_startswith(devlink, "/dev/block/")) + continue; + + if (stat(devlink, &st_link) >= 0 && + (!S_ISBLK(st_link.st_mode) || + st_link.st_rdev != st.st_rdev)) + continue; + + swap_setup_unit(m, devlink, device, prio, set_flags); + } + + return 0; +} + +static void swap_set_state(Swap *s, SwapState state) { + SwapState old_state; + Swap *other; + + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); + + old_state = s->state; + s->state = state; + + if (!SWAP_STATE_WITH_PROCESS(state)) { + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + swap_unwatch_control_pid(s); + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + } + + if (state != old_state) + log_unit_debug(UNIT(s), "Changed %s -> %s", swap_state_to_string(old_state), swap_state_to_string(state)); + + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0); + + /* If there other units for the same device node have a job + queued it might be worth checking again if it is runnable + now. This is necessary, since swap_start() refuses + operation with EAGAIN if there's already another job for + the same device node queued. 
*/ + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job) + job_add_to_run_queue(UNIT(other)->job); +} + +static int swap_coldplug(Unit *u) { + Swap *s = SWAP(u); + SwapState new_state = SWAP_DEAD; + int r; + + assert(s); + assert(s->state == SWAP_DEAD); + + if (s->deserialized_state != s->state) + new_state = s->deserialized_state; + else if (s->from_proc_swaps) + new_state = SWAP_ACTIVE; + + if (new_state == s->state) + return 0; + + if (s->control_pid > 0 && + pid_is_unwaited(s->control_pid) && + SWAP_STATE_WITH_PROCESS(new_state)) { + + r = unit_watch_pid(UNIT(s), s->control_pid, false); + if (r < 0) + return r; + + r = swap_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec)); + if (r < 0) + return r; + } + + if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) { + (void) unit_setup_dynamic_creds(u); + (void) unit_setup_exec_runtime(u); + } + + swap_set_state(s, new_state); + return 0; +} + +static void swap_dump(Unit *u, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESPAN_MAX]; + Swap *s = SWAP(u); + SwapParameters *p; + + assert(s); + assert(f); + + if (s->from_proc_swaps) + p = &s->parameters_proc_swaps; + else if (s->from_fragment) + p = &s->parameters_fragment; + else + p = NULL; + + fprintf(f, + "%sSwap State: %s\n" + "%sResult: %s\n" + "%sClean Result: %s\n" + "%sWhat: %s\n" + "%sFrom /proc/swaps: %s\n" + "%sFrom fragment: %s\n" + "%sExtrinsic: %s\n", + prefix, swap_state_to_string(s->state), + prefix, swap_result_to_string(s->result), + prefix, swap_result_to_string(s->clean_result), + prefix, s->what, + prefix, yes_no(s->from_proc_swaps), + prefix, yes_no(s->from_fragment), + prefix, yes_no(swap_is_extrinsic(u))); + + if (s->devnode) + fprintf(f, "%sDevice Node: %s\n", prefix, s->devnode); + + if (p) + fprintf(f, + "%sPriority: %i\n" + "%sOptions: %s\n", + prefix, p->priority, + prefix, strempty(p->options)); + + fprintf(f, + "%sTimeoutSec: %s\n", + prefix, format_timespan(buf, sizeof(buf), s->timeout_usec, 
USEC_PER_SEC)); + + if (s->control_pid > 0) + fprintf(f, + "%sControl PID: "PID_FMT"\n", + prefix, s->control_pid); + + exec_context_dump(&s->exec_context, f, prefix); + kill_context_dump(&s->kill_context, f, prefix); + cgroup_context_dump(UNIT(s), f, prefix); +} + +static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { + + _cleanup_(exec_params_clear) ExecParameters exec_params = { + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, + }; + pid_t pid; + int r; + + assert(s); + assert(c); + assert(_pid); + + r = unit_prepare_exec(UNIT(s)); + if (r < 0) + return r; + + r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + r = unit_set_exec_params(UNIT(s), &exec_params); + if (r < 0) + goto fail; + + r = exec_spawn(UNIT(s), + c, + &s->exec_context, + &exec_params, + s->exec_runtime, + &s->dynamic_creds, + &pid); + if (r < 0) + goto fail; + + r = unit_watch_pid(UNIT(s), pid, true); + if (r < 0) + goto fail; + + *_pid = pid; + + return 0; + +fail: + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + + return r; +} + +static void swap_enter_dead(Swap *s, SwapResult f) { + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + unit_log_result(UNIT(s), s->result == SWAP_SUCCESS, swap_result_to_string(s->result)); + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop); + swap_set_state(s, s->result != SWAP_SUCCESS ? 
SWAP_FAILED : SWAP_DEAD); + + s->exec_runtime = exec_runtime_unref(s->exec_runtime, true); + + unit_destroy_runtime_data(UNIT(s), &s->exec_context); + + unit_unref_uid_gid(UNIT(s), true); + + dynamic_creds_destroy(&s->dynamic_creds); +} + +static void swap_enter_active(Swap *s, SwapResult f) { + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + swap_set_state(s, SWAP_ACTIVE); +} + +static void swap_enter_dead_or_active(Swap *s, SwapResult f) { + assert(s); + + if (s->from_proc_swaps) { + Swap *other; + + swap_enter_active(s, f); + + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job) + swap_enter_dead_or_active(other, f); + } else + swap_enter_dead(s, f); +} + +static int state_to_kill_operation(Swap *s, SwapState state) { + if (state == SWAP_DEACTIVATING_SIGTERM) { + if (unit_has_job_type(UNIT(s), JOB_RESTART)) + return KILL_RESTART; + else + return KILL_TERMINATE; + } + + return KILL_KILL; +} + +static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) { + int r; + + assert(s); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + r = unit_kill_context(UNIT(s), + &s->kill_context, + state_to_kill_operation(s, state), + -1, + s->control_pid, + false); + if (r < 0) + goto fail; + + if (r > 0) { + r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec)); + if (r < 0) + goto fail; + + swap_set_state(s, state); + } else if (state == SWAP_DEACTIVATING_SIGTERM && s->kill_context.send_sigkill) + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS); + else + swap_enter_dead_or_active(s, SWAP_SUCCESS); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_enter_activating(Swap *s) { + _cleanup_free_ char *opts = NULL; + int r; + + assert(s); + + unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start); + + s->control_command_id = SWAP_EXEC_ACTIVATE; + s->control_command = 
s->exec_command + SWAP_EXEC_ACTIVATE; + + if (s->from_fragment) { + int priority = 0; + + r = fstab_find_pri(s->parameters_fragment.options, &priority); + if (r < 0) + log_unit_warning_errno(UNIT(s), r, "Failed to parse swap priority \"%s\", ignoring: %m", s->parameters_fragment.options); + else if (r > 0 && s->parameters_fragment.priority_set) + log_unit_warning(UNIT(s), "Duplicate swap priority configuration by Priority= and Options= fields."); + + if (r <= 0 && s->parameters_fragment.priority_set) { + if (s->parameters_fragment.options) + r = asprintf(&opts, "%s,pri=%i", s->parameters_fragment.options, s->parameters_fragment.priority); + else + r = asprintf(&opts, "pri=%i", s->parameters_fragment.priority); + if (r < 0) { + r = -ENOMEM; + goto fail; + } + } + } + + r = exec_command_set(s->control_command, "/sbin/swapon", NULL); + if (r < 0) + goto fail; + + if (s->parameters_fragment.options || opts) { + r = exec_command_append(s->control_command, "-o", + opts ?: s->parameters_fragment.options, NULL); + if (r < 0) + goto fail; + } + + r = exec_command_append(s->control_command, s->what, NULL); + if (r < 0) + goto fail; + + swap_unwatch_control_pid(s); + + r = swap_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + swap_set_state(s, SWAP_ACTIVATING); + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'swapon' task: %m"); + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_enter_deactivating(Swap *s) { + int r; + + assert(s); + + s->control_command_id = SWAP_EXEC_DEACTIVATE; + s->control_command = s->exec_command + SWAP_EXEC_DEACTIVATE; + + r = exec_command_set(s->control_command, + "/sbin/swapoff", + s->what, + NULL); + if (r < 0) + goto fail; + + swap_unwatch_control_pid(s); + + r = swap_spawn(s, s->control_command, &s->control_pid); + if (r < 0) + goto fail; + + swap_set_state(s, SWAP_DEACTIVATING); + + return; + +fail: + log_unit_warning_errno(UNIT(s), r, "Failed to run 'swapoff' task: 
%m"); + swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES); +} + +static void swap_cycle_clear(Swap *s) { + assert(s); + + s->result = SWAP_SUCCESS; + exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); + UNIT(s)->reset_accounting = true; +} + +static int swap_start(Unit *u) { + Swap *s = SWAP(u), *other; + int r; + + assert(s); + + /* We cannot fulfill this request right now, try again later please! */ + if (IN_SET(s->state, + SWAP_DEACTIVATING, + SWAP_DEACTIVATING_SIGTERM, + SWAP_DEACTIVATING_SIGKILL, + SWAP_CLEANING)) + return -EAGAIN; + + /* Already on it! */ + if (s->state == SWAP_ACTIVATING) + return 0; + + assert(IN_SET(s->state, SWAP_DEAD, SWAP_FAILED)); + + if (detect_container() > 0) + return -EPERM; + + /* If there's a job for another swap unit for the same node + * running, then let's not dispatch this one for now, and wait + * until that other job has finished. */ + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (UNIT(other)->job && UNIT(other)->job->state == JOB_RUNNING) + return -EAGAIN; + + r = unit_test_start_limit(u); + if (r < 0) { + swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + swap_cycle_clear(s); + swap_enter_activating(s); + return 1; +} + +static int swap_stop(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + switch (s->state) { + + case SWAP_DEACTIVATING: + case SWAP_DEACTIVATING_SIGTERM: + case SWAP_DEACTIVATING_SIGKILL: + /* Already on it */ + return 0; + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + /* There's a control process pending, directly enter kill mode */ + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_SUCCESS); + return 0; + + case SWAP_ACTIVE: + if (detect_container() > 0) + return -EPERM; + + swap_enter_deactivating(s); + return 1; + + case SWAP_CLEANING: + /* If we are currently cleaning, then abort it, brutally. 
*/ + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS); + return 0; + + default: + assert_not_reached("Unexpected state."); + } +} + +static int swap_serialize(Unit *u, FILE *f, FDSet *fds) { + Swap *s = SWAP(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", swap_state_to_string(s->state)); + (void) serialize_item(f, "result", swap_result_to_string(s->result)); + + if (s->control_pid > 0) + (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid); + + if (s->control_command_id >= 0) + (void) serialize_item(f, "control-command", swap_exec_command_to_string(s->control_command_id)); + + return 0; +} + +static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Swap *s = SWAP(u); + + assert(s); + assert(fds); + + if (streq(key, "state")) { + SwapState state; + + state = swap_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + s->deserialized_state = state; + } else if (streq(key, "result")) { + SwapResult f; + + f = swap_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != SWAP_SUCCESS) + s->result = f; + } else if (streq(key, "control-pid")) { + pid_t pid; + + if (parse_pid(value, &pid) < 0) + log_unit_debug(u, "Failed to parse control-pid value: %s", value); + else + s->control_pid = pid; + + } else if (streq(key, "control-command")) { + SwapExecCommand id; + + id = swap_exec_command_from_string(value); + if (id < 0) + log_unit_debug(u, "Failed to parse exec-command value: %s", value); + else { + s->control_command_id = id; + s->control_command = s->exec_command + id; + } + } else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Swap *s = SWAP(u); + SwapResult f; + + assert(s); + assert(pid >= 0); + + if (pid != s->control_pid) + return; + 
+ /* Let's scan /proc/swaps before we process SIGCHLD. For the reasoning see the similar code in + * mount.c */ + (void) swap_process_proc_swaps(u->manager); + + s->control_pid = 0; + + if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) + f = SWAP_SUCCESS; + else if (code == CLD_EXITED) + f = SWAP_FAILURE_EXIT_CODE; + else if (code == CLD_KILLED) + f = SWAP_FAILURE_SIGNAL; + else if (code == CLD_DUMPED) + f = SWAP_FAILURE_CORE_DUMP; + else + assert_not_reached("Unknown code"); + + if (s->result == SWAP_SUCCESS) + s->result = f; + + if (s->control_command) { + exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status); + + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + } + + unit_log_process_exit( + u, + "Swap process", + swap_exec_command_to_string(s->control_command_id), + f == SWAP_SUCCESS, + code, status); + + switch (s->state) { + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + + if (f == SWAP_SUCCESS || s->from_proc_swaps) + swap_enter_active(s, f); + else + swap_enter_dead(s, f); + break; + + case SWAP_DEACTIVATING: + case SWAP_DEACTIVATING_SIGKILL: + case SWAP_DEACTIVATING_SIGTERM: + + swap_enter_dead_or_active(s, f); + break; + + case SWAP_CLEANING: + if (s->clean_result == SWAP_SUCCESS) + s->clean_result = f; + + swap_enter_dead(s, SWAP_SUCCESS); + break; + + default: + assert_not_reached("Uh, control process died at wrong time."); + } + + /* Notify clients about changed exit status */ + unit_add_to_dbus_queue(u); +} + +static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { + Swap *s = SWAP(userdata); + + assert(s); + assert(s->timer_event_source == source); + + switch (s->state) { + + case SWAP_ACTIVATING: + case SWAP_ACTIVATING_DONE: + log_unit_warning(UNIT(s), "Activation timed out. 
Stopping."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_DEACTIVATING: + log_unit_warning(UNIT(s), "Deactivation timed out. Stopping."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_DEACTIVATING_SIGTERM: + if (s->kill_context.send_sigkill) { + log_unit_warning(UNIT(s), "Swap process timed out. Killing."); + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_FAILURE_TIMEOUT); + } else { + log_unit_warning(UNIT(s), "Swap process timed out. Skipping SIGKILL. Ignoring."); + swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT); + } + break; + + case SWAP_DEACTIVATING_SIGKILL: + log_unit_warning(UNIT(s), "Swap process still around after SIGKILL. Ignoring."); + swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT); + break; + + case SWAP_CLEANING: + log_unit_warning(UNIT(s), "Cleaning timed out. killing."); + + if (s->clean_result == SWAP_SUCCESS) + s->clean_result = SWAP_FAILURE_TIMEOUT; + + swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, 0); + break; + + default: + assert_not_reached("Timeout at wrong time."); + } + + return 0; +} + +static int swap_load_proc_swaps(Manager *m, bool set_flags) { + unsigned i; + + assert(m); + + rewind(m->proc_swaps); + + (void) fscanf(m->proc_swaps, "%*s %*s %*s %*s %*s\n"); + + for (i = 1;; i++) { + _cleanup_free_ char *dev = NULL, *d = NULL; + int prio = 0, k; + + k = fscanf(m->proc_swaps, + "%ms " /* device/file */ + "%*s " /* type of swap */ + "%*s " /* swap size */ + "%*s " /* used */ + "%i\n", /* priority */ + &dev, &prio); + if (k != 2) { + if (k == EOF) + break; + + log_warning("Failed to parse /proc/swaps:%u.", i); + continue; + } + + if (cunescape(dev, UNESCAPE_RELAX, &d) < 0) + return log_oom(); + + device_found_node(m, d, DEVICE_FOUND_SWAP, DEVICE_FOUND_SWAP); + + (void) swap_process_new(m, d, prio, set_flags); + } + + return 0; +} + +static int swap_process_proc_swaps(Manager *m) { + Unit *u; + int r; + + assert(m); + + r = 
swap_load_proc_swaps(m, true); + if (r < 0) { + log_error_errno(r, "Failed to reread /proc/swaps: %m"); + + /* Reset flags, just in case, for late calls */ + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) { + Swap *swap = SWAP(u); + + swap->is_active = swap->just_activated = false; + } + + return 0; + } + + manager_dispatch_load_queue(m); + + LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) { + Swap *swap = SWAP(u); + + if (!swap->is_active) { + + swap_unset_proc_swaps(swap); + + switch (swap->state) { + + case SWAP_ACTIVE: + /* This has just been deactivated */ + swap_enter_dead(swap, SWAP_SUCCESS); + break; + + default: + /* Fire again */ + swap_set_state(swap, swap->state); + break; + } + + if (swap->what) + device_found_node(m, swap->what, 0, DEVICE_FOUND_SWAP); + + } else if (swap->just_activated) { + + /* New swap entry */ + + switch (swap->state) { + + case SWAP_DEAD: + case SWAP_FAILED: + (void) unit_acquire_invocation_id(u); + swap_cycle_clear(swap); + swap_enter_active(swap, SWAP_SUCCESS); + break; + + case SWAP_ACTIVATING: + swap_set_state(swap, SWAP_ACTIVATING_DONE); + break; + + default: + /* Nothing really changed, but let's + * issue an notification call + * nonetheless, in case somebody is + * waiting for this. */ + swap_set_state(swap, swap->state); + break; + } + } + + /* Reset the flags for later calls */ + swap->is_active = swap->just_activated = false; + } + + return 1; +} + +static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + + assert(m); + assert(revents & EPOLLPRI); + + return swap_process_proc_swaps(m); +} + +static Unit *swap_following(Unit *u) { + Swap *s = SWAP(u); + Swap *other, *first = NULL; + + assert(s); + + /* If the user configured the swap through /etc/fstab or + * a device unit, follow that. 
*/ + + if (s->from_fragment) + return NULL; + + LIST_FOREACH_OTHERS(same_devnode, other, s) + if (other->from_fragment) + return UNIT(other); + + /* Otherwise, make everybody follow the unit that's named after + * the swap device in the kernel */ + + if (streq_ptr(s->what, s->devnode)) + return NULL; + + LIST_FOREACH_AFTER(same_devnode, other, s) + if (streq_ptr(other->what, other->devnode)) + return UNIT(other); + + LIST_FOREACH_BEFORE(same_devnode, other, s) { + if (streq_ptr(other->what, other->devnode)) + return UNIT(other); + + first = other; + } + + /* Fall back to the first on the list */ + return UNIT(first); +} + +static int swap_following_set(Unit *u, Set **_set) { + Swap *s = SWAP(u), *other; + _cleanup_set_free_ Set *set = NULL; + int r; + + assert(s); + assert(_set); + + if (LIST_JUST_US(same_devnode, s)) { + *_set = NULL; + return 0; + } + + set = set_new(NULL); + if (!set) + return -ENOMEM; + + LIST_FOREACH_OTHERS(same_devnode, other, s) { + r = set_put(set, other); + if (r < 0) + return r; + } + + *_set = TAKE_PTR(set); + return 1; +} + +static void swap_shutdown(Manager *m) { + assert(m); + + m->swap_event_source = sd_event_source_unref(m->swap_event_source); + m->proc_swaps = safe_fclose(m->proc_swaps); + m->swaps_by_devnode = hashmap_free(m->swaps_by_devnode); +} + +static void swap_enumerate(Manager *m) { + int r; + + assert(m); + + if (!m->proc_swaps) { + m->proc_swaps = fopen("/proc/swaps", "re"); + if (!m->proc_swaps) { + if (errno == ENOENT) + log_debug_errno(errno, "Not swap enabled, skipping enumeration."); + else + log_warning_errno(errno, "Failed to open /proc/swaps, ignoring: %m"); + + return; + } + + r = sd_event_add_io(m->event, &m->swap_event_source, fileno(m->proc_swaps), EPOLLPRI, swap_dispatch_io, m); + if (r < 0) { + log_error_errno(r, "Failed to watch /proc/swaps: %m"); + goto fail; + } + + /* Dispatch this before we dispatch SIGCHLD, so that + * we always get the events from /proc/swaps before + * the SIGCHLD of /sbin/swapon. 
*/ + r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) { + log_error_errno(r, "Failed to change /proc/swaps priority: %m"); + goto fail; + } + + (void) sd_event_source_set_description(m->swap_event_source, "swap-proc"); + } + + r = swap_load_proc_swaps(m, false); + if (r < 0) + goto fail; + + return; + +fail: + swap_shutdown(m); +} + +int swap_process_device_new(Manager *m, sd_device *dev) { + _cleanup_free_ char *e = NULL; + const char *dn, *devlink; + Unit *u; + int r; + + assert(m); + assert(dev); + + r = sd_device_get_devname(dev, &dn); + if (r < 0) + return 0; + + r = unit_name_from_path(dn, ".swap", &e); + if (r < 0) + return r; + + u = manager_get_unit(m, e); + if (u) + r = swap_set_devnode(SWAP(u), dn); + + FOREACH_DEVICE_DEVLINK(dev, devlink) { + _cleanup_free_ char *n = NULL; + int q; + + q = unit_name_from_path(devlink, ".swap", &n); + if (q < 0) + return q; + + u = manager_get_unit(m, n); + if (u) { + q = swap_set_devnode(SWAP(u), dn); + if (q < 0) + r = q; + } + } + + return r; +} + +int swap_process_device_remove(Manager *m, sd_device *dev) { + const char *dn; + int r; + Swap *s; + + r = sd_device_get_devname(dev, &dn); + if (r < 0) + return 0; + + while ((s = hashmap_get(m->swaps_by_devnode, dn))) { + int q; + + q = swap_set_devnode(s, NULL); + if (q < 0) + r = q; + } + + return r; +} + +static void swap_reset_failed(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + if (s->state == SWAP_FAILED) + swap_set_state(s, SWAP_DEAD); + + s->result = SWAP_SUCCESS; + s->clean_result = SWAP_SUCCESS; +} + +static int swap_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) { + return unit_kill_common(u, who, signo, -1, SWAP(u)->control_pid, error); +} + +static int swap_get_timeout(Unit *u, usec_t *timeout) { + Swap *s = SWAP(u); + usec_t t; + int r; + + if (!s->timer_event_source) + return 0; + + r = sd_event_source_get_time(s->timer_event_source, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) + 
return 0; + + *timeout = t; + return 1; +} + +static bool swap_supported(void) { + static int supported = -1; + + /* If swap support is not available in the kernel, or we are + * running in a container we don't support swap units, and any + * attempts to starting one should fail immediately. */ + + if (supported < 0) + supported = + access("/proc/swaps", F_OK) >= 0 && + detect_container() <= 0; + + return supported; +} + +static int swap_control_pid(Unit *u) { + Swap *s = SWAP(u); + + assert(s); + + return s->control_pid; +} + +static int swap_clean(Unit *u, ExecCleanMask mask) { + _cleanup_strv_free_ char **l = NULL; + Swap *s = SWAP(u); + int r; + + assert(s); + assert(mask != 0); + + if (s->state != SWAP_DEAD) + return -EBUSY; + + r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l); + if (r < 0) + return r; + + if (strv_isempty(l)) + return -EUNATCH; + + swap_unwatch_control_pid(s); + s->clean_result = SWAP_SUCCESS; + s->control_command = NULL; + s->control_command_id = _SWAP_EXEC_COMMAND_INVALID; + + r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->exec_context.timeout_clean_usec)); + if (r < 0) + goto fail; + + r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid); + if (r < 0) + goto fail; + + swap_set_state(s, SWAP_CLEANING); + + return 0; + +fail: + log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m"); + s->clean_result = SWAP_FAILURE_RESOURCES; + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + return r; +} + +static int swap_can_clean(Unit *u, ExecCleanMask *ret) { + Swap *s = SWAP(u); + + assert(s); + + return exec_context_get_clean_mask(&s->exec_context, ret); +} + +static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = { + [SWAP_EXEC_ACTIVATE] = "ExecActivate", + [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate", +}; + +DEFINE_STRING_TABLE_LOOKUP(swap_exec_command, SwapExecCommand); + +static const char* const swap_result_table[_SWAP_RESULT_MAX] = { + [SWAP_SUCCESS] 
= "success", + [SWAP_FAILURE_RESOURCES] = "resources", + [SWAP_FAILURE_TIMEOUT] = "timeout", + [SWAP_FAILURE_EXIT_CODE] = "exit-code", + [SWAP_FAILURE_SIGNAL] = "signal", + [SWAP_FAILURE_CORE_DUMP] = "core-dump", + [SWAP_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(swap_result, SwapResult); + +const UnitVTable swap_vtable = { + .object_size = sizeof(Swap), + .exec_context_offset = offsetof(Swap, exec_context), + .cgroup_context_offset = offsetof(Swap, cgroup_context), + .kill_context_offset = offsetof(Swap, kill_context), + .exec_runtime_offset = offsetof(Swap, exec_runtime), + .dynamic_creds_offset = offsetof(Swap, dynamic_creds), + + .sections = + "Unit\0" + "Swap\0" + "Install\0", + .private_section = "Swap", + + .can_fail = true, + + .init = swap_init, + .load = swap_load, + .done = swap_done, + + .coldplug = swap_coldplug, + + .dump = swap_dump, + + .start = swap_start, + .stop = swap_stop, + + .kill = swap_kill, + .clean = swap_clean, + .can_clean = swap_can_clean, + + .get_timeout = swap_get_timeout, + + .serialize = swap_serialize, + .deserialize_item = swap_deserialize_item, + + .active_state = swap_active_state, + .sub_state_to_string = swap_sub_state_to_string, + + .will_restart = unit_will_restart_default, + + .may_gc = swap_may_gc, + .is_extrinsic = swap_is_extrinsic, + + .sigchld_event = swap_sigchld_event, + + .reset_failed = swap_reset_failed, + + .control_pid = swap_control_pid, + + .bus_set_property = bus_swap_set_property, + .bus_commit_properties = bus_swap_commit_properties, + + .following = swap_following, + .following_set = swap_following_set, + + .enumerate = swap_enumerate, + .shutdown = swap_shutdown, + .supported = swap_supported, + + .status_message_formats = { + .starting_stopping = { + [0] = "Activating swap %s...", + [1] = "Deactivating swap %s...", + }, + .finished_start_job = { + [JOB_DONE] = "Activated swap %s.", + [JOB_FAILED] = "Failed to activate swap %s.", + [JOB_TIMEOUT] = "Timed out 
activating swap %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Deactivated swap %s.", + [JOB_FAILED] = "Failed deactivating swap %s.", + [JOB_TIMEOUT] = "Timed out deactivating swap %s.", + }, + }, +}; diff --git a/src/core/swap.h b/src/core/swap.h new file mode 100644 index 0000000..6ce9bfd --- /dev/null +++ b/src/core/swap.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 Maarten Lankhorst +***/ + +#include "sd-device.h" +#include "unit.h" + +typedef struct Swap Swap; + +typedef enum SwapExecCommand { + SWAP_EXEC_ACTIVATE, + SWAP_EXEC_DEACTIVATE, + _SWAP_EXEC_COMMAND_MAX, + _SWAP_EXEC_COMMAND_INVALID = -1 +} SwapExecCommand; + +typedef enum SwapResult { + SWAP_SUCCESS, + SWAP_FAILURE_RESOURCES, + SWAP_FAILURE_TIMEOUT, + SWAP_FAILURE_EXIT_CODE, + SWAP_FAILURE_SIGNAL, + SWAP_FAILURE_CORE_DUMP, + SWAP_FAILURE_START_LIMIT_HIT, + _SWAP_RESULT_MAX, + _SWAP_RESULT_INVALID = -1 +} SwapResult; + +typedef struct SwapParameters { + char *what; + char *options; + int priority; + bool priority_set; +} SwapParameters; + +struct Swap { + Unit meta; + + char *what; + + /* If the device has already shown up, this is the device + * node, which might be different from what, due to + * symlinks */ + char *devnode; + + SwapParameters parameters_proc_swaps; + SwapParameters parameters_fragment; + + bool from_proc_swaps:1; + bool from_fragment:1; + + /* Used while looking for swaps that vanished or got added + * from/to /proc/swaps */ + bool is_active:1; + bool just_activated:1; + + SwapResult result; + SwapResult clean_result; + + usec_t timeout_usec; + + ExecCommand exec_command[_SWAP_EXEC_COMMAND_MAX]; + ExecContext exec_context; + KillContext kill_context; + CGroupContext cgroup_context; + + ExecRuntime *exec_runtime; + DynamicCreds dynamic_creds; + + SwapState state, deserialized_state; + + ExecCommand* control_command; + SwapExecCommand control_command_id; + pid_t control_pid; + + sd_event_source 
*timer_event_source; + + /* In order to be able to distinguish dependencies on + different device nodes we might end up creating multiple + devices for the same swap. We chain them up here. */ + + LIST_FIELDS(struct Swap, same_devnode); +}; + +extern const UnitVTable swap_vtable; + +int swap_process_device_new(Manager *m, sd_device *dev); +int swap_process_device_remove(Manager *m, sd_device *dev); + +const char* swap_exec_command_to_string(SwapExecCommand i) _const_; +SwapExecCommand swap_exec_command_from_string(const char *s) _pure_; + +const char* swap_result_to_string(SwapResult i) _const_; +SwapResult swap_result_from_string(const char *s) _pure_; + +DEFINE_CAST(SWAP, Swap); diff --git a/src/core/system.conf.in b/src/core/system.conf.in new file mode 100644 index 0000000..40bb548 --- /dev/null +++ b/src/core/system.conf.in @@ -0,0 +1,71 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# Entries in this file show the compile time defaults. +# You can change settings by editing this file. +# Defaults can be restored by simply deleting this file. +# +# See systemd-system.conf(5) for details. 
+ +[Manager] +#LogLevel=info +#LogTarget=journal-or-kmsg +#LogColor=yes +#LogLocation=no +#LogTime=no +#DumpCore=yes +#ShowStatus=yes +#CrashChangeVT=no +#CrashShell=no +#CrashReboot=no +#CtrlAltDelBurstAction=reboot-force +#CPUAffinity=1 2 +#NUMAPolicy=default +#NUMAMask= +#RuntimeWatchdogSec=0 +#RebootWatchdogSec=10min +#ShutdownWatchdogSec=10min +#KExecWatchdogSec=0 +#WatchdogDevice= +#CapabilityBoundingSet= +#NoNewPrivileges=no +#SystemCallArchitectures= +#TimerSlackNSec= +#StatusUnitFormat=@STATUS_UNIT_FORMAT_DEFAULT@ +#DefaultTimerAccuracySec=1min +#DefaultStandardOutput=journal +#DefaultStandardError=inherit +#DefaultTimeoutStartSec=90s +#DefaultTimeoutStopSec=90s +#DefaultTimeoutAbortSec= +#DefaultRestartSec=100ms +#DefaultStartLimitIntervalSec=10s +#DefaultStartLimitBurst=5 +#DefaultEnvironment= +#DefaultCPUAccounting=no +#DefaultIOAccounting=no +#DefaultIPAccounting=no +#DefaultBlockIOAccounting=no +#DefaultMemoryAccounting=@MEMORY_ACCOUNTING_DEFAULT@ +#DefaultTasksAccounting=yes +#DefaultTasksMax=15% +#DefaultLimitCPU= +#DefaultLimitFSIZE= +#DefaultLimitDATA= +#DefaultLimitSTACK= +#DefaultLimitCORE= +#DefaultLimitRSS= +#DefaultLimitNOFILE=1024:@HIGH_RLIMIT_NOFILE@ +#DefaultLimitAS= +#DefaultLimitNPROC= +#DefaultLimitMEMLOCK= +#DefaultLimitLOCKS= +#DefaultLimitSIGPENDING= +#DefaultLimitMSGQUEUE= +#DefaultLimitNICE= +#DefaultLimitRTPRIO= +#DefaultLimitRTTIME= diff --git a/src/core/systemd.pc.in b/src/core/systemd.pc.in new file mode 100644 index 0000000..f2c0455 --- /dev/null +++ b/src/core/systemd.pc.in @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +# Names with prefixes are preferred, and the run-together names should be +# considered deprecated (though there is no plan to remove them). New names +# shall have underscores. + +prefix=/usr +root_prefix=@rootprefix_noslash@ +rootprefix=${root_prefix} +sysconf_dir=@sysconfdir@ +sysconfdir=${sysconf_dir} + +systemd_util_dir=${root_prefix}/lib/systemd +systemdutildir=${systemd_util_dir} + +systemd_system_unit_dir=${rootprefix}/lib/systemd/system +systemdsystemunitdir=${systemd_system_unit_dir} + +systemd_system_preset_dir=${rootprefix}/lib/systemd/system-preset +systemdsystempresetdir=${systemd_system_preset_dir} + +systemd_user_unit_dir=/usr/lib/systemd/user +systemduserunitdir=${systemd_user_unit_dir} + +systemd_user_preset_dir=/usr/lib/systemd/user-preset +systemduserpresetdir=${systemd_user_preset_dir} + +systemd_system_conf_dir=${sysconfdir}/systemd/system +systemdsystemconfdir=${systemd_system_conf_dir} + +systemd_user_conf_dir=${sysconfdir}/systemd/user +systemduserconfdir=${systemd_user_conf_dir} + +systemd_system_unit_path=${systemd_system_conf_dir}:/etc/systemd/system:/run/systemd/system:/usr/local/lib/systemd/system:${systemd_system_unit_dir}:/usr/lib/systemd/system:/lib/systemd/system +systemdsystemunitpath=${systemd_system_unit_path} + +systemd_user_unit_path=${systemd_user_conf_dir}:/etc/systemd/user:/run/systemd/user:/usr/local/lib/systemd/user:/usr/local/share/systemd/user:${systemd_user_unit_dir}:/usr/lib/systemd/user:/usr/share/systemd/user +systemduserunitpath=${systemd_user_unit_path} + +systemd_system_generator_dir=${root_prefix}/lib/systemd/system-generators +systemdsystemgeneratordir=${systemd_system_generator_dir} + +systemd_user_generator_dir=/usr/lib/systemd/user-generators +systemdusergeneratordir=${systemd_user_generator_dir} + +systemd_system_generator_path=/run/systemd/system-generators:/etc/systemd/system-generators:/usr/local/lib/systemd/system-generators:${systemd_system_generator_dir} 
+systemdsystemgeneratorpath=${systemd_system_generator_path} + +systemd_user_generator_path=/run/systemd/user-generators:/etc/systemd/user-generators:/usr/local/lib/systemd/user-generators:${systemd_user_generator_dir} +systemdusergeneratorpath=${systemd_user_generator_path} + +systemd_sleep_dir=${root_prefix}/lib/systemd/system-sleep +systemdsleepdir=${systemd_sleep_dir} + +systemd_shutdown_dir=${root_prefix}/lib/systemd/system-shutdown +systemdshutdowndir=${systemd_shutdown_dir} + +tmpfiles_dir=/usr/lib/tmpfiles.d +tmpfilesdir=${tmpfiles_dir} + +sysusers_dir=${rootprefix}/lib/sysusers.d +sysusersdir=${sysusers_dir} + +sysctl_dir=${rootprefix}/lib/sysctl.d +sysctldir=${sysctl_dir} + +binfmt_dir=${rootprefix}/lib/binfmt.d +binfmtdir=${binfmt_dir} + +modules_load_dir=${rootprefix}/lib/modules-load.d +modulesloaddir=${modules_load_dir} + +catalog_dir=/usr/lib/systemd/catalog +catalogdir=${catalog_dir} + +system_uid_max=@SYSTEM_UID_MAX@ +systemuidmax=${system_uid_max} +system_gid_max=@SYSTEM_GID_MAX@ +systemgidmax=${system_gid_max} + +dynamic_uid_min=@dynamicuidmin@ +dynamicuidmin=${dynamic_uid_min} +dynamic_uid_max=@dynamicuidmax@ +dynamicuidmax=${dynamic_uid_max} + +container_uid_base_min=@containeruidbasemin@ +containeruidbasemin=${container_uid_base_min} +container_uid_base_max=@containeruidbasemax@ +containeruidbasemax=${container_uid_base_max} + +Name: systemd +Description: systemd System and Service Manager +URL: @PROJECT_URL@ +Version: @PROJECT_VERSION@ diff --git a/src/core/target.c b/src/core/target.c new file mode 100644 index 0000000..a422056 --- /dev/null +++ b/src/core/target.c @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dbus-target.h" +#include "dbus-unit.h" +#include "log.h" +#include "serialize.h" +#include "special.h" +#include "string-util.h" +#include "target.h" +#include "unit-name.h" +#include "unit.h" + +static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = { + [TARGET_DEAD] = 
UNIT_INACTIVE, + [TARGET_ACTIVE] = UNIT_ACTIVE +}; + +static void target_set_state(Target *t, TargetState state) { + TargetState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if (state != old_state) + log_debug("%s changed %s -> %s", + UNIT(t)->id, + target_state_to_string(old_state), + target_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0); +} + +static int target_add_default_dependencies(Target *t) { + + static const UnitDependency deps[] = { + UNIT_REQUIRES, + UNIT_REQUISITE, + UNIT_WANTS, + UNIT_BINDS_TO, + UNIT_PART_OF + }; + + int r; + unsigned k; + + assert(t); + + if (!UNIT(t)->default_dependencies) + return 0; + + /* Imply ordering for requirement dependencies on target units. Note that when the user created a contradicting + * ordering manually we won't add anything in here to make sure we don't create a loop. */ + + for (k = 0; k < ELEMENTSOF(deps); k++) { + Unit *other; + void *v; + + HASHMAP_FOREACH_KEY(v, other, UNIT(t)->dependencies[deps[k]]) { + r = unit_add_default_target_dependency(other, UNIT(t)); + if (r < 0) + return r; + } + } + + if (unit_has_name(UNIT(t), SPECIAL_SHUTDOWN_TARGET)) + return 0; + + /* Make sure targets are unloaded on shutdown */ + return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int target_load(Unit *u) { + Target *t = TARGET(u); + int r; + + assert(t); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? 
Then let's add in some extras */ + return target_add_default_dependencies(t); +} + +static int target_coldplug(Unit *u) { + Target *t = TARGET(u); + + assert(t); + assert(t->state == TARGET_DEAD); + + if (t->deserialized_state != t->state) + target_set_state(t, t->deserialized_state); + + return 0; +} + +static void target_dump(Unit *u, FILE *f, const char *prefix) { + Target *t = TARGET(u); + + assert(t); + assert(f); + + fprintf(f, + "%sTarget State: %s\n", + prefix, target_state_to_string(t->state)); +} + +static int target_start(Unit *u) { + Target *t = TARGET(u); + int r; + + assert(t); + assert(t->state == TARGET_DEAD); + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + target_set_state(t, TARGET_ACTIVE); + return 1; +} + +static int target_stop(Unit *u) { + Target *t = TARGET(u); + + assert(t); + assert(t->state == TARGET_ACTIVE); + + target_set_state(t, TARGET_DEAD); + return 1; +} + +static int target_serialize(Unit *u, FILE *f, FDSet *fds) { + Target *s = TARGET(u); + + assert(s); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", target_state_to_string(s->state)); + return 0; +} + +static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Target *s = TARGET(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + TargetState state; + + state = target_state_from_string(value); + if (state < 0) + log_debug("Failed to parse state value %s", value); + else + s->deserialized_state = state; + + } else + log_debug("Unknown serialization key '%s'", key); + + return 0; +} + +_pure_ static UnitActiveState target_active_state(Unit *u) { + assert(u); + + return state_translation_table[TARGET(u)->state]; +} + +_pure_ static const char *target_sub_state_to_string(Unit *u) { + assert(u); + + return target_state_to_string(TARGET(u)->state); +} + +const UnitVTable target_vtable = { + .object_size = sizeof(Target), + + .sections = + "Unit\0" + "Target\0" + 
"Install\0", + + .can_fail = true, + + .load = target_load, + .coldplug = target_coldplug, + + .dump = target_dump, + + .start = target_start, + .stop = target_stop, + + .serialize = target_serialize, + .deserialize_item = target_deserialize_item, + + .active_state = target_active_state, + .sub_state_to_string = target_sub_state_to_string, + + .status_message_formats = { + .finished_start_job = { + [JOB_DONE] = "Reached target %s.", + }, + .finished_stop_job = { + [JOB_DONE] = "Stopped target %s.", + }, + }, +}; diff --git a/src/core/target.h b/src/core/target.h new file mode 100644 index 0000000..bb909d6 --- /dev/null +++ b/src/core/target.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +typedef struct Target Target; + +struct Target { + Unit meta; + + TargetState state, deserialized_state; +}; + +extern const UnitVTable target_vtable; + +DEFINE_CAST(TARGET, Target); diff --git a/src/core/timer.c b/src/core/timer.c new file mode 100644 index 0000000..651f18b --- /dev/null +++ b/src/core/timer.c @@ -0,0 +1,963 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <errno.h> + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "dbus-timer.h" +#include "dbus-unit.h" +#include "fs-util.h" +#include "parse-util.h" +#include "random-util.h" +#include "serialize.h" +#include "special.h" +#include "string-table.h" +#include "string-util.h" +#include "timer.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" +#include "virt.h" + +static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = { + [TIMER_DEAD] = UNIT_INACTIVE, + [TIMER_WAITING] = UNIT_ACTIVE, + [TIMER_RUNNING] = UNIT_ACTIVE, + [TIMER_ELAPSED] = UNIT_ACTIVE, + [TIMER_FAILED] = UNIT_FAILED +}; + +static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata); + +static void timer_init(Unit *u) { + Timer 
*t = TIMER(u); + + assert(u); + assert(u->load_state == UNIT_STUB); + + t->next_elapse_monotonic_or_boottime = USEC_INFINITY; + t->next_elapse_realtime = USEC_INFINITY; + t->accuracy_usec = u->manager->default_timer_accuracy_usec; + t->remain_after_elapse = true; +} + +void timer_free_values(Timer *t) { + TimerValue *v; + + assert(t); + + while ((v = t->values)) { + LIST_REMOVE(value, t->values, v); + calendar_spec_free(v->calendar_spec); + free(v); + } +} + +static void timer_done(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + + timer_free_values(t); + + t->monotonic_event_source = sd_event_source_unref(t->monotonic_event_source); + t->realtime_event_source = sd_event_source_unref(t->realtime_event_source); + + free(t->stamp_path); +} + +static int timer_verify(Timer *t) { + assert(t); + assert(UNIT(t)->load_state == UNIT_LOADED); + + if (!t->values && !t->on_clock_change && !t->on_timezone_change) { + log_unit_error(UNIT(t), "Timer unit lacks value setting. Refusing."); + return -ENOEXEC; + } + + return 0; +} + +static int timer_add_default_dependencies(Timer *t) { + int r; + TimerValue *v; + + assert(t); + + if (!UNIT(t)->default_dependencies) + return 0; + + r = unit_add_dependency_by_name(UNIT(t), UNIT_BEFORE, SPECIAL_TIMERS_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { + r = unit_add_two_dependencies_by_name(UNIT(t), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + + LIST_FOREACH(value, v, t->values) { + if (v->base == TIMER_CALENDAR) { + r = unit_add_dependency_by_name(UNIT(t), UNIT_AFTER, SPECIAL_TIME_SYNC_TARGET, true, UNIT_DEPENDENCY_DEFAULT); + if (r < 0) + return r; + break; + } + } + } + + return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int timer_add_trigger_dependencies(Timer *t) { + Unit *x; + int r; + + assert(t); + + if 
(!hashmap_isempty(UNIT(t)->dependencies[UNIT_TRIGGERS])) + return 0; + + r = unit_load_related_unit(UNIT(t), ".service", &x); + if (r < 0) + return r; + + return unit_add_two_dependencies(UNIT(t), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT); +} + +static int timer_setup_persistent(Timer *t) { + int r; + + assert(t); + + if (!t->persistent) + return 0; + + if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { + + r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + t->stamp_path = strjoin("/var/lib/systemd/timers/stamp-", UNIT(t)->id); + } else { + const char *e; + + e = getenv("XDG_DATA_HOME"); + if (e) + t->stamp_path = strjoin(e, "/systemd/timers/stamp-", UNIT(t)->id); + else { + + _cleanup_free_ char *h = NULL; + + r = get_home_dir(&h); + if (r < 0) + return log_unit_error_errno(UNIT(t), r, "Failed to determine home directory: %m"); + + t->stamp_path = strjoin(h, "/.local/share/systemd/timers/stamp-", UNIT(t)->id); + } + } + + if (!t->stamp_path) + return log_oom(); + + return 0; +} + +static uint64_t timer_get_fixed_delay_hash(Timer *t) { + static const uint8_t hash_key[] = { + 0x51, 0x0a, 0xdb, 0x76, 0x29, 0x51, 0x42, 0xc2, + 0x80, 0x35, 0xea, 0xe6, 0x8e, 0x3a, 0x37, 0xbd + }; + + struct siphash state; + sd_id128_t machine_id; + uid_t uid; + int r; + + assert(t); + + uid = getuid(); + r = sd_id128_get_machine(&machine_id); + if (r < 0) { + log_unit_debug_errno(UNIT(t), r, + "Failed to get machine ID for the fixed delay calculation, proceeding with 0: %m"); + machine_id = SD_ID128_NULL; + } + + siphash24_init(&state, hash_key); + siphash24_compress(&machine_id, sizeof(sd_id128_t), &state); + siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &state); + siphash24_compress(&uid, sizeof(uid_t), &state); + siphash24_compress_string(UNIT(t)->id, &state); + + return siphash24_finalize(&state); +} + +static int timer_load(Unit *u) { + Timer *t = TIMER(u); + int r; + + assert(u); + 
assert(u->load_state == UNIT_STUB); + + r = unit_load_fragment_and_dropin(u, true); + if (r < 0) + return r; + + if (u->load_state != UNIT_LOADED) + return 0; + + /* This is a new unit? Then let's add in some extras */ + r = timer_add_trigger_dependencies(t); + if (r < 0) + return r; + + r = timer_setup_persistent(t); + if (r < 0) + return r; + + r = timer_add_default_dependencies(t); + if (r < 0) + return r; + + return timer_verify(t); +} + +static void timer_dump(Unit *u, FILE *f, const char *prefix) { + char buf[FORMAT_TIMESPAN_MAX]; + Timer *t = TIMER(u); + Unit *trigger; + TimerValue *v; + + trigger = UNIT_TRIGGER(u); + + fprintf(f, + "%sTimer State: %s\n" + "%sResult: %s\n" + "%sUnit: %s\n" + "%sPersistent: %s\n" + "%sWakeSystem: %s\n" + "%sAccuracy: %s\n" + "%sRemainAfterElapse: %s\n" + "%sFixedRandomDelay: %s\n" + "%sOnClockChange: %s\n" + "%sOnTimeZoneChange: %s\n", + prefix, timer_state_to_string(t->state), + prefix, timer_result_to_string(t->result), + prefix, trigger ? trigger->id : "n/a", + prefix, yes_no(t->persistent), + prefix, yes_no(t->wake_system), + prefix, format_timespan(buf, sizeof(buf), t->accuracy_usec, 1), + prefix, yes_no(t->remain_after_elapse), + prefix, yes_no(t->fixed_random_delay), + prefix, yes_no(t->on_clock_change), + prefix, yes_no(t->on_timezone_change)); + + LIST_FOREACH(value, v, t->values) { + + if (v->base == TIMER_CALENDAR) { + _cleanup_free_ char *p = NULL; + + (void) calendar_spec_to_string(v->calendar_spec, &p); + + fprintf(f, + "%s%s: %s\n", + prefix, + timer_base_to_string(v->base), + strna(p)); + } else { + char timespan1[FORMAT_TIMESPAN_MAX]; + + fprintf(f, + "%s%s: %s\n", + prefix, + timer_base_to_string(v->base), + format_timespan(timespan1, sizeof(timespan1), v->value, 0)); + } + } +} + +static void timer_set_state(Timer *t, TimerState state) { + TimerState old_state; + assert(t); + + if (t->state != state) + bus_unit_send_pending_change_signal(UNIT(t), false); + + old_state = t->state; + t->state = state; + + if 
(state != TIMER_WAITING) { + t->monotonic_event_source = sd_event_source_unref(t->monotonic_event_source); + t->realtime_event_source = sd_event_source_unref(t->realtime_event_source); + t->next_elapse_monotonic_or_boottime = USEC_INFINITY; + t->next_elapse_realtime = USEC_INFINITY; + } + + if (state != old_state) + log_unit_debug(UNIT(t), "Changed %s -> %s", timer_state_to_string(old_state), timer_state_to_string(state)); + + unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0); +} + +static void timer_enter_waiting(Timer *t, bool time_change); + +static int timer_coldplug(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + assert(t->state == TIMER_DEAD); + + if (t->deserialized_state == t->state) + return 0; + + if (t->deserialized_state == TIMER_WAITING) + timer_enter_waiting(t, false); + else + timer_set_state(t, t->deserialized_state); + + return 0; +} + +static void timer_enter_dead(Timer *t, TimerResult f) { + assert(t); + + if (t->result == TIMER_SUCCESS) + t->result = f; + + unit_log_result(UNIT(t), t->result == TIMER_SUCCESS, timer_result_to_string(t->result)); + timer_set_state(t, t->result != TIMER_SUCCESS ? TIMER_FAILED : TIMER_DEAD); +} + +static void timer_enter_elapsed(Timer *t, bool leave_around) { + assert(t); + + /* If a unit is marked with RemainAfterElapse=yes we leave it + * around even after it elapsed once, so that starting it + * later again does not necessarily mean immediate + * retriggering. We unconditionally leave units with + * TIMER_UNIT_ACTIVE or TIMER_UNIT_INACTIVE triggers around, + * since they might be restarted automatically at any time + * later on. 
*/ + + if (t->remain_after_elapse || leave_around) + timer_set_state(t, TIMER_ELAPSED); + else + timer_enter_dead(t, TIMER_SUCCESS); +} + +static void add_random(Timer *t, usec_t *v) { + char s[FORMAT_TIMESPAN_MAX]; + usec_t add; + + assert(t); + assert(v); + + if (t->random_usec == 0) + return; + if (*v == USEC_INFINITY) + return; + + add = (t->fixed_random_delay ? timer_get_fixed_delay_hash(t) : random_u64()) % t->random_usec; + + if (*v + add < *v) /* overflow */ + *v = (usec_t) -2; /* Highest possible value, that is not USEC_INFINITY */ + else + *v += add; + + log_unit_debug(UNIT(t), "Adding %s random time.", format_timespan(s, sizeof(s), add, 0)); +} + +static void timer_enter_waiting(Timer *t, bool time_change) { + bool found_monotonic = false, found_realtime = false; + bool leave_around = false; + triple_timestamp ts; + TimerValue *v; + Unit *trigger; + int r; + + assert(t); + + trigger = UNIT_TRIGGER(UNIT(t)); + if (!trigger) { + log_unit_error(UNIT(t), "Unit to trigger vanished."); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); + return; + } + + triple_timestamp_get(&ts); + t->next_elapse_monotonic_or_boottime = t->next_elapse_realtime = 0; + + LIST_FOREACH(value, v, t->values) { + if (v->disabled) + continue; + + if (v->base == TIMER_CALENDAR) { + usec_t b, rebased; + + /* If we know the last time this was + * triggered, schedule the job based relative + * to that. If we don't, just start from + * the activation time. */ + + if (t->last_trigger.realtime > 0) + b = t->last_trigger.realtime; + else { + if (state_translation_table[t->state] == UNIT_ACTIVE) + b = UNIT(t)->inactive_exit_timestamp.realtime; + else + b = ts.realtime; + } + + r = calendar_spec_next_usec(v->calendar_spec, b, &v->next_elapse); + if (r < 0) + continue; + + /* To make the delay due to RandomizedDelaySec= work even at boot, if the scheduled + * time has already passed, set the time when systemd first started as the scheduled + * time. 
Note that we base this on the monotonic timestamp of the boot, not the + * realtime one, since the wallclock might have been off during boot. */ + rebased = map_clock_usec(UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic, + CLOCK_MONOTONIC, CLOCK_REALTIME); + if (v->next_elapse < rebased) + v->next_elapse = rebased; + + if (!found_realtime) + t->next_elapse_realtime = v->next_elapse; + else + t->next_elapse_realtime = MIN(t->next_elapse_realtime, v->next_elapse); + + found_realtime = true; + + } else { + usec_t base; + + switch (v->base) { + + case TIMER_ACTIVE: + if (state_translation_table[t->state] == UNIT_ACTIVE) + base = UNIT(t)->inactive_exit_timestamp.monotonic; + else + base = ts.monotonic; + break; + + case TIMER_BOOT: + if (detect_container() <= 0) { + /* CLOCK_MONOTONIC equals the uptime on Linux */ + base = 0; + break; + } + /* In a container we don't want to include the time the host + * was already up when the container started, so count from + * our own startup. 
*/ + _fallthrough_; + case TIMER_STARTUP: + base = UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic; + break; + + case TIMER_UNIT_ACTIVE: + leave_around = true; + base = MAX(trigger->inactive_exit_timestamp.monotonic, t->last_trigger.monotonic); + if (base <= 0) + continue; + break; + + case TIMER_UNIT_INACTIVE: + leave_around = true; + base = MAX(trigger->inactive_enter_timestamp.monotonic, t->last_trigger.monotonic); + if (base <= 0) + continue; + break; + + default: + assert_not_reached("Unknown timer base"); + } + + v->next_elapse = usec_add(usec_shift_clock(base, CLOCK_MONOTONIC, TIMER_MONOTONIC_CLOCK(t)), v->value); + + if (dual_timestamp_is_set(&t->last_trigger) && + !time_change && + v->next_elapse < triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)) && + IN_SET(v->base, TIMER_ACTIVE, TIMER_BOOT, TIMER_STARTUP)) { + /* This is a one time trigger, disable it now */ + v->disabled = true; + continue; + } + + if (!found_monotonic) + t->next_elapse_monotonic_or_boottime = v->next_elapse; + else + t->next_elapse_monotonic_or_boottime = MIN(t->next_elapse_monotonic_or_boottime, v->next_elapse); + + found_monotonic = true; + } + } + + if (!found_monotonic && !found_realtime && !t->on_timezone_change && !t->on_clock_change) { + log_unit_debug(UNIT(t), "Timer is elapsed."); + timer_enter_elapsed(t, leave_around); + return; + } + + if (found_monotonic) { + char buf[FORMAT_TIMESPAN_MAX]; + usec_t left; + + add_random(t, &t->next_elapse_monotonic_or_boottime); + + left = usec_sub_unsigned(t->next_elapse_monotonic_or_boottime, triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t))); + log_unit_debug(UNIT(t), "Monotonic timer elapses in %s.", format_timespan(buf, sizeof(buf), left, 0)); + + if (t->monotonic_event_source) { + r = sd_event_source_set_time(t->monotonic_event_source, t->next_elapse_monotonic_or_boottime); + if (r < 0) + goto fail; + + r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_ONESHOT); + if (r < 0) + goto 
fail; + } else { + + r = sd_event_add_time( + UNIT(t)->manager->event, + &t->monotonic_event_source, + t->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC, + t->next_elapse_monotonic_or_boottime, t->accuracy_usec, + timer_dispatch, t); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(t->monotonic_event_source, "timer-monotonic"); + } + + } else if (t->monotonic_event_source) { + + r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_OFF); + if (r < 0) + goto fail; + } + + if (found_realtime) { + char buf[FORMAT_TIMESTAMP_MAX]; + + add_random(t, &t->next_elapse_realtime); + + log_unit_debug(UNIT(t), "Realtime timer elapses at %s.", format_timestamp(buf, sizeof(buf), t->next_elapse_realtime)); + + if (t->realtime_event_source) { + r = sd_event_source_set_time(t->realtime_event_source, t->next_elapse_realtime); + if (r < 0) + goto fail; + + r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_ONESHOT); + if (r < 0) + goto fail; + } else { + r = sd_event_add_time( + UNIT(t)->manager->event, + &t->realtime_event_source, + t->wake_system ? 
CLOCK_REALTIME_ALARM : CLOCK_REALTIME, + t->next_elapse_realtime, t->accuracy_usec, + timer_dispatch, t); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(t->realtime_event_source, "timer-realtime"); + } + + } else if (t->realtime_event_source) { + + r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_OFF); + if (r < 0) + goto fail; + } + + timer_set_state(t, TIMER_WAITING); + return; + +fail: + log_unit_warning_errno(UNIT(t), r, "Failed to enter waiting state: %m"); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); +} + +static void timer_enter_running(Timer *t) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Unit *trigger; + int r; + + assert(t); + + /* Don't start job if we are supposed to go down */ + if (unit_stop_pending(UNIT(t))) + return; + + trigger = UNIT_TRIGGER(UNIT(t)); + if (!trigger) { + log_unit_error(UNIT(t), "Unit to trigger vanished."); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); + return; + } + + r = manager_add_job(UNIT(t)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + goto fail; + + dual_timestamp_get(&t->last_trigger); + + if (t->stamp_path) + touch_file(t->stamp_path, true, t->last_trigger.realtime, UID_INVALID, GID_INVALID, MODE_INVALID); + + timer_set_state(t, TIMER_RUNNING); + return; + +fail: + log_unit_warning(UNIT(t), "Failed to queue unit startup job: %s", bus_error_message(&error, r)); + timer_enter_dead(t, TIMER_FAILURE_RESOURCES); +} + +static int timer_start(Unit *u) { + Timer *t = TIMER(u); + TimerValue *v; + int r; + + assert(t); + assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED)); + + r = unit_test_trigger_loaded(u); + if (r < 0) + return r; + + r = unit_test_start_limit(u); + if (r < 0) { + timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT); + return r; + } + + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + + t->last_trigger = DUAL_TIMESTAMP_NULL; + + /* Reenable all timers that depend on unit activation time */ + 
LIST_FOREACH(value, v, t->values) + if (v->base == TIMER_ACTIVE) + v->disabled = false; + + if (t->stamp_path) { + struct stat st; + + if (stat(t->stamp_path, &st) >= 0) { + usec_t ft; + + /* Load the file timestamp, but only if it is actually in the past. If it is in the future, + * something is wrong with the system clock. */ + + ft = timespec_load(&st.st_mtim); + if (ft < now(CLOCK_REALTIME)) + t->last_trigger.realtime = ft; + else { + char z[FORMAT_TIMESTAMP_MAX]; + + log_unit_warning(u, "Not using persistent file timestamp %s as it is in the future.", + format_timestamp(z, sizeof(z), ft)); + } + + } else if (errno == ENOENT) + /* The timer has never run before, + * make sure a stamp file exists. + */ + (void) touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID); + } + + t->result = TIMER_SUCCESS; + timer_enter_waiting(t, false); + return 1; +} + +static int timer_stop(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED)); + + timer_enter_dead(t, TIMER_SUCCESS); + return 1; +} + +static int timer_serialize(Unit *u, FILE *f, FDSet *fds) { + Timer *t = TIMER(u); + + assert(u); + assert(f); + assert(fds); + + (void) serialize_item(f, "state", timer_state_to_string(t->state)); + (void) serialize_item(f, "result", timer_result_to_string(t->result)); + + if (t->last_trigger.realtime > 0) + (void) serialize_usec(f, "last-trigger-realtime", t->last_trigger.realtime); + + if (t->last_trigger.monotonic > 0) + (void) serialize_usec(f, "last-trigger-monotonic", t->last_trigger.monotonic); + + return 0; +} + +static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { + Timer *t = TIMER(u); + + assert(u); + assert(key); + assert(value); + assert(fds); + + if (streq(key, "state")) { + TimerState state; + + state = timer_state_from_string(value); + if (state < 0) + log_unit_debug(u, "Failed to parse state value: %s", value); + else + 
t->deserialized_state = state; + + } else if (streq(key, "result")) { + TimerResult f; + + f = timer_result_from_string(value); + if (f < 0) + log_unit_debug(u, "Failed to parse result value: %s", value); + else if (f != TIMER_SUCCESS) + t->result = f; + + } else if (streq(key, "last-trigger-realtime")) + (void) deserialize_usec(value, &t->last_trigger.realtime); + else if (streq(key, "last-trigger-monotonic")) + (void) deserialize_usec(value, &t->last_trigger.monotonic); + else + log_unit_debug(u, "Unknown serialization key: %s", key); + + return 0; +} + +_pure_ static UnitActiveState timer_active_state(Unit *u) { + assert(u); + + return state_translation_table[TIMER(u)->state]; +} + +_pure_ static const char *timer_sub_state_to_string(Unit *u) { + assert(u); + + return timer_state_to_string(TIMER(u)->state); +} + +static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) { + Timer *t = TIMER(userdata); + + assert(t); + + if (t->state != TIMER_WAITING) + return 0; + + log_unit_debug(UNIT(t), "Timer elapsed."); + timer_enter_running(t); + return 0; +} + +static void timer_trigger_notify(Unit *u, Unit *other) { + Timer *t = TIMER(u); + TimerValue *v; + + assert(u); + assert(other); + + /* Filter out invocations with bogus state */ + assert(UNIT_IS_LOAD_COMPLETE(other->load_state)); + + /* Reenable all timers that depend on unit state */ + LIST_FOREACH(value, v, t->values) + if (IN_SET(v->base, TIMER_UNIT_ACTIVE, TIMER_UNIT_INACTIVE)) + v->disabled = false; + + switch (t->state) { + + case TIMER_WAITING: + case TIMER_ELAPSED: + + /* Recalculate sleep time */ + timer_enter_waiting(t, false); + break; + + case TIMER_RUNNING: + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) { + log_unit_debug(UNIT(t), "Got notified about unit deactivation."); + timer_enter_waiting(t, false); + } + break; + + case TIMER_DEAD: + case TIMER_FAILED: + break; + + default: + assert_not_reached("Unknown timer state"); + } +} + +static void 
timer_reset_failed(Unit *u) { + Timer *t = TIMER(u); + + assert(t); + + if (t->state == TIMER_FAILED) + timer_set_state(t, TIMER_DEAD); + + t->result = TIMER_SUCCESS; +} + +static void timer_time_change(Unit *u) { + Timer *t = TIMER(u); + usec_t ts; + + assert(u); + + if (t->state != TIMER_WAITING) + return; + + /* If we appear to have triggered in the future, the system clock must + * have been set backwards. So let's rewind our own clock and allow + * the future trigger(s) to happen again :). Exactly the same as when + * you start a timer unit with Persistent=yes. */ + ts = now(CLOCK_REALTIME); + if (t->last_trigger.realtime > ts) + t->last_trigger.realtime = ts; + + if (t->on_clock_change) { + log_unit_debug(u, "Time change, triggering activation."); + timer_enter_running(t); + } else { + log_unit_debug(u, "Time change, recalculating next elapse."); + timer_enter_waiting(t, true); + } +} + +static void timer_timezone_change(Unit *u) { + Timer *t = TIMER(u); + + assert(u); + + if (t->state != TIMER_WAITING) + return; + + if (t->on_timezone_change) { + log_unit_debug(u, "Timezone change, triggering activation."); + timer_enter_running(t); + } else { + log_unit_debug(u, "Timezone change, recalculating next elapse."); + timer_enter_waiting(t, false); + } +} + +static int timer_clean(Unit *u, ExecCleanMask mask) { + Timer *t = TIMER(u); + int r; + + assert(t); + assert(mask != 0); + + if (t->state != TIMER_DEAD) + return -EBUSY; + + if (!IN_SET(mask, EXEC_CLEAN_STATE)) + return -EUNATCH; + + r = timer_setup_persistent(t); + if (r < 0) + return r; + + if (!t->stamp_path) + return -EUNATCH; + + if (unlink(t->stamp_path) && errno != ENOENT) + return log_unit_error_errno(u, errno, "Failed to clean stamp file of timer: %m"); + + return 0; +} + +static int timer_can_clean(Unit *u, ExecCleanMask *ret) { + Timer *t = TIMER(u); + + assert(t); + + *ret = t->persistent ? 
EXEC_CLEAN_STATE : 0; + return 0; +} + /* Names of the TimerBase values as strings, used by the timer_base string lookup defined right below. */ +static const char* const timer_base_table[_TIMER_BASE_MAX] = { + [TIMER_ACTIVE] = "OnActiveSec", + [TIMER_BOOT] = "OnBootSec", + [TIMER_STARTUP] = "OnStartupSec", + [TIMER_UNIT_ACTIVE] = "OnUnitActiveSec", + [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec", + [TIMER_CALENDAR] = "OnCalendar" +}; + +DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase); + /* Names of the TimerResult values as strings, used by the timer_result string lookup defined right below. */ +static const char* const timer_result_table[_TIMER_RESULT_MAX] = { + [TIMER_SUCCESS] = "success", + [TIMER_FAILURE_RESOURCES] = "resources", + [TIMER_FAILURE_START_LIMIT_HIT] = "start-limit-hit", +}; + +DEFINE_STRING_TABLE_LOOKUP(timer_result, TimerResult); + /* Unit vtable that wires the timer callbacks of this file into the generic unit code. */ +const UnitVTable timer_vtable = { + .object_size = sizeof(Timer), + + .sections = + "Unit\0" + "Timer\0" + "Install\0", + .private_section = "Timer", + + .can_transient = true, + .can_fail = true, + .can_trigger = true, + + .init = timer_init, + .done = timer_done, + .load = timer_load, + + .coldplug = timer_coldplug, + + .dump = timer_dump, + + .start = timer_start, + .stop = timer_stop, + + .clean = timer_clean, + .can_clean = timer_can_clean, + + .serialize = timer_serialize, + .deserialize_item = timer_deserialize_item, + + .active_state = timer_active_state, + .sub_state_to_string = timer_sub_state_to_string, + + .trigger_notify = timer_trigger_notify, + + .reset_failed = timer_reset_failed, + .time_change = timer_time_change, + .timezone_change = timer_timezone_change, + + .bus_set_property = bus_timer_set_property, +}; diff --git a/src/core/timer.h b/src/core/timer.h new file mode 100644 index 0000000..14fa317 --- /dev/null +++ b/src/core/timer.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Timer Timer; + +#include "calendarspec.h" +#include "unit.h" + +typedef enum TimerBase { + TIMER_ACTIVE, /* OnActiveSec */ + TIMER_BOOT, /* OnBootSec */ + TIMER_STARTUP, /* OnStartupSec */ + TIMER_UNIT_ACTIVE, /* OnUnitActiveSec */ + TIMER_UNIT_INACTIVE, /* OnUnitInactiveSec */ + TIMER_CALENDAR, /* OnCalendar */ + _TIMER_BASE_MAX, + _TIMER_BASE_INVALID = -1 +} TimerBase; + +typedef struct 
TimerValue { + TimerBase base; + bool disabled; /* cleared again by timer_trigger_notify() for the unit-state based bases */ + + usec_t value; /* only for monotonic events */ + CalendarSpec *calendar_spec; /* only for calendar events */ + usec_t next_elapse; + + LIST_FIELDS(struct TimerValue, value); +} TimerValue; + +typedef enum TimerResult { + TIMER_SUCCESS, + TIMER_FAILURE_RESOURCES, + TIMER_FAILURE_START_LIMIT_HIT, + _TIMER_RESULT_MAX, + _TIMER_RESULT_INVALID = -1 +} TimerResult; + +struct Timer { + Unit meta; + + usec_t accuracy_usec; + usec_t random_usec; + + LIST_HEAD(TimerValue, values); + usec_t next_elapse_realtime; + usec_t next_elapse_monotonic_or_boottime; + dual_timestamp last_trigger; /* serialized as last-trigger-realtime and last-trigger-monotonic */ + + TimerState state, deserialized_state; + + sd_event_source *monotonic_event_source; + sd_event_source *realtime_event_source; + + TimerResult result; /* reset to TIMER_SUCCESS by timer_reset_failed() */ + + bool persistent; /* makes timer_can_clean() report EXEC_CLEAN_STATE */ + bool wake_system; /* selects CLOCK_BOOTTIME_ALARM in TIMER_MONOTONIC_CLOCK() if supported */ + bool remain_after_elapse; + bool on_clock_change; /* trigger on a clock change instead of only recalculating, see timer_time_change() */ + bool on_timezone_change; /* same for timezone changes, see timer_timezone_change() */ + bool fixed_random_delay; + + char *stamp_path; /* file removed by timer_clean() */ +}; + /* Clock used for monotonic timers: CLOCK_BOOTTIME_ALARM if wake_system is set and clock_boottime_supported() is true, CLOCK_MONOTONIC otherwise. */ +#define TIMER_MONOTONIC_CLOCK(t) ((t)->wake_system && clock_boottime_supported() ? 
CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC) + +void timer_free_values(Timer *t); + +extern const UnitVTable timer_vtable; + +const char *timer_base_to_string(TimerBase i) _const_; +TimerBase timer_base_from_string(const char *s) _pure_; + +const char* timer_result_to_string(TimerResult i) _const_; +TimerResult timer_result_from_string(const char *s) _pure_; + +DEFINE_CAST(TIMER, Timer); diff --git a/src/core/transaction.c b/src/core/transaction.c new file mode 100644 index 0000000..ae77bae --- /dev/null +++ b/src/core/transaction.c @@ -0,0 +1,1201 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "dbus-unit.h" +#include "strv.h" +#include "terminal-util.h" +#include "transaction.h" + +static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies); + +static void transaction_delete_job(Transaction *tr, Job *j, bool delete_dependencies) { + assert(tr); + assert(j); + + /* Deletes one job from the transaction: unlinks it (optionally together with the jobs depending on it) and frees it */ + + transaction_unlink_job(tr, j, delete_dependencies); + + job_free(j); +} + +static void transaction_delete_unit(Transaction *tr, Unit *u) { + Job *j; + + /* Deletes all jobs associated with a certain unit from the + * transaction, along with the jobs depending on them via links that matter */ + + while ((j = hashmap_get(tr->jobs, u))) + transaction_delete_job(tr, j, true); +} + /* Deletes all jobs of the transaction without recursing into depending jobs; the transaction is empty afterwards. */ +void transaction_abort(Transaction *tr) { + Job *j; + + assert(tr); + + while ((j = hashmap_first(tr->jobs))) + transaction_delete_job(tr, j, false); + + assert(hashmap_isempty(tr->jobs)); +} + +static void transaction_find_jobs_that_matter_to_anchor(Job *j, unsigned generation) { + JobDependency *l; + + /* A recursive sweep through the graph that marks all jobs + * that matter to the anchor job, i.e. are directly or + * indirectly a dependency of the anchor job via paths that + * are fully marked as mattering. 
*/ + + j->matters_to_anchor = true; + j->generation = generation; + + LIST_FOREACH(subject, l, j->subject_list) { + + /* This link does not matter */ + if (!l->matters) + continue; + + /* This job has already been marked during this sweep */ + if (l->object->generation == generation) + continue; + + transaction_find_jobs_that_matter_to_anchor(l->object, generation); + } +} + +static void transaction_merge_and_delete_job(Transaction *tr, Job *j, Job *other, JobType t) { + JobDependency *l, *last; + + assert(j); + assert(other); + assert(j->unit == other->unit); + assert(!j->installed); + + /* Merges 'other' into 'j' and then deletes 'other'. The merged job gets type t. */ + + j->type = t; + j->state = JOB_WAITING; + j->irreversible = j->irreversible || other->irreversible; + j->matters_to_anchor = j->matters_to_anchor || other->matters_to_anchor; + + /* Patch us in as new owner of the JobDependency objects where other is the subject */ + last = NULL; + LIST_FOREACH(subject, l, other->subject_list) { + assert(l->subject == other); + l->subject = j; + last = l; + } + + /* Merge both lists: the list of other goes in front of the list of j */ + if (last) { + last->subject_next = j->subject_list; + if (j->subject_list) + j->subject_list->subject_prev = last; + j->subject_list = other->subject_list; + } + + /* Patch us in as new owner of the JobDependency objects where other is the object */ + last = NULL; + LIST_FOREACH(object, l, other->object_list) { + assert(l->object == other); + l->object = j; + last = l; + } + + /* Merge both lists: the list of other goes in front of the list of j */ + if (last) { + last->object_next = j->object_list; + if (j->object_list) + j->object_list->object_prev = last; + j->object_list = other->object_list; + } + + /* Kill the other job; its lists now belong to j, so detach them first */ + other->subject_list = NULL; + other->object_list = NULL; + transaction_delete_job(tr, other, true); +} + +_pure_ static bool job_is_conflicted_by(Job *j) { + JobDependency *l; + + assert(j); + + /* Returns true if this job is pulled in by at least one + * ConflictedBy dependency. 
*/ + + LIST_FOREACH(object, l, j->object_list) + if (l->conflicts) + return true; + + return false; +} + +static int delete_one_unmergeable_job(Transaction *tr, Job *j) { + Job *k; + + assert(j); + + /* Tries to delete one item in the linked list + * j->transaction_next->transaction_next->... that conflicts + * with another one, in an attempt to make an inconsistent + * transaction work. Returns 0 if a job was deleted, -ENOEXEC if + * both jobs of a conflicting pair matter to the anchor, and + * -EINVAL if no conflicting pair was found. */ + + /* We rely here on the fact that if a merged with b does not + * merge with c, then neither a nor b merges with c */ + LIST_FOREACH(transaction, j, j) + LIST_FOREACH(transaction, k, j->transaction_next) { + Job *d; + + /* Is this one mergeable? Then skip it */ + if (job_type_is_mergeable(j->type, k->type)) + continue; + + /* Ok, we found two that conflict, let's see if we can + * drop one of them */ + if (!j->matters_to_anchor && !k->matters_to_anchor) { + + /* Neither job matters, so let's + * find the one that is smarter to + * remove. Let's think positive and + * rather remove stops than starts -- + * except if something is being + * stopped because it is conflicted by + * another unit in which case we + * rather remove the start. */ + + log_unit_debug(j->unit, + "Looking at job %s/%s conflicted_by=%s", + j->unit->id, job_type_to_string(j->type), + yes_no(j->type == JOB_STOP && job_is_conflicted_by(j))); + log_unit_debug(k->unit, + "Looking at job %s/%s conflicted_by=%s", + k->unit->id, job_type_to_string(k->type), + yes_no(k->type == JOB_STOP && job_is_conflicted_by(k))); + + if (j->type == JOB_STOP) { + + if (job_is_conflicted_by(j)) + d = k; + else + d = j; + + } else if (k->type == JOB_STOP) { + + if (job_is_conflicted_by(k)) + d = j; + else + d = k; + } else + d = j; + + } else if (!j->matters_to_anchor) /* exactly one of the two matters: drop the other one */ + d = j; + else if (!k->matters_to_anchor) + d = k; + else + return -ENOEXEC; + + /* Ok, we can drop one, so let's do so. 
*/ + log_unit_debug(d->unit, + "Fixing conflicting jobs %s/%s,%s/%s by deleting job %s/%s", + j->unit->id, job_type_to_string(j->type), + k->unit->id, job_type_to_string(k->type), + d->unit->id, job_type_to_string(d->type)); + transaction_delete_job(tr, d, true); + return 0; + } + + return -EINVAL; +} + /* Merges all jobs queued for the same unit into a single job. Returns 0 on success, -EAGAIN if a conflicting job was dropped and the caller should call again, or the error set via sd_bus_error_setf() if the conflict cannot be resolved. */ +static int transaction_merge_jobs(Transaction *tr, sd_bus_error *e) { + Job *j; + int r; + + assert(tr); + + /* First step, check whether any of the jobs for one specific + * unit conflict. If so, try to drop one of them. */ + HASHMAP_FOREACH(j, tr->jobs) { + JobType t; + Job *k; + + t = j->type; + LIST_FOREACH(transaction, k, j->transaction_next) { + if (job_type_merge_and_collapse(&t, k->type, j->unit) >= 0) + continue; + + /* OK, we could not merge all jobs for this + * unit. Let's see if we can get rid of one + * of them */ + + r = delete_one_unmergeable_job(tr, j); + if (r >= 0) + /* Ok, we managed to drop one, now + * let's ask our callers to call us + * again after garbage collecting */ + return -EAGAIN; + + /* We couldn't drop anything either. Failure */ + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_JOBS_CONFLICTING, + "Transaction contains conflicting jobs '%s' and '%s' for %s. " + "Probably contradicting requirement dependencies configured.", + job_type_to_string(t), + job_type_to_string(k->type), + k->unit->id); + } + } + + /* Second step, merge the jobs. 
*/ + HASHMAP_FOREACH(j, tr->jobs) { + JobType t = j->type; + Job *k; + + /* Merge all transaction jobs for j->unit; the first step made sure this cannot fail */ + LIST_FOREACH(transaction, k, j->transaction_next) + assert_se(job_type_merge_and_collapse(&t, k->type, j->unit) == 0); + + while ((k = j->transaction_next)) { + if (tr->anchor_job == k) { /* keep the anchor job object: merge j into k rather than k into j */ + transaction_merge_and_delete_job(tr, k, j, t); + j = k; + } else + transaction_merge_and_delete_job(tr, j, k, t); + } + + assert(!j->transaction_next); + assert(!j->transaction_prev); + } + + return 0; +} + +static void transaction_drop_redundant(Transaction *tr) { + bool again; + + /* Goes through the transaction and removes all jobs of the units whose jobs are all noops. If not + * all of a unit's jobs are redundant, they are kept. */ + + assert(tr); + + do { + Job *j; + + again = false; + + HASHMAP_FOREACH(j, tr->jobs) { + bool keep = false; + Job *k; + + LIST_FOREACH(transaction, k, j) /* keep the unit if any of its jobs is the anchor, is not redundant, or conflicts with the job already installed for the unit */ + if (tr->anchor_job == k || + !job_type_is_redundant(k->type, unit_active_state(k->unit)) || + (k->unit->job && job_type_is_conflicting(k->type, k->unit->job->type))) { + keep = true; + break; + } + + if (!keep) { + log_trace("Found redundant job %s/%s, dropping from transaction.", + j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, j, false); + again = true; /* the hashmap changed, rescan from the start */ + break; + } + } + } while (again); +} + +_pure_ static bool unit_matters_to_anchor(Unit *u, Job *j) { + assert(u); + assert(!j->transaction_prev); + + /* Checks whether at least one of the jobs for this unit + * matters to the anchor. 
*/ + + LIST_FOREACH(transaction, j, j) + if (j->matters_to_anchor) + return true; + + return false; +} + /* Joins the unit ids of all (unit id, job type) pairs into one string, each id prefixed with unit_log_field, entries separated by newlines, the whole string NUL-terminated. Returns NULL if pairs is empty or if the buffer cannot be grown; the caller owns the result. */ +static char* merge_unit_ids(const char* unit_log_field, char **pairs) { + char **unit_id, **job_type, *ans = NULL; + size_t alloc = 0, size = 0, next; + + STRV_FOREACH_PAIR(unit_id, job_type, pairs) { + next = strlen(unit_log_field) + strlen(*unit_id); + if (!GREEDY_REALLOC(ans, alloc, size + next + 1)) + return mfree(ans); + + if (size > 0) /* not the first entry: turn the NUL terminator of the previous entry into the separator */ + ans[size - 1] = '\n'; + sprintf(ans + size, "%s%s", unit_log_field, *unit_id); /* writes the NUL that terminates the string if this is the last entry */ + size += next + 1; + } + + return ans; +} + +static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsigned generation, sd_bus_error *e) { + Unit *u; + void *v; + int r; + static const UnitDependency directions[] = { + UNIT_BEFORE, + UNIT_AFTER, + }; + size_t d; + + assert(tr); + assert(j); + assert(!j->transaction_prev); + + /* Does a recursive sweep through the ordering graph, looking + * for a cycle. If we find a cycle we try to break it. */ + + /* Have we seen this before? */ + if (j->generation == generation) { + Job *k, *delete = NULL; + _cleanup_free_ char **array = NULL, *unit_ids = NULL; + char **unit_id, **job_type; + + /* If the marker is NULL we have been here already and + * decided the job was loop-free from here. Hence + * shortcut things and return right-away. */ + if (!j->marker) + return 0; + + /* So, the marker is not NULL and we already have been here. We have + * a cycle. Let's try to break it. We go backwards in our path and + * try to find a suitable job to remove. We use the marker to find + * our way back, since, smart as we are, we stored our way back in + * there. */ + + for (k = from; k; k = ((k->generation == generation && k->marker != k) ? 
k->marker : NULL)) { + + /* For logging below: collect (unit id, job type) pairs along the cycle */ + if (strv_push_pair(&array, k->unit->id, (char*) job_type_to_string(k->type)) < 0) + log_oom(); + + if (!delete && hashmap_get(tr->jobs, k->unit) && !unit_matters_to_anchor(k->unit, k)) + /* Ok, we can drop this one, so let's do so. Only the first such job on the way back is picked. */ + delete = k; + + /* Check if this in fact was the beginning of the cycle */ + if (k == j) + break; + } + + unit_ids = merge_unit_ids(j->manager->unit_log_field, array); /* ignore error */ + + STRV_FOREACH_PAIR(unit_id, job_type, array) + /* logging for j not k here to provide a consistent narrative */ + log_struct(LOG_WARNING, + "MESSAGE=%s: Found %s on %s/%s", + j->unit->id, + unit_id == array ? "ordering cycle" : "dependency", + *unit_id, *job_type, + unit_ids); + + if (delete) { + const char *status; + /* logging for j not k here to provide a consistent narrative */ + log_struct(LOG_ERR, + "MESSAGE=%s: Job %s/%s deleted to break ordering cycle starting with %s/%s", + j->unit->id, delete->unit->id, job_type_to_string(delete->type), + j->unit->id, job_type_to_string(j->type), + unit_ids); + + if (log_get_show_color()) + status = ANSI_HIGHLIGHT_RED " SKIP " ANSI_NORMAL; + else + status = " SKIP "; + + unit_status_printf(delete->unit, + STATUS_TYPE_NOTICE, + status, + "Ordering cycle found, skipping %s"); + transaction_delete_unit(tr, delete->unit); + return -EAGAIN; /* a job was dropped: transaction_activate() runs the order check again */ + } + + log_struct(LOG_ERR, + "MESSAGE=%s: Unable to break cycle starting with %s/%s", + j->unit->id, j->unit->id, job_type_to_string(j->type), + unit_ids); + + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC, + "Transaction order is cyclic. See system logs for details."); + } + + /* Make the marker point to where we come from, so that we can + * find our way backwards if we want to break a cycle. We use + * a special marker for the beginning: we point to + * ourselves. */ + j->marker = from ? 
from : j; + j->generation = generation; + + /* Actual ordering of jobs depends on the unit ordering dependency and job types. We need to traverse + * the graph over 'before' edges in the actual job execution order. We traverse over both unit + * ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job + * execution ordering. */ + for (d = 0; d < ELEMENTSOF(directions); d++) { + HASHMAP_FOREACH_KEY(v, u, j->unit->dependencies[directions[d]]) { + Job *o; + + /* Is there a job for this unit? */ + o = hashmap_get(tr->jobs, u); + if (!o) { + /* Ok, there is no job for this in the + * transaction, but maybe there is already one + * running? */ + o = u->job; + if (!o) + continue; + } + + /* Cut traversing if the job j is not really *before* o. */ + if (job_compare(j, o, directions[d]) >= 0) + continue; + + r = transaction_verify_order_one(tr, o, j, generation, e); /* recurse with j as the job we come from */ + if (r < 0) + return r; + } + } + + /* Ok, let's backtrack, and remember that this entry is not on + * our path anymore. */ + j->marker = NULL; + + return 0; +} + /* Runs the cycle check from every job of the transaction. Returns 0 if no cycle was found, otherwise the negative result of transaction_verify_order_one(), which is -EAGAIN after a job was dropped to break a cycle. */ +static int transaction_verify_order(Transaction *tr, unsigned *generation, sd_bus_error *e) { + Job *j; + int r; + unsigned g; + + assert(tr); + assert(generation); + + /* Check if the ordering graph is cyclic. If it is, try to fix + * that up by dropping one of the jobs. 
*/ + + g = (*generation)++; /* take a fresh generation number for this sweep */ + + HASHMAP_FOREACH(j, tr->jobs) { + r = transaction_verify_order_one(tr, j, NULL, g, e); + if (r < 0) + return r; + } + + return 0; +} + +static void transaction_collect_garbage(Transaction *tr) { + bool again; + + assert(tr); + + /* Drop jobs that are not required by any other job. The anchor job is always kept. */ + + do { + Job *j; + + again = false; + + HASHMAP_FOREACH(j, tr->jobs) { + if (tr->anchor_job == j) + continue; + + if (!j->object_list) { + log_trace("Garbage collecting job %s/%s", j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, j, true); + again = true; /* the hashmap changed, rescan from the start */ + break; + } + + log_trace("Keeping job %s/%s because of %s/%s", + j->unit->id, job_type_to_string(j->type), + j->object_list->subject ? j->object_list->subject->unit->id : "root", + j->object_list->subject ? job_type_to_string(j->object_list->subject->type) : "root"); + } + + } while (again); +} + +static int transaction_is_destructive(Transaction *tr, JobMode mode, sd_bus_error *e) { + Job *j; + + assert(tr); + + /* Checks whether applying this transaction means that + * existing jobs would be replaced. This is only treated as an + * error in JOB_FAIL mode or if the existing job is irreversible. */ + + HASHMAP_FOREACH(j, tr->jobs) { + + /* Assume merged */ + assert(!j->transaction_prev); + assert(!j->transaction_next); + + if (j->unit->job && (mode == JOB_FAIL || j->unit->job->irreversible) && + job_type_is_conflicting(j->unit->job->type, j->type)) + return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE, + "Transaction for %s/%s is destructive (%s has '%s' job queued, but '%s' is included in transaction).", + tr->anchor_job->unit->id, job_type_to_string(tr->anchor_job->type), + j->unit->id, job_type_to_string(j->unit->job->type), job_type_to_string(j->type)); + } + + return 0; +} + +static void transaction_minimize_impact(Transaction *tr) { + Job *j; + + assert(tr); + + /* Drops all unnecessary jobs that reverse already active jobs + * or that stop a running service. 
*/ + +rescan: /* restart point after a job was deleted */ + HASHMAP_FOREACH(j, tr->jobs) { + LIST_FOREACH(transaction, j, j) { /* walks all jobs queued for this unit, reusing j as the cursor */ + bool stops_running_service, changes_existing_job; + + /* If it matters, we shouldn't drop it */ + if (j->matters_to_anchor) + continue; + + /* Would this stop a running service? + * Would this change an existing job? + * If so, let's drop this entry */ + + stops_running_service = + j->type == JOB_STOP && UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(j->unit)); + + changes_existing_job = + j->unit->job && + job_type_is_conflicting(j->type, j->unit->job->type); + + if (!stops_running_service && !changes_existing_job) + continue; + + if (stops_running_service) + log_unit_debug(j->unit, + "%s/%s would stop a running service.", + j->unit->id, job_type_to_string(j->type)); + + if (changes_existing_job) + log_unit_debug(j->unit, + "%s/%s would change existing job.", + j->unit->id, job_type_to_string(j->type)); + + /* Ok, let's get rid of this */ + log_unit_debug(j->unit, + "Deleting %s/%s to minimize impact.", + j->unit->id, job_type_to_string(j->type)); + + transaction_delete_job(tr, j, true); + goto rescan; /* both the hashmap and the list may have changed */ + } + } +} + +static int transaction_apply( + Transaction *tr, + Manager *m, + JobMode mode, + Set *affected_jobs) { + + Job *j; + int r; + + /* Moves the transaction jobs to the set of active jobs */ + + if (IN_SET(mode, JOB_ISOLATE, JOB_FLUSH)) { + + /* When isolating or flushing first kill all installed jobs which + * aren't part of the new transaction */ + HASHMAP_FOREACH(j, m->jobs) { + assert(j->installed); + + if (j->unit->ignore_on_isolate) + continue; + + if (hashmap_get(tr->jobs, j->unit)) + continue; + + /* Not invalidating recursively. Avoids triggering + * OnFailure= actions of dependent jobs. Also avoids + * invalidating our iterator. 
*/ + job_finish_and_invalidate(j, JOB_CANCELED, false, false); + } + } + + HASHMAP_FOREACH(j, tr->jobs) { + /* Assume merged. Every job is registered under its id first; if that fails the registrations are undone at the rollback label. */ + assert(!j->transaction_prev); + assert(!j->transaction_next); + + r = hashmap_ensure_allocated(&m->jobs, NULL); + if (r < 0) + return r; + + r = hashmap_put(m->jobs, UINT32_TO_PTR(j->id), j); + if (r < 0) + goto rollback; + } + + while ((j = hashmap_steal_first(tr->jobs))) { + Job *installed_job; + + /* Clean the job dependencies */ + transaction_unlink_job(tr, j, false); + + installed_job = job_install(j); + if (installed_job != j) { + /* j has been merged into a previously installed job */ + if (tr->anchor_job == j) + tr->anchor_job = installed_job; + hashmap_remove(m->jobs, UINT32_TO_PTR(j->id)); + job_free(j); + j = installed_job; + } + + job_add_to_run_queue(j); + job_add_to_dbus_queue(j); + job_start_timer(j, false); + job_shutdown_magic(j); + + /* When affected_jobs is specified, let's track in it all jobs that were touched because of + * this transaction. */ + if (affected_jobs) + (void) set_put(affected_jobs, j); + } + + return 0; + +rollback: + /* Undo the registrations made above; this walks all transaction jobs, including ones that were never registered */ + HASHMAP_FOREACH(j, tr->jobs) + hashmap_remove(m->jobs, UINT32_TO_PTR(j->id)); + + return r; +} + +int transaction_activate( + Transaction *tr, + Manager *m, + JobMode mode, + Set *affected_jobs, + sd_bus_error *e) { + + Job *j; + int r; + unsigned generation = 1; + + assert(tr); + + /* This applies the changes recorded in tr->jobs to + * the actual list of jobs, if possible. */ + + /* Reset the generation counter of all installed jobs. The detection of cycles + * looks at installed jobs. If they had a non-zero generation from some previous + * walk of the graph, the algorithm would break. */ + HASHMAP_FOREACH(j, m->jobs) + j->generation = 0; + + /* First step: figure out which jobs matter */ + transaction_find_jobs_that_matter_to_anchor(tr->anchor_job, generation++); + + /* Second step: Try not to stop any running services if + * we don't have to. 
Don't try to reverse running + * jobs if we don't have to. */ + if (mode == JOB_FAIL) + transaction_minimize_impact(tr); + + /* Third step: Drop redundant jobs */ + transaction_drop_redundant(tr); + + for (;;) { + /* Fourth step: Let's remove unneeded jobs that might + * be lurking. */ + if (mode != JOB_ISOLATE) + transaction_collect_garbage(tr); + + /* Fifth step: verify order makes sense and correct + * cycles if necessary and possible */ + r = transaction_verify_order(tr, &generation, e); + if (r >= 0) + break; + + if (r != -EAGAIN) + return log_warning_errno(r, "Requested transaction contains an unfixable cyclic ordering dependency: %s", bus_error_message(e, r)); + + /* -EAGAIN: a job was dropped to break a cycle. Let's see if the + * resulting transaction ordering graph is still cyclic... */ + } + + for (;;) { + /* Sixth step: let's drop unmergeable entries if + * necessary and possible, merge entries we can + * merge */ + r = transaction_merge_jobs(tr, e); + if (r >= 0) + break; + + if (r != -EAGAIN) + return log_warning_errno(r, "Requested transaction contains unmergeable jobs: %s", bus_error_message(e, r)); + + /* Seventh step: an entry got dropped, let's garbage + * collect its dependencies. */ + if (mode != JOB_ISOLATE) + transaction_collect_garbage(tr); + + /* Let's see if the resulting transaction still has + * unmergeable entries ... */ + } + + /* Eighth step: Drop redundant jobs again, if the merging now allows us to drop more. */ + transaction_drop_redundant(tr); + + /* Ninth step: check whether we can actually apply this */ + r = transaction_is_destructive(tr, mode, e); + if (r < 0) + return log_notice_errno(r, "Requested transaction contradicts existing jobs: %s", bus_error_message(e, r)); + + /* Tenth step: apply changes */ + r = transaction_apply(tr, m, mode, affected_jobs); + if (r < 0) + return log_warning_errno(r, "Failed to apply transaction: %m"); + + assert(hashmap_isempty(tr->jobs)); + + if (!hashmap_isempty(m->jobs)) { + /* Are there any jobs now? 
Then make sure we have the + * idle pipe around. We don't really care too much + * whether this works or not, as the idle pipe is a + * feature for cosmetics, not actually useful for + * anything beyond that. */ + + if (m->idle_pipe[0] < 0 && m->idle_pipe[1] < 0 && + m->idle_pipe[2] < 0 && m->idle_pipe[3] < 0) { + (void) pipe2(m->idle_pipe, O_NONBLOCK|O_CLOEXEC); + (void) pipe2(m->idle_pipe + 2, O_NONBLOCK|O_CLOEXEC); + } + } + + return 0; +} + +static Job* transaction_add_one_job(Transaction *tr, JobType type, Unit *unit, bool *is_new) { + Job *j, *f; + + assert(tr); + assert(unit); + + /* Looks for an existing prospective job and returns that. If + * it doesn't exist it is created and added to the prospective + * jobs list. */ + + f = hashmap_get(tr->jobs, unit); + + LIST_FOREACH(transaction, j, f) { + assert(j->unit == unit); + + if (j->type == type) { + if (is_new) + *is_new = false; + return j; + } + } + + j = job_new(unit, type); + if (!j) + return NULL; + + j->generation = 0; + j->marker = NULL; + j->matters_to_anchor = false; + j->irreversible = tr->irreversible; + + LIST_PREPEND(transaction, f, j); /* the new job becomes the head of the list, so the hashmap entry is replaced below */ + + if (hashmap_replace(tr->jobs, unit, f) < 0) { + LIST_REMOVE(transaction, f, j); + job_free(j); + return NULL; + } + + if (is_new) + *is_new = true; + + log_trace("Added job %s/%s to transaction.", unit->id, job_type_to_string(type)); + + return j; +} + /* Takes j out of the transaction and frees its dependency links. If delete_dependencies is set, jobs that pulled in j via a link that matters are deleted as well. The job itself is not freed here. */ +static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies) { + assert(tr); + assert(j); + + if (j->transaction_prev) + j->transaction_prev->transaction_next = j->transaction_next; + else if (j->transaction_next) /* j was the list head: point the hashmap entry to the next job */ + hashmap_replace(tr->jobs, j->unit, j->transaction_next); + else /* j was the only job of its unit: drop the hashmap entry */ + hashmap_remove_value(tr->jobs, j->unit, j); + + if (j->transaction_next) + j->transaction_next->transaction_prev = j->transaction_prev; + + j->transaction_prev = j->transaction_next = NULL; + + while (j->subject_list) + job_dependency_free(j->subject_list); + + while (j->object_list) { + Job *other = 
j->object_list->matters ? j->object_list->subject : NULL; /* remember the depending job before the link is freed */ + + job_dependency_free(j->object_list); + + if (other && delete_dependencies) { + log_unit_debug(other->unit, + "Deleting job %s/%s as dependency of job %s/%s", + other->unit->id, job_type_to_string(other->type), + j->unit->id, job_type_to_string(j->type)); + transaction_delete_job(tr, other, delete_dependencies); + } + } +} + /* Adds a job derived from JOB_TRY_RELOAD for every unit listed in UNIT_PROPAGATES_RELOAD_TO of unit, pulled in by the job by. Failures are logged and otherwise ignored. */ +void transaction_add_propagate_reload_jobs(Transaction *tr, Unit *unit, Job *by, bool ignore_order, sd_bus_error *e) { + JobType nt; + Unit *dep; + void *v; + int r; + + assert(tr); + assert(unit); + + HASHMAP_FOREACH_KEY(v, dep, unit->dependencies[UNIT_PROPAGATES_RELOAD_TO]) { + nt = job_type_collapse(JOB_TRY_RELOAD, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, by, false, false, false, ignore_order, e); + if (r < 0) { + log_unit_warning(dep, + "Cannot add dependency reload job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } +} + +int transaction_add_job_and_dependencies( + Transaction *tr, + JobType type, + Unit *unit, + Job *by, + bool matters, + bool conflicts, + bool ignore_requirements, + bool ignore_order, + sd_bus_error *e) { + + bool is_new; + Unit *dep; + Job *ret; + void *v; + int r; + + assert(tr); + assert(type < _JOB_TYPE_MAX); + assert(type < _JOB_TYPE_MAX_IN_TRANSACTION); + assert(unit); + + /* Before adding jobs for this unit, let's ensure that its state has been loaded. + * This matters when jobs are spawned as part of coldplugging itself (see e.g. path_coldplug()). + * This way, we "recursively" coldplug units, ensuring that we do not look at state of + * not-yet-coldplugged units. */ + if (MANAGER_IS_RELOADING(unit->manager)) + unit_coldplug(unit); + + if (by) + log_trace("Pulling in %s/%s from %s/%s", unit->id, job_type_to_string(type), by->unit->id, job_type_to_string(by->type)); + + /* Safety check that the unit is in a valid state, i.e. 
not in UNIT_STUB or UNIT_MERGED which should only be set + * temporarily. */ + if (!UNIT_IS_LOAD_COMPLETE(unit->load_state)) + return sd_bus_error_setf(e, BUS_ERROR_LOAD_FAILED, "Unit %s is not loaded properly.", unit->id); + + if (type != JOB_STOP) { + r = bus_unit_validate_load_state(unit, e); + /* The time-based cache allows starting new units without daemon-reload, + * but if they are already referenced (because of dependencies or ordering) + * then we have to force a load of the fragment. As an optimization, check + * first if anything in the usual paths was modified since the last time + * the cache was loaded. Also check if the last time an attempt to load the + * unit was made was before the most recent cache refresh, so that we know + * we need to try again — even if the cache is current, it might have been + * updated in a different context before we had a chance to retry loading + * this particular unit. + * + * Given building up the transaction is a synchronous operation, attempt + * to load the unit immediately. */ + if (r < 0 && manager_unit_cache_should_retry_load(unit)) { + sd_bus_error_free(e); + unit->load_state = UNIT_STUB; + r = unit_load(unit); + if (r < 0 || unit->load_state == UNIT_STUB) /* the retry failed or left the unit a stub */ + unit->load_state = UNIT_NOT_FOUND; + r = bus_unit_validate_load_state(unit, e); + } + if (r < 0) + return r; + } + + if (!unit_job_is_applicable(unit, type)) + return sd_bus_error_setf(e, BUS_ERROR_JOB_TYPE_NOT_APPLICABLE, + "Job type %s is not applicable for unit %s.", + job_type_to_string(type), unit->id); + + /* First add the job. An existing job of the same type for this unit is reused. */ + ret = transaction_add_one_job(tr, type, unit, &is_new); + if (!ret) + return -ENOMEM; + + ret->ignore_order = ret->ignore_order || ignore_order; + + /* Then, add a link to the job. */ + if (by) { + if (!job_dependency_new(by, ret, matters, conflicts)) + return -ENOMEM; + } else { + /* If the job has no parent job, it is the anchor job. 
*/ + assert(!tr->anchor_job); + tr->anchor_job = ret; + } + + if (is_new && !ignore_requirements && type != JOB_NOP) { /* dependencies are only expanded the first time this job is added */ + Set *following; + + /* If we are following some other unit, make sure we + * add all dependencies of everybody following. */ + if (unit_following_set(ret->unit, &following) > 0) { + SET_FOREACH(dep, following) { + r = transaction_add_job_and_dependencies(tr, type, dep, ret, false, false, false, ignore_order, e); + if (r < 0) { + log_unit_full_errno(dep, r == -ERFKILL ? LOG_INFO : LOG_WARNING, r, + "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + set_free(following); + } + + /* Finally, recursively add in all dependencies. */ + if (IN_SET(type, JOB_START, JOB_RESTART)) { + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_REQUIRES]) { /* start jobs that matter; only -EBADR is tolerated */ + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_BINDS_TO]) { /* handled exactly like UNIT_REQUIRES here */ + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_WANTS]) { /* start jobs that do not matter; failures are only logged */ + r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, false, false, false, ignore_order, e); + if (r < 0) { + /* unit masked, job type not applicable and unit not found are not considered as errors. */ + log_unit_full_errno(dep, + IN_SET(r, -ERFKILL, -EBADR, -ENOENT) ? 
LOG_DEBUG : LOG_WARNING, + r, "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_REQUISITE]) { /* JOB_VERIFY_ACTIVE instead of JOB_START: the unit is checked, not started */ + r = transaction_add_job_and_dependencies(tr, JOB_VERIFY_ACTIVE, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_CONFLICTS]) { /* stop jobs that matter and are flagged as conflicts */ + r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, true, true, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[UNIT_CONFLICTED_BY]) { /* stop jobs that do not matter; failures are only logged */ + r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, false, false, false, ignore_order, e); + if (r < 0) { + log_unit_warning(dep, + "Cannot add dependency job, ignoring: %s", + bus_error_message(e, r)); + sd_bus_error_free(e); + } + } + + } + + if (IN_SET(type, JOB_STOP, JOB_RESTART)) { + static const UnitDependency propagate_deps[] = { + UNIT_REQUIRED_BY, + UNIT_REQUISITE_OF, + UNIT_BOUND_BY, + UNIT_CONSISTS_OF, + }; + + JobType ptype; + unsigned j; + + /* We propagate STOP as STOP, but RESTART only + * as TRY_RESTART, in order not to start + * dependencies that are not around. 
JOB_TRY_RESTART : type; + + for (j = 0; j < ELEMENTSOF(propagate_deps); j++) + HASHMAP_FOREACH_KEY(v, dep, ret->unit->dependencies[propagate_deps[j]]) { + JobType nt; + + nt = job_type_collapse(ptype, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, ret, true, false, false, ignore_order, e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ + goto fail; + + sd_bus_error_free(e); + } + } + } + + if (type == JOB_RELOAD) + transaction_add_propagate_reload_jobs(tr, ret->unit, ret, ignore_order, e); + + /* JOB_VERIFY_ACTIVE requires no dependency handling */ + } + + return 0; + +fail: + return r; +} + +int transaction_add_isolate_jobs(Transaction *tr, Manager *m) { + Unit *u; + char *k; + int r; + + assert(tr); + assert(m); + + HASHMAP_FOREACH_KEY(u, k, m->units) { + + /* ignore aliases */ + if (u->id != k) + continue; + + if (u->ignore_on_isolate) + continue; + + /* No need to stop inactive jobs */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job) + continue; + + /* Is there already something listed for this? */ + if (hashmap_get(tr->jobs, u)) + continue; + + r = transaction_add_job_and_dependencies(tr, JOB_STOP, u, tr->anchor_job, true, false, false, false, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add isolate job, ignoring: %m"); + } + + return 0; +} + +int transaction_add_triggering_jobs(Transaction *tr, Unit *u) { + void *v; + Unit *trigger; + int r; + + assert(tr); + assert(u); + + HASHMAP_FOREACH_KEY(v, trigger, u->dependencies[UNIT_TRIGGERED_BY]) { + /* No need to stop inactive jobs */ + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger)) && !trigger->job) + continue; + + /* Is there already something listed for this? 
*/ + if (hashmap_get(tr->jobs, trigger)) + continue; + + r = transaction_add_job_and_dependencies(tr, JOB_STOP, trigger, tr->anchor_job, true, false, false, false, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Cannot add triggered by job, ignoring: %m"); + } + + return 0; +} + +Transaction *transaction_new(bool irreversible) { + Transaction *tr; + + tr = new0(Transaction, 1); + if (!tr) + return NULL; + + tr->jobs = hashmap_new(NULL); + if (!tr->jobs) + return mfree(tr); + + tr->irreversible = irreversible; + + return tr; +} + +void transaction_free(Transaction *tr) { + assert(hashmap_isempty(tr->jobs)); + hashmap_free(tr->jobs); + free(tr); +} diff --git a/src/core/transaction.h b/src/core/transaction.h new file mode 100644 index 0000000..c431271 --- /dev/null +++ b/src/core/transaction.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Transaction Transaction; + +#include "hashmap.h" +#include "job.h" +#include "manager.h" +#include "unit.h" + +struct Transaction { + /* Jobs to be added */ + Hashmap *jobs; /* Unit object => Job object list 1:1 */ + Job *anchor_job; /* the job the user asked for */ + bool irreversible; +}; + +Transaction *transaction_new(bool irreversible); +void transaction_free(Transaction *tr); + +void transaction_add_propagate_reload_jobs(Transaction *tr, Unit *unit, Job *by, bool ignore_order, sd_bus_error *e); +int transaction_add_job_and_dependencies( + Transaction *tr, + JobType type, + Unit *unit, + Job *by, + bool matters, + bool conflicts, + bool ignore_requirements, + bool ignore_order, + sd_bus_error *e); +int transaction_activate(Transaction *tr, Manager *m, JobMode mode, Set *affected, sd_bus_error *e); +int transaction_add_isolate_jobs(Transaction *tr, Manager *m); +int transaction_add_triggering_jobs(Transaction *tr, Unit *u); +void transaction_abort(Transaction *tr); diff --git a/src/core/triggers.systemd.in b/src/core/triggers.systemd.in new file mode 100644 index 
0000000..2d25db3 --- /dev/null +++ b/src/core/triggers.systemd.in @@ -0,0 +1,143 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# Copyright © 2018 Neal Gompa + +# The contents of this are an example to be copied into systemd.spec. +# +# Minimum rpm version supported: 4.13.0 + +%transfiletriggerin -P 900900 -p <lua> -- @systemunitdir@ /etc/systemd/system +-- This script will run after any package is initially installed or +-- upgraded. We care about the case where a package is initially +-- installed, because other cases are covered by the *un scriptlets, +-- so sometimes we will reload needlessly. + +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemctl", "daemon-reload")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerun -p <lua> -- @systemunitdir@ /etc/systemd/system +-- On removal, we need to run daemon-reload after any units have been +-- removed. %transfiletriggerpostun would be ideal, but it does not get +-- executed for some reason. +-- On upgrade, we need to run daemon-reload after any new unit files +-- have been installed, but before %postun scripts in packages get +-- executed. %transfiletriggerun gets the right list of files +-- but it is invoked too early (before changes happen). +-- %filetriggerpostun happens at the right time, but it fires for +-- every package. +-- To execute the reload at the right time, we create a state +-- file in %transfiletriggerun and execute the daemon-reload in +-- the first %filetriggerpostun. 
+ +if posix.access("/run/systemd/system") then + posix.mkdir("%{_localstatedir}/lib") + posix.mkdir("%{_localstatedir}/lib/rpm-state") + posix.mkdir("%{_localstatedir}/lib/rpm-state/systemd") + io.open("%{_localstatedir}/lib/rpm-state/systemd/needs-reload", "w") +end + +%filetriggerpostun -P 1000100 -p <lua> -- @systemunitdir@ /etc/systemd/system +if posix.access("%{_localstatedir}/lib/rpm-state/systemd/needs-reload") then + posix.unlink("%{_localstatedir}/lib/rpm-state/systemd/needs-reload") + posix.rmdir("%{_localstatedir}/lib/rpm-state/systemd") + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemctl", "daemon-reload")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -P 100700 -p <lua> -- @sysusersdir@ +-- This script will process files installed in @sysusersdir@ to create +-- specified users automatically. The priority is set such that it +-- will run before the tmpfiles file trigger. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemd-sysusers")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -P 100500 -p <lua> -- @tmpfilesdir@ +-- This script will process files installed in @tmpfilesdir@ to create +-- tmpfiles automatically. The priority is set such that it will run +-- after the sysusers file trigger, but before any other triggers. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemd-tmpfiles", "--create")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @udevhwdbdir@ +-- This script will automatically invoke hwdb update if files have been +-- installed or updated in @udevhwdbdir@. 
+if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/systemd-hwdb", "update")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @catalogdir@ +-- This script will automatically invoke journal catalog update if files +-- have been installed or updated in @catalogdir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/journalctl", "--update-catalog")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @udevrulesdir@ +-- This script will automatically update udev with new rules if files +-- have been installed or updated in @udevrulesdir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("%{_bindir}/udevadm", "control", "--reload")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @sysctldir@ +-- This script will automatically apply sysctl rules if files have been +-- installed or updated in @sysctldir@. +if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("@rootlibexecdir@/systemd-sysctl")) + elseif pid > 0 then + posix.wait(pid) + end +end + +%transfiletriggerin -p <lua> -- @binfmtdir@ +-- This script will automatically apply binfmt rules if files have been +-- installed or updated in @binfmtdir@. 
+if posix.access("/run/systemd/system") then + pid = posix.fork() + if pid == 0 then + assert(posix.exec("@rootlibexecdir@/systemd-binfmt")) + elseif pid > 0 then + posix.wait(pid) + end +end diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c new file mode 100644 index 0000000..0c1e20d --- /dev/null +++ b/src/core/unit-printf.c @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "format-util.h" +#include "macro.h" +#include "specifier.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "unit.h" +#include "user-util.h" + +static int specifier_prefix_and_instance(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + return unit_name_to_prefix_and_instance(u->id, ret); +} + +static int specifier_prefix(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + return unit_name_to_prefix(u->id, ret); +} + +static int specifier_prefix_unescaped(char specifier, const void *data, const void *userdata, char **ret) { + _cleanup_free_ char *p = NULL; + const Unit *u = userdata; + int r; + + assert(u); + + r = unit_name_to_prefix(u->id, &p); + if (r < 0) + return r; + + return unit_name_unescape(p, ret); +} + +static int specifier_instance_unescaped(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + return unit_name_unescape(strempty(u->instance), ret); +} + +static int specifier_last_component(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + _cleanup_free_ char *prefix = NULL; + char *dash; + int r; + + assert(u); + + r = unit_name_to_prefix(u->id, &prefix); + if (r < 0) + return r; + + dash = strrchr(prefix, '-'); + if (dash) + return specifier_string(specifier, dash + 1, userdata, ret); + + *ret = 
TAKE_PTR(prefix); + return 0; +} + +static int specifier_last_component_unescaped(char specifier, const void *data, const void *userdata, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + r = specifier_last_component(specifier, data, userdata, &p); + if (r < 0) + return r; + + return unit_name_unescape(p, ret); +} + +static int specifier_filename(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + + assert(u); + + if (u->instance) + return unit_name_path_unescape(u->instance, ret); + else + return unit_name_to_path(u->id, ret); +} + +static void bad_specifier(const Unit *u, char specifier) { + log_unit_warning(u, "Specifier '%%%c' used in unit configuration, which is deprecated. Please update your unit file, as it does not work as intended.", specifier); +} + +static int specifier_cgroup(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n; + + assert(u); + + bad_specifier(u, specifier); + + if (u->cgroup_path) + n = strdup(u->cgroup_path); + else + n = unit_default_cgroup_path(u); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_cgroup_root(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n; + + assert(u); + + bad_specifier(u, specifier); + + n = strdup(u->manager->cgroup_root); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_cgroup_slice(char specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n; + + assert(u); + + bad_specifier(u, specifier); + + if (UNIT_ISSET(u->slice)) { + const Unit *slice; + + slice = UNIT_DEREF(u->slice); + + if (slice->cgroup_path) + n = strdup(slice->cgroup_path); + else + n = unit_default_cgroup_path(slice); + } else + n = strdup(u->manager->cgroup_root); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +static int specifier_special_directory(char 
specifier, const void *data, const void *userdata, char **ret) { + const Unit *u = userdata; + char *n = NULL; + + assert(u); + + n = strdup(u->manager->prefix[PTR_TO_UINT(data)]); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int unit_name_printf(const Unit *u, const char* format, char **ret) { + + /* + * This will use the passed string as format string and replace the following specifiers (which should all be + * safe for inclusion in unit names): + * + * %n: the full id of the unit (foo-aaa@bar.waldo) + * %N: the id of the unit without the suffix (foo-aaa@bar) + * %p: the prefix (foo-aaa) + * %i: the instance (bar) + * %j: the last component of the prefix (aaa) + */ + + const Specifier table[] = { + { 'i', specifier_string, u->instance }, + { 'j', specifier_last_component, NULL }, + { 'n', specifier_string, u->id }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + + COMMON_SYSTEM_SPECIFIERS, + + COMMON_CREDS_SPECIFIERS, + {} + }; + + assert(u); + assert(format); + assert(ret); + + return specifier_printf(format, table, u, ret); +} + +int unit_full_printf(const Unit *u, const char *format, char **ret) { + /* This is similar to unit_name_printf() but also supports unescaping. Also, adds a couple of additional codes + * (which are likely not suitable for unescaped inclusion in unit names): + * + * %f: the unescaped instance if set, otherwise the id unescaped as path + * + * %c: cgroup path of unit (deprecated) + * %r: where units in this slice are placed in the cgroup tree (deprecated) + * %R: the root of this systemd's instance tree (deprecated) + * + * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME) + * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME) + * %L: the log directory root (e.g. /var/log or $XDG_CONFIG_HOME/log) + * %S: the state directory root (e.g. /var/lib or $XDG_CONFIG_HOME) + * %t: the runtime directory root (e.g. 
/run or $XDG_RUNTIME_DIR) + * + * %h: the homedir of the running user + * %s: the shell of the running user + * + * NOTICE: When you add new entries here, please be careful: specifiers which depend on settings of the unit + * file itself are broken by design, as they would resolve differently depending on whether they are used + * before or after the relevant configuration setting. Hence: don't add them. + */ + + assert(u); + assert(format); + assert(ret); + + const Specifier table[] = { + { 'i', specifier_string, u->instance }, + { 'I', specifier_instance_unescaped, NULL }, + { 'j', specifier_last_component, NULL }, + { 'J', specifier_last_component_unescaped, NULL }, + { 'n', specifier_string, u->id }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + { 'P', specifier_prefix_unescaped, NULL }, + + { 'f', specifier_filename, NULL }, + + { 'c', specifier_cgroup, NULL }, + { 'r', specifier_cgroup_slice, NULL }, + { 'R', specifier_cgroup_root, NULL }, + + { 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) }, + { 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) }, + { 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) }, + { 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) }, + { 't', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_RUNTIME) }, + + { 'h', specifier_user_home, NULL }, + { 's', specifier_user_shell, NULL }, + + COMMON_SYSTEM_SPECIFIERS, + + COMMON_CREDS_SPECIFIERS, + + COMMON_TMP_SPECIFIERS, + {} + }; + + return specifier_printf(format, table, u, ret); +} diff --git a/src/core/unit-printf.h b/src/core/unit-printf.h new file mode 100644 index 0000000..de5183c --- /dev/null +++ b/src/core/unit-printf.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "unit.h" + +int unit_name_printf(const Unit *u, const char* text, char **ret); +int unit_full_printf(const Unit *u, const char *text, char 
**ret); diff --git a/src/core/unit.c b/src/core/unit.c new file mode 100644 index 0000000..45a417a --- /dev/null +++ b/src/core/unit.c @@ -0,0 +1,6368 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <stdlib.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "sd-id128.h" +#include "sd-messages.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bus-common-errors.h" +#include "bus-util.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "core-varlink.h" +#include "dbus-unit.h" +#include "dbus.h" +#include "dropin.h" +#include "escape.h" +#include "execute.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "fileio.h" +#include "format-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "install.h" +#include "label.h" +#include "load-dropin.h" +#include "load-fragment.h" +#include "log.h" +#include "macro.h" +#include "missing_audit.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "serialize.h" +#include "set.h" +#include "signal-util.h" +#include "sparse-endian.h" +#include "special.h" +#include "specifier.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "unit-name.h" +#include "unit.h" +#include "user-util.h" +#include "virt.h" + +/* Thresholds for logging at INFO level about resource consumption */ +#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC) +#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL) +#define MENTIONWORTHY_IP_BYTES (0ULL) + +/* Thresholds for logging at INFO level about resource consumption */ +#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */ +#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL) /* 10 MB */ +#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB 
*/ + +const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = { + [UNIT_SERVICE] = &service_vtable, + [UNIT_SOCKET] = &socket_vtable, + [UNIT_TARGET] = &target_vtable, + [UNIT_DEVICE] = &device_vtable, + [UNIT_MOUNT] = &mount_vtable, + [UNIT_AUTOMOUNT] = &automount_vtable, + [UNIT_SWAP] = &swap_vtable, + [UNIT_TIMER] = &timer_vtable, + [UNIT_PATH] = &path_vtable, + [UNIT_SLICE] = &slice_vtable, + [UNIT_SCOPE] = &scope_vtable, +}; + +static void maybe_warn_about_dependency(Unit *u, const char *other, UnitDependency dependency); + +Unit *unit_new(Manager *m, size_t size) { + Unit *u; + + assert(m); + assert(size >= sizeof(Unit)); + + u = malloc0(size); + if (!u) + return NULL; + + u->manager = m; + u->type = _UNIT_TYPE_INVALID; + u->default_dependencies = true; + u->unit_file_state = _UNIT_FILE_STATE_INVALID; + u->unit_file_preset = -1; + u->on_failure_job_mode = JOB_REPLACE; + u->cgroup_control_inotify_wd = -1; + u->cgroup_memory_inotify_wd = -1; + u->job_timeout = USEC_INFINITY; + u->job_running_timeout = USEC_INFINITY; + u->ref_uid = UID_INVALID; + u->ref_gid = GID_INVALID; + u->cpu_usage_last = NSEC_INFINITY; + u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + u->failure_action_exit_status = u->success_action_exit_status = -1; + + u->ip_accounting_ingress_map_fd = -1; + u->ip_accounting_egress_map_fd = -1; + u->ipv4_allow_map_fd = -1; + u->ipv6_allow_map_fd = -1; + u->ipv4_deny_map_fd = -1; + u->ipv6_deny_map_fd = -1; + + u->last_section_private = -1; + + u->start_ratelimit = (RateLimit) { m->default_start_limit_interval, m->default_start_limit_burst }; + u->auto_stop_ratelimit = (RateLimit) { 10 * USEC_PER_SEC, 16 }; + + for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) + u->io_accounting_last[i] = UINT64_MAX; + + return u; +} + +int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret) { + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + u = unit_new(m, size); + if (!u) + return -ENOMEM; + + r = 
unit_add_name(u, name); + if (r < 0) + return r; + + *ret = TAKE_PTR(u); + + return r; +} + +bool unit_has_name(const Unit *u, const char *name) { + assert(u); + assert(name); + + return streq_ptr(name, u->id) || + set_contains(u->aliases, name); +} + +static void unit_init(Unit *u) { + CGroupContext *cc; + ExecContext *ec; + KillContext *kc; + + assert(u); + assert(u->manager); + assert(u->type >= 0); + + cc = unit_get_cgroup_context(u); + if (cc) { + cgroup_context_init(cc); + + /* Copy in the manager defaults into the cgroup + * context, _before_ the rest of the settings have + * been initialized */ + + cc->cpu_accounting = u->manager->default_cpu_accounting; + cc->io_accounting = u->manager->default_io_accounting; + cc->blockio_accounting = u->manager->default_blockio_accounting; + cc->memory_accounting = u->manager->default_memory_accounting; + cc->tasks_accounting = u->manager->default_tasks_accounting; + cc->ip_accounting = u->manager->default_ip_accounting; + + if (u->type != UNIT_SLICE) + cc->tasks_max = u->manager->default_tasks_max; + } + + ec = unit_get_exec_context(u); + if (ec) { + exec_context_init(ec); + + if (MANAGER_IS_SYSTEM(u->manager)) + ec->keyring_mode = EXEC_KEYRING_SHARED; + else { + ec->keyring_mode = EXEC_KEYRING_INHERIT; + + /* User manager might have its umask redefined by PAM or UMask=. In this + * case let the units it manages inherit this value by default. They can + * still tune this value through their own unit file */ + (void) get_process_umask(getpid_cached(), &ec->umask); + } + } + + kc = unit_get_kill_context(u); + if (kc) + kill_context_init(kc); + + if (UNIT_VTABLE(u)->init) + UNIT_VTABLE(u)->init(u); +} + +static int unit_add_alias(Unit *u, char *donated_name) { + int r; + + /* Make sure that u->aliases is allocated. We may leave u->aliases + * empty if we fail later, but this is not a problem. 
*/ + r = set_ensure_put(&u->aliases, &string_hash_ops, donated_name); + if (r < 0) + return r; + assert(r > 0); + + return 0; +} + +int unit_add_name(Unit *u, const char *text) { + _cleanup_free_ char *name = NULL, *instance = NULL; + UnitType t; + int r; + + assert(u); + assert(text); + + if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "instance is not set when adding name '%s': %m", text); + + r = unit_name_replace_instance(text, u->instance, &name); + if (r < 0) + return log_unit_debug_errno(u, r, + "failed to build instance name from '%s': %m", text); + } else { + name = strdup(text); + if (!name) + return -ENOMEM; + } + + if (unit_has_name(u, name)) + return 0; + + if (hashmap_contains(u->manager->units, name)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST), + "unit already exist when adding name '%s': %m", name); + + if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "name '%s' is invalid: %m", name); + + t = unit_name_to_type(name); + if (t < 0) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "failed to derive unit type from name '%s': %m", name); + + if (u->type != _UNIT_TYPE_INVALID && t != u->type) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m", + u->type, t, name); + + r = unit_name_to_instance(name, &instance); + if (r < 0) + return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name); + + if (instance && !unit_type_may_template(t)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name); + + /* Ensure that this unit either has no instance, or that the instance matches. 
*/ + if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), + "cannot add name %s, the instances don't match (\"%s\" != \"%s\").", + name, instance, u->instance); + + if (u->id && !unit_type_may_alias(t)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST), + "cannot add name %s, aliases are not allowed for %s units.", + name, unit_type_to_string(t)); + + if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES) + return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m"); + + /* Add name to the global hashmap first, because that's easier to undo */ + r = hashmap_put(u->manager->units, name, u); + if (r < 0) + return log_unit_debug_errno(u, r, "add unit to hashmap failed for name '%s': %m", text); + + if (u->id) { + r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */ + if (r < 0) { + hashmap_remove(u->manager->units, name); + return r; + } + TAKE_PTR(name); + + } else { + /* A new name, we don't need the set yet. */ + assert(u->type == _UNIT_TYPE_INVALID); + assert(!u->instance); + + u->type = t; + u->id = TAKE_PTR(name); + u->instance = TAKE_PTR(instance); + + LIST_PREPEND(units_by_type, u->manager->units_by_type[t], u); + unit_init(u); + } + + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_choose_id(Unit *u, const char *name) { + _cleanup_free_ char *t = NULL; + char *s; + int r; + + assert(u); + assert(name); + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(name, u->instance, &t); + if (r < 0) + return r; + + name = t; + } + + if (streq_ptr(u->id, name)) + return 0; /* Nothing to do. 
*/ + + /* Selects one of the aliases of this unit as the id */ + s = set_get(u->aliases, (char*) name); + if (!s) + return -ENOENT; + + if (u->id) { + r = set_remove_and_put(u->aliases, name, u->id); + if (r < 0) + return r; + } else + assert_se(set_remove(u->aliases, name)); /* see set_get() above… */ + + u->id = s; /* Old u->id is now stored in the set, and s is not stored anywhere */ + unit_add_to_dbus_queue(u); + + return 0; +} + +int unit_set_description(Unit *u, const char *description) { + int r; + + assert(u); + + r = free_and_strdup(&u->description, empty_to_null(description)); + if (r < 0) + return r; + if (r > 0) + unit_add_to_dbus_queue(u); + + return 0; +} + +bool unit_may_gc(Unit *u) { + UnitActiveState state; + int r; + + assert(u); + + /* Checks whether the unit is ready to be unloaded for garbage collection. + * Returns true when the unit may be collected, and false if there's some + * reason to keep it loaded. + * + * References from other units are *not* checked here. Instead, this is done + * in unit_gc_sweep(), but using markers to properly collect dependency loops. 
+ */ + + if (u->job) + return false; + + if (u->nop_job) + return false; + + state = unit_active_state(u); + + /* If the unit is inactive or failed and no job is queued for it, then release its runtime resources */ + if (UNIT_IS_INACTIVE_OR_FAILED(state) && + UNIT_VTABLE(u)->release_resources) + UNIT_VTABLE(u)->release_resources(u); + + if (u->perpetual) + return false; + + if (sd_bus_track_count(u->bus_track) > 0) + return false; + + /* But we keep the unit object around for longer when it is referenced or configured to not be gc'ed */ + switch (u->collect_mode) { + + case COLLECT_INACTIVE: + if (state != UNIT_INACTIVE) + return false; + + break; + + case COLLECT_INACTIVE_OR_FAILED: + if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED)) + return false; + + break; + + default: + assert_not_reached("Unknown garbage collection mode"); + } + + if (u->cgroup_path) { + /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay + * around. Units with active processes should never be collected. 
*/ + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path); + if (r <= 0) + return false; + } + + if (UNIT_VTABLE(u)->may_gc && !UNIT_VTABLE(u)->may_gc(u)) + return false; + + return true; +} + +void unit_add_to_load_queue(Unit *u) { + assert(u); + assert(u->type != _UNIT_TYPE_INVALID); + + if (u->load_state != UNIT_STUB || u->in_load_queue) + return; + + LIST_PREPEND(load_queue, u->manager->load_queue, u); + u->in_load_queue = true; +} + +void unit_add_to_cleanup_queue(Unit *u) { + assert(u); + + if (u->in_cleanup_queue) + return; + + LIST_PREPEND(cleanup_queue, u->manager->cleanup_queue, u); + u->in_cleanup_queue = true; +} + +void unit_add_to_gc_queue(Unit *u) { + assert(u); + + if (u->in_gc_queue || u->in_cleanup_queue) + return; + + if (!unit_may_gc(u)) + return; + + LIST_PREPEND(gc_queue, u->manager->gc_unit_queue, u); + u->in_gc_queue = true; +} + +void unit_add_to_dbus_queue(Unit *u) { + assert(u); + assert(u->type != _UNIT_TYPE_INVALID); + + if (u->load_state == UNIT_STUB || u->in_dbus_queue) + return; + + /* Shortcut things if nobody cares */ + if (sd_bus_track_count(u->manager->subscribed) <= 0 && + sd_bus_track_count(u->bus_track) <= 0 && + set_isempty(u->manager->private_buses)) { + u->sent_dbus_new_signal = true; + return; + } + + LIST_PREPEND(dbus_queue, u->manager->dbus_unit_queue, u); + u->in_dbus_queue = true; +} + +void unit_submit_to_stop_when_unneeded_queue(Unit *u) { + assert(u); + + if (u->in_stop_when_unneeded_queue) + return; + + if (!u->stop_when_unneeded) + return; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return; + + LIST_PREPEND(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u); + u->in_stop_when_unneeded_queue = true; +} + +static void bidi_set_free(Unit *u, Hashmap *h) { + Unit *other; + void *v; + + assert(u); + + /* Frees the hashmap and makes sure we are dropped from 
the inverse pointers */ + + HASHMAP_FOREACH_KEY(v, other, h) { + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) + hashmap_remove(other->dependencies[d], u); + + unit_add_to_gc_queue(other); + } + + hashmap_free(h); +} + +static void unit_remove_transient(Unit *u) { + char **i; + + assert(u); + + if (!u->transient) + return; + + if (u->fragment_path) + (void) unlink(u->fragment_path); + + STRV_FOREACH(i, u->dropin_paths) { + _cleanup_free_ char *p = NULL, *pp = NULL; + + p = dirname_malloc(*i); /* Get the drop-in directory from the drop-in file */ + if (!p) + continue; + + pp = dirname_malloc(p); /* Get the config directory from the drop-in directory */ + if (!pp) + continue; + + /* Only drop transient drop-ins */ + if (!path_equal(u->manager->lookup_paths.transient, pp)) + continue; + + (void) unlink(*i); + (void) rmdir(p); + } +} + +static void unit_free_requires_mounts_for(Unit *u) { + assert(u); + + for (;;) { + _cleanup_free_ char *path; + + path = hashmap_steal_first_key(u->requires_mounts_for); + if (!path) + break; + else { + char s[strlen(path) + 1]; + + PATH_FOREACH_PREFIX_MORE(s, path) { + char *y; + Set *x; + + x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y); + if (!x) + continue; + + (void) set_remove(x, u); + + if (set_isempty(x)) { + (void) hashmap_remove(u->manager->units_requiring_mounts_for, y); + free(y); + set_free(x); + } + } + } + } + + u->requires_mounts_for = hashmap_free(u->requires_mounts_for); +} + +static void unit_done(Unit *u) { + ExecContext *ec; + CGroupContext *cc; + + assert(u); + + if (u->type < 0) + return; + + if (UNIT_VTABLE(u)->done) + UNIT_VTABLE(u)->done(u); + + ec = unit_get_exec_context(u); + if (ec) + exec_context_done(ec); + + cc = unit_get_cgroup_context(u); + if (cc) + cgroup_context_done(cc); +} + +void unit_free(Unit *u) { + char *t; + + if (!u) + return; + + u->transient_file = safe_fclose(u->transient_file); + + if (!MANAGER_IS_RELOADING(u->manager)) + unit_remove_transient(u); + 
+ bus_unit_send_removed_signal(u); + + unit_done(u); + + unit_dequeue_rewatch_pids(u); + + sd_bus_slot_unref(u->match_bus_slot); + sd_bus_track_unref(u->bus_track); + u->deserialized_refs = strv_free(u->deserialized_refs); + u->pending_freezer_message = sd_bus_message_unref(u->pending_freezer_message); + + unit_free_requires_mounts_for(u); + + SET_FOREACH(t, u->aliases) + hashmap_remove_value(u->manager->units, t, u); + if (u->id) + hashmap_remove_value(u->manager->units, u->id, u); + + if (!sd_id128_is_null(u->invocation_id)) + hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (u->job) { + Job *j = u->job; + job_uninstall(j); + job_free(j); + } + + if (u->nop_job) { + Job *j = u->nop_job; + job_uninstall(j); + job_free(j); + } + + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) + bidi_set_free(u, u->dependencies[d]); + + /* A unit is being dropped from the tree, make sure our family is realized properly. Do this after we + * detach the unit from slice tree in order to eliminate its effect on controller masks. 
*/ + if (UNIT_ISSET(u->slice)) + unit_add_family_to_cgroup_realize_queue(UNIT_DEREF(u->slice)); + + if (u->on_console) + manager_unref_console(u->manager); + + unit_release_cgroup(u); + + if (!MANAGER_IS_RELOADING(u->manager)) + unit_unlink_state_files(u); + + unit_unref_uid_gid(u, false); + + (void) manager_update_failed_units(u->manager, u, false); + set_remove(u->manager->startup_units, u); + + unit_unwatch_all_pids(u); + + unit_ref_unset(&u->slice); + while (u->refs_by_target) + unit_ref_unset(u->refs_by_target); + + if (u->type != _UNIT_TYPE_INVALID) + LIST_REMOVE(units_by_type, u->manager->units_by_type[u->type], u); + + if (u->in_load_queue) + LIST_REMOVE(load_queue, u->manager->load_queue, u); + + if (u->in_dbus_queue) + LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u); + + if (u->in_gc_queue) + LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u); + + if (u->in_cgroup_realize_queue) + LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u); + + if (u->in_cgroup_empty_queue) + LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); + + if (u->in_cleanup_queue) + LIST_REMOVE(cleanup_queue, u->manager->cleanup_queue, u); + + if (u->in_target_deps_queue) + LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u); + + if (u->in_stop_when_unneeded_queue) + LIST_REMOVE(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u); + + safe_close(u->ip_accounting_ingress_map_fd); + safe_close(u->ip_accounting_egress_map_fd); + + safe_close(u->ipv4_allow_map_fd); + safe_close(u->ipv6_allow_map_fd); + safe_close(u->ipv4_deny_map_fd); + safe_close(u->ipv6_deny_map_fd); + + bpf_program_unref(u->ip_bpf_ingress); + bpf_program_unref(u->ip_bpf_ingress_installed); + bpf_program_unref(u->ip_bpf_egress); + bpf_program_unref(u->ip_bpf_egress_installed); + + set_free(u->ip_bpf_custom_ingress); + set_free(u->ip_bpf_custom_egress); + set_free(u->ip_bpf_custom_ingress_installed); + set_free(u->ip_bpf_custom_egress_installed); + + 
bpf_program_unref(u->bpf_device_control_installed); + + condition_free_list(u->conditions); + condition_free_list(u->asserts); + + free(u->description); + strv_free(u->documentation); + free(u->fragment_path); + free(u->source_path); + strv_free(u->dropin_paths); + free(u->instance); + + free(u->job_timeout_reboot_arg); + free(u->reboot_arg); + + set_free_free(u->aliases); + free(u->id); + + free(u); +} +/* Returns the freezer state stored in the unit object (u->freezer_state). */ +FreezerState unit_freezer_state(Unit *u) { + assert(u); + + return u->freezer_state; +} +/* Queries the "frozen" key of the "cgroup.events" attribute for u->cgroup_path and stores the corresponding + * state in *ret: "0" yields FREEZER_RUNNING, "1" yields FREEZER_FROZEN, and a missing or different value yields + * _FREEZER_STATE_INVALID. Returns 0 on success, or the negative error returned by cg_get_keyed_attribute(), in + * which case *ret is left untouched. */ +int unit_freezer_state_kernel(Unit *u, FreezerState *ret) { + char *values[1] = {}; + int r; + + assert(u); + + r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", + STRV_MAKE("frozen"), values); + if (r < 0) + return r; + /* From here on r no longer holds an error code but the FreezerState to report; it stays "invalid" + * unless the value read is exactly "0" or "1". */ + r = _FREEZER_STATE_INVALID; + + if (values[0]) { + if (streq(values[0], "0")) + r = FREEZER_RUNNING; + else if (streq(values[0], "1")) + r = FREEZER_FROZEN; + } + + free(values[0]); + *ret = r; + + return 0; +} + +UnitActiveState unit_active_state(Unit *u) { + assert(u); + + if (u->load_state == UNIT_MERGED) + return unit_active_state(unit_follow_merge(u)); + + /* After a reload it might happen that a unit is not correctly + * loaded but still has a process around. That's why we won't + * shortcut failed loading to UNIT_INACTIVE_FAILED. 
*/ + + return UNIT_VTABLE(u)->active_state(u); +} + +const char* unit_sub_state_to_string(Unit *u) { + assert(u); + + return UNIT_VTABLE(u)->sub_state_to_string(u); +} + +static int hashmap_complete_move(Hashmap **s, Hashmap **other) { + assert(s); + assert(other); + + if (!*other) + return 0; + + if (*s) + return hashmap_move(*s, *other); + else + *s = TAKE_PTR(*other); + + return 0; +} + +static int merge_names(Unit *u, Unit *other) { + char *name; + int r; + + assert(u); + assert(other); + + r = unit_add_alias(u, other->id); + if (r < 0) + return r; + + r = set_move(u->aliases, other->aliases); + if (r < 0) { + set_remove(u->aliases, other->id); + return r; + } + + TAKE_PTR(other->id); + other->aliases = set_free_free(other->aliases); + + SET_FOREACH(name, u->aliases) + assert_se(hashmap_replace(u->manager->units, name, u) == 0); + + return 0; +} + +static int reserve_dependencies(Unit *u, Unit *other, UnitDependency d) { + unsigned n_reserve; + + assert(u); + assert(other); + assert(d < _UNIT_DEPENDENCY_MAX); + + /* + * If u does not have this dependency set allocated, there is no need + * to reserve anything. In that case other's set will be transferred + * as a whole to u by complete_move(). + */ + if (!u->dependencies[d]) + return 0; + + /* merge_dependencies() will skip a u-on-u dependency */ + n_reserve = hashmap_size(other->dependencies[d]) - !!hashmap_get(other->dependencies[d], u); + + return hashmap_reserve(u->dependencies[d], n_reserve); +} + +static void merge_dependencies(Unit *u, Unit *other, const char *other_id, UnitDependency d) { + Unit *back; + void *v; + int r; + + /* Merges all dependencies of type 'd' of the unit 'other' into the deps of the unit 'u' */ + + assert(u); + assert(other); + assert(d < _UNIT_DEPENDENCY_MAX); + + /* Fix backwards pointers. Let's iterate through all dependent units of the other unit. 
*/ + HASHMAP_FOREACH_KEY(v, back, other->dependencies[d]) + + /* Let's now iterate through the dependencies of that dependencies of the other units, + * looking for pointers back, and let's fix them up, to instead point to 'u'. */ + for (UnitDependency k = 0; k < _UNIT_DEPENDENCY_MAX; k++) + if (back == u) { + /* Do not add dependencies between u and itself. */ + if (hashmap_remove(back->dependencies[k], other)) + maybe_warn_about_dependency(u, other_id, k); + } else { + UnitDependencyInfo di_u, di_other; + + /* Let's drop this dependency between "back" and "other", and let's create it between + * "back" and "u" instead. Let's merge the bit masks of the dependency we are moving, + * and any such dependency which might already exist */ + + di_other.data = hashmap_get(back->dependencies[k], other); + if (!di_other.data) + continue; /* dependency isn't set, let's try the next one */ + + di_u.data = hashmap_get(back->dependencies[k], u); + + UnitDependencyInfo di_merged = { + .origin_mask = di_u.origin_mask | di_other.origin_mask, + .destination_mask = di_u.destination_mask | di_other.destination_mask, + }; + + r = hashmap_remove_and_replace(back->dependencies[k], other, u, di_merged.data); + if (r < 0) + log_warning_errno(r, "Failed to remove/replace: back=%s other=%s u=%s: %m", back->id, other_id, u->id); + assert(r >= 0); + + /* assert_se(hashmap_remove_and_replace(back->dependencies[k], other, u, di_merged.data) >= 0); */ + } + + /* Also do not move dependencies on u to itself */ + back = hashmap_remove(other->dependencies[d], u); + if (back) + maybe_warn_about_dependency(u, other_id, d); + + /* The move cannot fail. The caller must have performed a reservation. 
*/ + assert_se(hashmap_complete_move(&u->dependencies[d], &other->dependencies[d]) == 0); + + other->dependencies[d] = hashmap_free(other->dependencies[d]); +} + +int unit_merge(Unit *u, Unit *other) { + const char *other_id = NULL; + int r; + + assert(u); + assert(other); + assert(u->manager == other->manager); + assert(u->type != _UNIT_TYPE_INVALID); + + other = unit_follow_merge(other); + + if (other == u) + return 0; + + if (u->type != other->type) + return -EINVAL; + + if (!unit_type_may_alias(u->type)) /* Merging only applies to unit names that support aliases */ + return -EEXIST; + + if (!IN_SET(other->load_state, UNIT_STUB, UNIT_NOT_FOUND)) + return -EEXIST; + + if (!streq_ptr(u->instance, other->instance)) + return -EINVAL; + + if (other->job) + return -EEXIST; + + if (other->nop_job) + return -EEXIST; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + return -EEXIST; + + if (other->id) + other_id = strdupa(other->id); + + /* Make reservations to ensure merge_dependencies() won't fail */ + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + r = reserve_dependencies(u, other, d); + /* + * We don't rollback reservations if we fail. We don't have + * a way to undo reservations. A reservation is not a leak. + */ + if (r < 0) + return r; + } + + /* Merge names */ + r = merge_names(u, other); + if (r < 0) + return r; + + /* Redirect all references */ + while (other->refs_by_target) + unit_ref_set(other->refs_by_target, other->refs_by_target->source, u); + + /* Merge dependencies */ + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) + merge_dependencies(u, other, other_id, d); + + other->load_state = UNIT_MERGED; + other->merged_into = u; + + /* If there is still some data attached to the other node, we + * don't need it anymore, and can free it. 
*/ + if (other->load_state != UNIT_STUB) + if (UNIT_VTABLE(other)->done) + UNIT_VTABLE(other)->done(other); + + unit_add_to_dbus_queue(u); + unit_add_to_cleanup_queue(other); + + return 0; +} + +int unit_merge_by_name(Unit *u, const char *name) { + _cleanup_free_ char *s = NULL; + Unit *other; + int r; + + /* Either add name to u, or if a unit with name already exists, merge it with u. + * If name is a template, do the same for name@instance, where instance is u's instance. */ + + assert(u); + assert(name); + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + if (!u->instance) + return -EINVAL; + + r = unit_name_replace_instance(name, u->instance, &s); + if (r < 0) + return r; + + name = s; + } + + other = manager_get_unit(u->manager, name); + if (other) + return unit_merge(u, other); + + return unit_add_name(u, name); +} + +Unit* unit_follow_merge(Unit *u) { + assert(u); + + while (u->load_state == UNIT_MERGED) + assert_se(u = u->merged_into); + + return u; +} + +int unit_add_exec_dependencies(Unit *u, ExecContext *c) { + int r; + + assert(u); + assert(c); + + if (c->working_directory && !c->working_directory_missing_ok) { + r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_directory) { + r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_image) { + r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + if (!u->manager->prefix[dt]) + continue; + + char **dp; + STRV_FOREACH(dp, c->directories[dt].paths) { + _cleanup_free_ char *p; + + p = path_join(u->manager->prefix[dt], *dp); + if (!p) + return -ENOMEM; + + r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + } + + if (!MANAGER_IS_SYSTEM(u->manager)) + return 0; + + /* For the following three directory types we need write 
access, and /var/ is possibly on the root + * fs. Hence order after systemd-remount-fs.service, to ensure things are writable. */ + if (!strv_isempty(c->directories[EXEC_DIRECTORY_STATE].paths) || + !strv_isempty(c->directories[EXEC_DIRECTORY_CACHE].paths) || + !strv_isempty(c->directories[EXEC_DIRECTORY_LOGS].paths)) { + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->private_tmp) { + const char *p; + + FOREACH_STRING(p, "/tmp", "/var/tmp") { + r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_TMPFILES_SETUP_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (c->root_image) { + /* We need to wait for /dev/loopX to appear when doing RootImage=, hence let's add an + * implicit dependency on udev */ + + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_UDEVD_SERVICE, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + + if (!IN_SET(c->std_output, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) && + !IN_SET(c->std_error, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) && + !c->log_namespace) + return 0; + + /* If syslog or kernel logging is requested (or log namespacing is), make sure our own logging daemon + * is run first. 
*/ + + if (c->log_namespace) { + _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL; + + r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit); + if (r < 0) + return r; + + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } else + r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + return 0; +} + +const char *unit_description(Unit *u) { + assert(u); + + if (u->description) + return u->description; + + return strna(u->id); +} + +const char *unit_status_string(Unit *u) { + assert(u); + + if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_NAME && u->id) + return u->id; + + return unit_description(u); +} + +static void print_unit_dependency_mask(FILE *f, const char *kind, UnitDependencyMask mask, bool *space) { + const struct { + UnitDependencyMask mask; + const char *name; + } table[] = { + { UNIT_DEPENDENCY_FILE, "file" }, + { UNIT_DEPENDENCY_IMPLICIT, "implicit" }, + { UNIT_DEPENDENCY_DEFAULT, "default" }, + { UNIT_DEPENDENCY_UDEV, "udev" }, + { UNIT_DEPENDENCY_PATH, "path" }, + { UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT, "mountinfo-implicit" }, + { UNIT_DEPENDENCY_MOUNTINFO_DEFAULT, "mountinfo-default" }, + { UNIT_DEPENDENCY_PROC_SWAP, "proc-swap" }, + }; + + assert(f); + assert(kind); + assert(space); + + for (size_t i = 0; i < ELEMENTSOF(table); i++) { + + if (mask == 0) + break; + + if (FLAGS_SET(mask, table[i].mask)) { + if (*space) + fputc(' ', f); + else + *space = true; + + fputs(kind, f); + fputs("-", f); + fputs(table[i].name, f); + + mask &= 
~table[i].mask; + } + } + + assert(mask == 0); +} + +void unit_dump(Unit *u, FILE *f, const char *prefix) { + char *t, **j; + const char *prefix2; + char timestamp[5][FORMAT_TIMESTAMP_MAX], timespan[FORMAT_TIMESPAN_MAX]; + Unit *following; + _cleanup_set_free_ Set *following_set = NULL; + CGroupMask m; + int r; + + assert(u); + assert(u->type >= 0); + + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + + fprintf(f, + "%s-> Unit %s:\n", + prefix, u->id); + + SET_FOREACH(t, u->aliases) + fprintf(f, "%s\tAlias: %s\n", prefix, t); + + fprintf(f, + "%s\tDescription: %s\n" + "%s\tInstance: %s\n" + "%s\tUnit Load State: %s\n" + "%s\tUnit Active State: %s\n" + "%s\tState Change Timestamp: %s\n" + "%s\tInactive Exit Timestamp: %s\n" + "%s\tActive Enter Timestamp: %s\n" + "%s\tActive Exit Timestamp: %s\n" + "%s\tInactive Enter Timestamp: %s\n" + "%s\tMay GC: %s\n" + "%s\tNeed Daemon Reload: %s\n" + "%s\tTransient: %s\n" + "%s\tPerpetual: %s\n" + "%s\tGarbage Collection Mode: %s\n" + "%s\tSlice: %s\n" + "%s\tCGroup: %s\n" + "%s\tCGroup realized: %s\n", + prefix, unit_description(u), + prefix, strna(u->instance), + prefix, unit_load_state_to_string(u->load_state), + prefix, unit_active_state_to_string(unit_active_state(u)), + prefix, strna(format_timestamp(timestamp[0], sizeof(timestamp[0]), u->state_change_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp[1], sizeof(timestamp[1]), u->inactive_exit_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp[2], sizeof(timestamp[2]), u->active_enter_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp[3], sizeof(timestamp[3]), u->active_exit_timestamp.realtime)), + prefix, strna(format_timestamp(timestamp[4], sizeof(timestamp[4]), u->inactive_enter_timestamp.realtime)), + prefix, yes_no(unit_may_gc(u)), + prefix, yes_no(unit_need_daemon_reload(u)), + prefix, yes_no(u->transient), + prefix, yes_no(u->perpetual), + prefix, collect_mode_to_string(u->collect_mode), + prefix, 
strna(unit_slice_name(u)), + prefix, strna(u->cgroup_path), + prefix, yes_no(u->cgroup_realized)); + + if (u->cgroup_realized_mask != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(u->cgroup_realized_mask, &s); + fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s)); + } + + if (u->cgroup_enabled_mask != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(u->cgroup_enabled_mask, &s); + fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_own_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_members_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s)); + } + + m = unit_get_delegate_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s)); + } + + if (!sd_id128_is_null(u->invocation_id)) + fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n", + prefix, SD_ID128_FORMAT_VAL(u->invocation_id)); + + STRV_FOREACH(j, u->documentation) + fprintf(f, "%s\tDocumentation: %s\n", prefix, *j); + + following = unit_following(u); + if (following) + fprintf(f, "%s\tFollowing: %s\n", prefix, following->id); + + r = unit_following_set(u, &following_set); + if (r >= 0) { + Unit *other; + + SET_FOREACH(other, following_set) + fprintf(f, "%s\tFollowing Set Member: %s\n", prefix, other->id); + } + + if (u->fragment_path) + fprintf(f, "%s\tFragment Path: %s\n", prefix, u->fragment_path); + + if (u->source_path) + fprintf(f, "%s\tSource Path: %s\n", prefix, u->source_path); + + STRV_FOREACH(j, u->dropin_paths) + fprintf(f, "%s\tDropIn Path: %s\n", prefix, *j); + + if (u->failure_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tFailure Action: %s\n", prefix, 
emergency_action_to_string(u->failure_action)); + if (u->failure_action_exit_status >= 0) + fprintf(f, "%s\tFailure Action Exit Status: %i\n", prefix, u->failure_action_exit_status); + if (u->success_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tSuccess Action: %s\n", prefix, emergency_action_to_string(u->success_action)); + if (u->success_action_exit_status >= 0) + fprintf(f, "%s\tSuccess Action Exit Status: %i\n", prefix, u->success_action_exit_status); + + if (u->job_timeout != USEC_INFINITY) + fprintf(f, "%s\tJob Timeout: %s\n", prefix, format_timespan(timespan, sizeof(timespan), u->job_timeout, 0)); + + if (u->job_timeout_action != EMERGENCY_ACTION_NONE) + fprintf(f, "%s\tJob Timeout Action: %s\n", prefix, emergency_action_to_string(u->job_timeout_action)); + + if (u->job_timeout_reboot_arg) + fprintf(f, "%s\tJob Timeout Reboot Argument: %s\n", prefix, u->job_timeout_reboot_arg); + + condition_dump_list(u->conditions, f, prefix, condition_type_to_string); + condition_dump_list(u->asserts, f, prefix, assert_type_to_string); + + if (dual_timestamp_is_set(&u->condition_timestamp)) + fprintf(f, + "%s\tCondition Timestamp: %s\n" + "%s\tCondition Result: %s\n", + prefix, strna(format_timestamp(timestamp[0], sizeof(timestamp[0]), u->condition_timestamp.realtime)), + prefix, yes_no(u->condition_result)); + + if (dual_timestamp_is_set(&u->assert_timestamp)) + fprintf(f, + "%s\tAssert Timestamp: %s\n" + "%s\tAssert Result: %s\n", + prefix, strna(format_timestamp(timestamp[0], sizeof(timestamp[0]), u->assert_timestamp.realtime)), + prefix, yes_no(u->assert_result)); + + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + UnitDependencyInfo di; + Unit *other; + + HASHMAP_FOREACH_KEY(di.data, other, u->dependencies[d]) { + bool space = false; + + fprintf(f, "%s\t%s: %s (", prefix, unit_dependency_to_string(d), other->id); + + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, 
&space); + + fputs(")\n", f); + } + } + + if (!hashmap_isempty(u->requires_mounts_for)) { + UnitDependencyInfo di; + const char *path; + + HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) { + bool space = false; + + fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path); + + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, &space); + + fputs(")\n", f); + } + } + + if (u->load_state == UNIT_LOADED) { + + fprintf(f, + "%s\tStopWhenUnneeded: %s\n" + "%s\tRefuseManualStart: %s\n" + "%s\tRefuseManualStop: %s\n" + "%s\tDefaultDependencies: %s\n" + "%s\tOnFailureJobMode: %s\n" + "%s\tIgnoreOnIsolate: %s\n", + prefix, yes_no(u->stop_when_unneeded), + prefix, yes_no(u->refuse_manual_start), + prefix, yes_no(u->refuse_manual_stop), + prefix, yes_no(u->default_dependencies), + prefix, job_mode_to_string(u->on_failure_job_mode), + prefix, yes_no(u->ignore_on_isolate)); + + if (UNIT_VTABLE(u)->dump) + UNIT_VTABLE(u)->dump(u, f, prefix2); + + } else if (u->load_state == UNIT_MERGED) + fprintf(f, + "%s\tMerged into: %s\n", + prefix, u->merged_into->id); + else if (u->load_state == UNIT_ERROR) + fprintf(f, "%s\tLoad Error Code: %s\n", prefix, strerror_safe(u->load_error)); + + for (const char *n = sd_bus_track_first(u->bus_track); n; n = sd_bus_track_next(u->bus_track)) + fprintf(f, "%s\tBus Ref: %s\n", prefix, n); + + if (u->job) + job_dump(u->job, f, prefix2); + + if (u->nop_job) + job_dump(u->nop_job, f, prefix2); +} + +/* Common implementation for multiple backends */ +int unit_load_fragment_and_dropin(Unit *u, bool fragment_required) { + int r; + + assert(u); + + /* Load a .{service,socket,...} file */ + r = unit_load_fragment(u); + if (r < 0) + return r; + + if (u->load_state == UNIT_STUB) { + if (fragment_required) + return -ENOENT; + + u->load_state = UNIT_LOADED; + } + + /* Load drop-in directory data. If u is an alias, we might be reloading the + * target unit needlessly. 
But we cannot be sure which drop-ins have already + * been loaded and which not, at least without doing complicated book-keeping, + * so let's always reread all drop-ins. */ + r = unit_load_dropin(unit_follow_merge(u)); + if (r < 0) + return r; + + if (u->source_path) { + struct stat st; + + if (stat(u->source_path, &st) >= 0) + u->source_mtime = timespec_load(&st.st_mtim); + else + u->source_mtime = 0; + } + + return 0; +} + +/* Puts the unit at the head of the manager's target_deps_queue, unless it is already queued. */ +void unit_add_to_target_deps_queue(Unit *u) { + Manager *m; + + /* Check the pointer before its first dereference; asserting after reading u->manager could never fire. */ + assert(u); + + m = u->manager; + + if (u->in_target_deps_queue) + return; + + LIST_PREPEND(target_deps_queue, m->target_deps_queue, u); + u->in_target_deps_queue = true; +} + +/* Adds a default-origin (UNIT_DEPENDENCY_DEFAULT) UNIT_AFTER dependency from 'target' on 'u'. Returns 0 without + * adding anything if 'target' is not a target unit, if either unit is not loaded, if either unit has default + * dependencies turned off, or if 'target' already has a UNIT_BEFORE entry for 'u'. Otherwise returns the result + * of unit_add_dependency(). */ +int unit_add_default_target_dependency(Unit *u, Unit *target) { + assert(u); + assert(target); + + if (target->type != UNIT_TARGET) + return 0; + + /* Only add the dependency if both units are loaded, so that + * the loop check below is reliable */ + if (u->load_state != UNIT_LOADED || + target->load_state != UNIT_LOADED) + return 0; + + /* If either side wants no automatic dependencies, then let's + * skip this */ + if (!u->default_dependencies || + !target->default_dependencies) + return 0; + + /* Don't create loops */ + if (hashmap_get(target->dependencies[UNIT_BEFORE], u)) + return 0; + + return unit_add_dependency(target, UNIT_AFTER, u, true, UNIT_DEPENDENCY_DEFAULT); +} + +static int unit_add_slice_dependencies(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + /* Slice units are implicitly ordered against their parent slices (as this relationship is encoded in the + name), while all other units are ordered based on configuration (as in their case Slice= configures the + relationship). */ + UnitDependencyMask mask = u->type == UNIT_SLICE ? 
UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE; + + if (UNIT_ISSET(u->slice)) + return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, UNIT_DEREF(u->slice), true, mask); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return 0; + + return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_ROOT_SLICE, true, mask); +} + +static int unit_add_mount_dependencies(Unit *u) { + UnitDependencyInfo di; + const char *path; + int r; + + assert(u); + + HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) { + char prefix[strlen(path) + 1]; + + PATH_FOREACH_PREFIX_MORE(prefix, path) { + _cleanup_free_ char *p = NULL; + Unit *m; + + r = unit_name_from_path(prefix, ".mount", &p); + if (r < 0) + return r; + + m = manager_get_unit(u->manager, p); + if (!m) { + /* Make sure to load the mount unit if + * it exists. If so the dependencies + * on this unit will be added later + * during the loading of the mount + * unit. */ + (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m); + continue; + } + if (m == u) + continue; + + if (m->load_state != UNIT_LOADED) + continue; + + r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask); + if (r < 0) + return r; + + if (m->fragment_path) { + r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask); + if (r < 0) + return r; + } + } + } + + return 0; +} + +static int unit_add_oomd_dependencies(Unit *u) { + CGroupContext *c; + bool wants_oomd; + int r; + + assert(u); + + if (!u->default_dependencies) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + wants_oomd = (c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL); + if (!wants_oomd) + return 0; + + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + return 0; +} + +static int unit_add_startup_units(Unit *u) { + CGroupContext *c; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + if 
(c->startup_cpu_shares == CGROUP_CPU_SHARES_INVALID && + c->startup_io_weight == CGROUP_WEIGHT_INVALID && + c->startup_blockio_weight == CGROUP_BLKIO_WEIGHT_INVALID) + return 0; + + return set_ensure_put(&u->manager->startup_units, NULL, u); +} + +int unit_load(Unit *u) { + int r; + + assert(u); + + if (u->in_load_queue) { + LIST_REMOVE(load_queue, u->manager->load_queue, u); + u->in_load_queue = false; + } + + if (u->type == _UNIT_TYPE_INVALID) + return -EINVAL; + + if (u->load_state != UNIT_STUB) + return 0; + + if (u->transient_file) { + /* Finalize transient file: if this is a transient unit file, as soon as we reach unit_load() the setup + * is complete, hence let's synchronize the unit file we just wrote to disk. */ + + r = fflush_and_check(u->transient_file); + if (r < 0) + goto fail; + + u->transient_file = safe_fclose(u->transient_file); + u->fragment_mtime = now(CLOCK_REALTIME); + } + + r = UNIT_VTABLE(u)->load(u); + if (r < 0) + goto fail; + + assert(u->load_state != UNIT_STUB); + + if (u->load_state == UNIT_LOADED) { + unit_add_to_target_deps_queue(u); + + r = unit_add_slice_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_mount_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_oomd_dependencies(u); + if (r < 0) + goto fail; + + r = unit_add_startup_units(u); + if (r < 0) + goto fail; + + if (u->on_failure_job_mode == JOB_ISOLATE && hashmap_size(u->dependencies[UNIT_ON_FAILURE]) > 1) { + log_unit_error(u, "More than one OnFailure= dependencies specified but OnFailureJobMode=isolate set. 
Refusing."); + r = -ENOEXEC; + goto fail; + } + + if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout) + log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect."); + + /* We finished loading, let's ensure our parents recalculate the members mask */ + unit_invalidate_cgroup_members_masks(u); + } + + assert((u->load_state != UNIT_MERGED) == !u->merged_into); + + unit_add_to_dbus_queue(unit_follow_merge(u)); + unit_add_to_gc_queue(u); + (void) manager_varlink_send_managed_oom_update(u); + + return 0; + +fail: + /* We convert ENOEXEC errors to the UNIT_BAD_SETTING load state here. Configuration parsing code + * should hence return ENOEXEC to ensure units are placed in this state after loading. */ + + u->load_state = u->load_state == UNIT_STUB ? UNIT_NOT_FOUND : + r == -ENOEXEC ? UNIT_BAD_SETTING : + UNIT_ERROR; + u->load_error = r; + + /* Record the timestamp on the cache, so that if the cache gets updated between now and the next time + * an attempt is made to load this unit, we know we need to check again. */ + if (u->load_state == UNIT_NOT_FOUND) + u->fragment_not_found_timestamp_hash = u->manager->unit_cache_timestamp_hash; + + unit_add_to_dbus_queue(u); + unit_add_to_gc_queue(u); + + return log_unit_debug_errno(u, r, "Failed to load configuration: %m"); +} + +_printf_(7, 8) +static int log_unit_internal(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) 
{ + Unit *u = userdata; + va_list ap; + int r; + + va_start(ap, format); + if (u) + r = log_object_internalv(level, error, file, line, func, + u->manager->unit_log_field, + u->id, + u->manager->invocation_log_field, + u->invocation_id_string, + format, ap); + else + r = log_internalv(level, error, file, line, func, format, ap); + va_end(ap); + + return r; +} + +static bool unit_test_condition(Unit *u) { + _cleanup_strv_free_ char **env = NULL; + int r; + + assert(u); + + dual_timestamp_get(&u->condition_timestamp); + + r = manager_get_effective_environment(u->manager, &env); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to determine effective environment: %m"); + u->condition_result = CONDITION_ERROR; + } else + u->condition_result = condition_test_list( + u->conditions, + env, + condition_type_to_string, + log_unit_internal, + u); + + unit_add_to_dbus_queue(u); + return u->condition_result; +} + +static bool unit_test_assert(Unit *u) { + _cleanup_strv_free_ char **env = NULL; + int r; + + assert(u); + + dual_timestamp_get(&u->assert_timestamp); + + r = manager_get_effective_environment(u->manager, &env); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to determine effective environment: %m"); + u->assert_result = CONDITION_ERROR; + } else + u->assert_result = condition_test_list( + u->asserts, + env, + assert_type_to_string, + log_unit_internal, + u); + + unit_add_to_dbus_queue(u); + return u->assert_result; +} + +void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *unit_status_msg_format) { + const char *d; + + d = unit_status_string(u); + if (log_get_show_color()) + d = strjoina(ANSI_HIGHLIGHT, d, ANSI_NORMAL); + + DISABLE_WARNING_FORMAT_NONLITERAL; + manager_status_printf(u->manager, status_type, status, unit_status_msg_format, d); + REENABLE_WARNING; +} + +int unit_test_start_limit(Unit *u) { + const char *reason; + + assert(u); + + if (ratelimit_below(&u->start_ratelimit)) { + u->start_limit_hit = false; + return 
0; + } + + log_unit_warning(u, "Start request repeated too quickly."); + u->start_limit_hit = true; + + reason = strjoina("unit ", u->id, " failed"); + + emergency_action(u->manager, u->start_limit_action, + EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN, + u->reboot_arg, -1, reason); + + return -ECANCELED; +} + +bool unit_shall_confirm_spawn(Unit *u) { + assert(u); + + if (manager_is_confirm_spawn_disabled(u->manager)) + return false; + + /* For some reasons units remaining in the same process group + * as PID 1 fail to acquire the console even if it's not used + * by any process. So skip the confirmation question for them. */ + return !unit_get_exec_context(u)->same_pgrp; +} + +static bool unit_verify_deps(Unit *u) { + Unit *other; + void *v; + + assert(u); + + /* Checks whether all BindsTo= dependencies of this unit are fulfilled — if they are also combined with + * After=. We do not check Requires= or Requisite= here as they only should have an effect on the job + * processing, but do not have any effect afterwards. We don't check BindsTo= dependencies that are not used in + * conjunction with After= as for them any such check would make things entirely racy. */ + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BINDS_TO]) { + + if (!hashmap_contains(u->dependencies[UNIT_AFTER], other)) + continue; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) { + log_unit_notice(u, "Bound to unit %s, but unit isn't active.", other->id); + return false; + } + } + + return true; +} + +/* Errors that aren't really errors: + * -EALREADY: Unit is already started. + * -ECOMM: Condition failed + * -EAGAIN: An operation is already in progress. Retry later. + * + * Errors that are real errors: + * -EBADR: This unit type does not support starting. + * -ECANCELED: Start limit hit, too many requests for now + * -EPROTO: Assert failed + * -EINVAL: Unit not loaded + * -EOPNOTSUPP: Unit type not supported + * -ENOLINK: The necessary dependencies are not fulfilled. 
+ * -ESTALE: This unit has been started before and can't be started a second time + * -ENOENT: This is a triggering unit and unit to trigger is not loaded + */ +int unit_start(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + /* If this is already started, then this will succeed. Note that this will even succeed if this unit + * is not startable by the user. This is relied on to detect when we need to wait for units and when + * waiting is finished. */ + state = unit_active_state(u); + if (UNIT_IS_ACTIVE_OR_RELOADING(state)) + return -EALREADY; + if (state == UNIT_MAINTENANCE) + return -EAGAIN; + + /* Units that aren't loaded cannot be started */ + if (u->load_state != UNIT_LOADED) + return -EINVAL; + + /* Refuse starting scope units more than once */ + if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_enter_timestamp)) + return -ESTALE; + + /* If the conditions failed, don't do anything at all. If we already are activating this call might + * still be useful to speed up activation in case there is some hold-off time, but we don't want to + * recheck the condition in that case. */ + if (state != UNIT_ACTIVATING && + !unit_test_condition(u)) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(ECOMM), "Starting requested but condition failed. Not starting unit."); + + /* If the asserts failed, fail the entire job */ + if (state != UNIT_ACTIVATING && + !unit_test_assert(u)) + return log_unit_notice_errno(u, SYNTHETIC_ERRNO(EPROTO), "Starting requested but asserts failed."); + + /* Units of types that aren't supported cannot be started. Note that we do this test only after the + * condition checks, so that we rather return condition check errors (which are usually not + * considered a true failure) than "not supported" errors (which are considered a failure). + */ + if (!unit_type_supported(u->type)) + return -EOPNOTSUPP; + + /* Let's make sure that the deps really are in order before we start this. 
Normally the job engine + * should have taken care of this already, but let's check this here again. After all, our + * dependencies might not be in effect anymore, due to a reload or due to a failed condition. */ + if (!unit_verify_deps(u)) + return -ENOLINK; + + /* Forward to the main object, if we aren't it. */ + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting start request from %s to %s.", u->id, following->id); + return unit_start(following); + } + + /* If it is stopped, but we cannot start it, then fail */ + if (!UNIT_VTABLE(u)->start) + return -EBADR; + + /* We don't suppress calls to ->start() here when we are already starting, to allow this request to + * be used as a "hurry up" call, for example when the unit is in some "auto restart" state where it + * waits for a holdoff timer to elapse before it will start again. */ + + unit_add_to_dbus_queue(u); + unit_cgroup_freezer_action(u, FREEZER_THAW); + + return UNIT_VTABLE(u)->start(u); +} + +bool unit_can_start(Unit *u) { + assert(u); + + if (u->load_state != UNIT_LOADED) + return false; + + if (!unit_type_supported(u->type)) + return false; + + /* Scope units may be started only once */ + if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_exit_timestamp)) + return false; + + return !!UNIT_VTABLE(u)->start; +} + +bool unit_can_isolate(Unit *u) { + assert(u); + + return unit_can_start(u) && + u->allow_isolate; +} + +/* Errors: + * -EBADR: This unit type does not support stopping. + * -EALREADY: Unit is already stopped. + * -EAGAIN: An operation is already in progress. Retry later. 
+ */ +int unit_stop(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + state = unit_active_state(u); + if (UNIT_IS_INACTIVE_OR_FAILED(state)) + return -EALREADY; + + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting stop request from %s to %s.", u->id, following->id); + return unit_stop(following); + } + + if (!UNIT_VTABLE(u)->stop) + return -EBADR; + + unit_add_to_dbus_queue(u); + unit_cgroup_freezer_action(u, FREEZER_THAW); + + return UNIT_VTABLE(u)->stop(u); +} + +bool unit_can_stop(Unit *u) { + assert(u); + + /* Note: if we return true here, it does not mean that the unit may be successfully stopped. + * Extrinsic units follow external state and they may stop following external state changes + * (hence we return true here), but an attempt to do this through the manager will fail. */ + + if (!unit_type_supported(u->type)) + return false; + + if (u->perpetual) + return false; + + return !!UNIT_VTABLE(u)->stop; +} + +/* Errors: + * -EBADR: This unit type does not support reloading. + * -ENOEXEC: Unit is not started. + * -EAGAIN: An operation is already in progress. Retry later. 
+ */ +int unit_reload(Unit *u) { + UnitActiveState state; + Unit *following; + + assert(u); + + if (u->load_state != UNIT_LOADED) + return -EINVAL; + + if (!unit_can_reload(u)) + return -EBADR; + + state = unit_active_state(u); + if (state == UNIT_RELOADING) + return -EAGAIN; + + if (state != UNIT_ACTIVE) { + log_unit_warning(u, "Unit cannot be reloaded because it is inactive."); + return -ENOEXEC; + } + + following = unit_following(u); + if (following) { + log_unit_debug(u, "Redirecting reload request from %s to %s.", u->id, following->id); + return unit_reload(following); + } + + unit_add_to_dbus_queue(u); + + if (!UNIT_VTABLE(u)->reload) { + /* Unit doesn't have a reload function, but we need to propagate the reload anyway */ + unit_notify(u, unit_active_state(u), unit_active_state(u), 0); + return 0; + } + + unit_cgroup_freezer_action(u, FREEZER_THAW); + + return UNIT_VTABLE(u)->reload(u); +} + +bool unit_can_reload(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->can_reload) + return UNIT_VTABLE(u)->can_reload(u); + + if (!hashmap_isempty(u->dependencies[UNIT_PROPAGATES_RELOAD_TO])) + return true; + + return UNIT_VTABLE(u)->reload; +} + +bool unit_is_unneeded(Unit *u) { + static const UnitDependency deps[] = { + UNIT_REQUIRED_BY, + UNIT_REQUISITE_OF, + UNIT_WANTED_BY, + UNIT_BOUND_BY, + }; + + assert(u); + + if (!u->stop_when_unneeded) + return false; + + /* Don't clean up while the unit is transitioning or is even inactive. */ + if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u))) + return false; + if (u->job) + return false; + + for (size_t j = 0; j < ELEMENTSOF(deps); j++) { + Unit *other; + void *v; + + /* If a dependent unit has a job queued, is active or transitioning, or is marked for + * restart, then don't clean this one up. 
*/ + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[deps[j]]) { + if (other->job) + return false; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + return false; + + if (unit_will_restart(other)) + return false; + } + } + + return true; +} + +static void check_unneeded_dependencies(Unit *u) { + + static const UnitDependency deps[] = { + UNIT_REQUIRES, + UNIT_REQUISITE, + UNIT_WANTS, + UNIT_BINDS_TO, + }; + + assert(u); + + /* Add all units this unit depends on to the queue that processes StopWhenUnneeded= behaviour. */ + + for (size_t j = 0; j < ELEMENTSOF(deps); j++) { + Unit *other; + void *v; + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[deps[j]]) + unit_submit_to_stop_when_unneeded_queue(other); + } +} + +static void unit_check_binds_to(Unit *u) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool stop = false; + Unit *other; + void *v; + int r; + + assert(u); + + if (u->job) + return; + + if (unit_active_state(u) != UNIT_ACTIVE) + return; + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BINDS_TO]) { + if (other->job) + continue; + + if (!other->coldplugged) + /* We might yet create a job for the other unit… */ + continue; + + if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) + continue; + + stop = true; + break; + } + + if (!stop) + return; + + /* If stopping a unit fails continuously we might enter a stop + * loop here, hence stop acting on the service being + * unnecessary after a while. */ + if (!ratelimit_below(&u->auto_stop_ratelimit)) { + log_unit_warning(u, "Unit is bound to inactive unit %s, but not stopping since we tried this too often recently.", other->id); + return; + } + + assert(other); + log_unit_info(u, "Unit is bound to inactive unit %s. Stopping, too.", other->id); + + /* A unit we need to run is gone. Sniff. Let's stop this. 
*/ + r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r)); +} + +static void retroactively_start_dependencies(Unit *u) { + Unit *other; + void *v; + + assert(u); + assert(UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u))); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_REQUIRES]) + if (!hashmap_get(u->dependencies[UNIT_AFTER], other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BINDS_TO]) + if (!hashmap_get(u->dependencies[UNIT_AFTER], other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_WANTS]) + if (!hashmap_get(u->dependencies[UNIT_AFTER], other) && + !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_CONFLICTS]) + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_CONFLICTED_BY]) + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); +} + +static void retroactively_stop_dependencies(Unit *u) { + Unit *other; + void *v; + + assert(u); + assert(UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u))); + + /* Pull down units which are bound to us recursively if enabled */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_BOUND_BY]) + if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) + manager_add_job(u->manager, JOB_STOP, other, 
JOB_REPLACE, NULL, NULL, NULL); +} + +void unit_start_on_failure(Unit *u) { + Unit *other; + void *v; + int r; + + assert(u); + + if (hashmap_size(u->dependencies[UNIT_ON_FAILURE]) <= 0) + return; + + log_unit_info(u, "Triggering OnFailure= dependencies."); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_ON_FAILURE]) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + r = manager_add_job(u->manager, JOB_START, other, u->on_failure_job_mode, NULL, &error, NULL); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to enqueue OnFailure= job, ignoring: %s", bus_error_message(&error, r)); + } +} + +void unit_trigger_notify(Unit *u) { + Unit *other; + void *v; + + assert(u); + + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_TRIGGERED_BY]) + if (UNIT_VTABLE(other)->trigger_notify) + UNIT_VTABLE(other)->trigger_notify(other, u); +} + +static int raise_level(int log_level, bool condition_info, bool condition_notice) { + if (condition_notice && log_level > LOG_NOTICE) + return LOG_NOTICE; + if (condition_info && log_level > LOG_INFO) + return LOG_INFO; + return log_level; +} + +static int unit_log_resources(Unit *u) { + struct iovec iovec[1 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4]; + bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false; + _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL; + int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */ + size_t n_message_parts = 0, n_iovec = 0; + char* message_parts[1 + 2 + 2 + 1], *t; + nsec_t nsec = NSEC_INFINITY; + int r; + const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES", + [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS", + [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES", + [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS", + }; + const char* const 
io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "IO_METRIC_READ_BYTES", + [CGROUP_IO_WRITE_BYTES] = "IO_METRIC_WRITE_BYTES", + [CGROUP_IO_READ_OPERATIONS] = "IO_METRIC_READ_OPERATIONS", + [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS", + }; + + assert(u); + + /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource + * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced + * information and the complete data in structured fields. */ + + (void) unit_get_cpu_usage(u, &nsec); + if (nsec != NSEC_INFINITY) { + char buf[FORMAT_TIMESPAN_MAX] = ""; + + /* Format the CPU time for inclusion in the structured log message */ + if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the CPU time for inclusion in the human language message string */ + format_timespan(buf, sizeof(buf), nsec / NSEC_PER_USEC, USEC_PER_MSEC); + t = strjoin("consumed ", buf, " CPU time"); + if (!t) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = t; + + log_level = raise_level(log_level, + nsec > NOTICEWORTHY_CPU_NSEC, + nsec > MENTIONWORTHY_CPU_NSEC); + } + + for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) { + char buf[FORMAT_BYTES_MAX] = ""; + uint64_t value = UINT64_MAX; + + assert(io_fields[k]); + + (void) unit_get_io_accounting(u, k, k > 0, &value); + if (value == UINT64_MAX) + continue; + + have_io_accounting = true; + if (value > 0) + any_io = true; + + /* Format IO accounting data for inclusion in the structured log message */ + if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the IO accounting data for inclusion in the human language message string, but only + * for the bytes counters (and not 
for the operations counters) */ + if (k == CGROUP_IO_READ_BYTES) { + assert(!rr); + rr = strjoin("read ", format_bytes(buf, sizeof(buf), value), " from disk"); + if (!rr) { + r = log_oom(); + goto finish; + } + } else if (k == CGROUP_IO_WRITE_BYTES) { + assert(!wr); + wr = strjoin("written ", format_bytes(buf, sizeof(buf), value), " to disk"); + if (!wr) { + r = log_oom(); + goto finish; + } + } + + if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES)) + log_level = raise_level(log_level, + value > MENTIONWORTHY_IO_BYTES, + value > NOTICEWORTHY_IO_BYTES); + } + + if (have_io_accounting) { + if (any_io) { + if (rr) + message_parts[n_message_parts++] = TAKE_PTR(rr); + if (wr) + message_parts[n_message_parts++] = TAKE_PTR(wr); + + } else { + char *k; + + k = strdup("no IO"); + if (!k) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = k; + } + } + + for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + char buf[FORMAT_BYTES_MAX] = ""; + uint64_t value = UINT64_MAX; + + assert(ip_fields[m]); + + (void) unit_get_ip_accounting(u, m, &value); + if (value == UINT64_MAX) + continue; + + have_ip_accounting = true; + if (value > 0) + any_traffic = true; + + /* Format IP accounting data for inclusion in the structured log message */ + if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the IP accounting data for inclusion in the human language message string, but only for the + * bytes counters (and not for the packets counters) */ + if (m == CGROUP_IP_INGRESS_BYTES) { + assert(!igress); + igress = strjoin("received ", format_bytes(buf, sizeof(buf), value), " IP traffic"); + if (!igress) { + r = log_oom(); + goto finish; + } + } else if (m == CGROUP_IP_EGRESS_BYTES) { + assert(!egress); + egress = strjoin("sent ", format_bytes(buf, sizeof(buf), value), " IP traffic"); + if (!egress) { + r = log_oom(); + goto finish; 
+ } + } + + if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) + log_level = raise_level(log_level, + value > MENTIONWORTHY_IP_BYTES, + value > NOTICEWORTHY_IP_BYTES); + } + + if (have_ip_accounting) { + if (any_traffic) { + if (igress) + message_parts[n_message_parts++] = TAKE_PTR(igress); + if (egress) + message_parts[n_message_parts++] = TAKE_PTR(egress); + + } else { + char *k; + + k = strdup("no IP traffic"); + if (!k) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = k; + } + } + + /* Is there any accounting data available at all? */ + if (n_iovec == 0) { + r = 0; + goto finish; + } + + if (n_message_parts == 0) + t = strjoina("MESSAGE=", u->id, ": Completed."); + else { + _cleanup_free_ char *joined; + + message_parts[n_message_parts] = NULL; + + joined = strv_join(message_parts, ", "); + if (!joined) { + r = log_oom(); + goto finish; + } + + joined[0] = ascii_toupper(joined[0]); + t = strjoina("MESSAGE=", u->id, ": ", joined, "."); + } + + /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them, + * and hence don't increase n_iovec for them */ + iovec[n_iovec] = IOVEC_MAKE_STRING(t); + iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR); + + t = strjoina(u->manager->unit_log_field, u->id); + iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t); + + t = strjoina(u->manager->invocation_log_field, u->invocation_id_string); + iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t); + + log_struct_iovec(log_level, iovec, n_iovec + 4); + r = 0; + +finish: + for (size_t i = 0; i < n_message_parts; i++) + free(message_parts[i]); + + for (size_t i = 0; i < n_iovec; i++) + free(iovec[i].iov_base); + + return r; + +} + +static void unit_update_on_console(Unit *u) { + bool b; + + assert(u); + + b = unit_needs_console(u); + if (u->on_console == b) + return; + + u->on_console = b; + if (b) + manager_ref_console(u->manager); + else + manager_unref_console(u->manager); +} + 
+static void unit_emit_audit_start(Unit *u) { + assert(u); + + if (u->type != UNIT_SERVICE) + return; + + /* Write audit record if we have just finished starting up */ + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_START, true); + u->in_audit = true; +} + +static void unit_emit_audit_stop(Unit *u, UnitActiveState state) { + assert(u); + + if (u->type != UNIT_SERVICE) + return; + + if (u->in_audit) { + /* Write audit record if we have just finished shutting down */ + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_STOP, state == UNIT_INACTIVE); + u->in_audit = false; + } else { + /* Hmm, if there was no start record written write it now, so that we always have a nice pair */ + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_START, state == UNIT_INACTIVE); + + if (state == UNIT_INACTIVE) + manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_STOP, true); + } +} + +static bool unit_process_job(Job *j, UnitActiveState ns, UnitNotifyFlags flags) { + bool unexpected = false; + JobResult result; + + assert(j); + + if (j->state == JOB_WAITING) + + /* So we reached a different state for this job. Let's see if we can run it now if it failed previously + * due to EAGAIN. */ + job_add_to_run_queue(j); + + /* Let's check whether the unit's new state constitutes a finished job, or maybe contradicts a running job and + * hence needs to invalidate jobs. 
*/ + + switch (j->type) { + + case JOB_START: + case JOB_VERIFY_ACTIVE: + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) + job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (j->state == JOB_RUNNING && ns != UNIT_ACTIVATING) { + unexpected = true; + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) { + if (ns == UNIT_FAILED) + result = JOB_FAILED; + else if (FLAGS_SET(flags, UNIT_NOTIFY_SKIP_CONDITION)) + result = JOB_SKIPPED; + else + result = JOB_DONE; + + job_finish_and_invalidate(j, result, true, false); + } + } + + break; + + case JOB_RELOAD: + case JOB_RELOAD_OR_START: + case JOB_TRY_RELOAD: + + if (j->state == JOB_RUNNING) { + if (ns == UNIT_ACTIVE) + job_finish_and_invalidate(j, (flags & UNIT_NOTIFY_RELOAD_FAILURE) ? JOB_FAILED : JOB_DONE, true, false); + else if (!IN_SET(ns, UNIT_ACTIVATING, UNIT_RELOADING)) { + unexpected = true; + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false); + } + } + + break; + + case JOB_STOP: + case JOB_RESTART: + case JOB_TRY_RESTART: + + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + job_finish_and_invalidate(j, JOB_DONE, true, false); + else if (j->state == JOB_RUNNING && ns != UNIT_DEACTIVATING) { + unexpected = true; + job_finish_and_invalidate(j, JOB_FAILED, true, false); + } + + break; + + default: + assert_not_reached("Job type unknown"); + } + + return unexpected; +} + +void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags) { + const char *reason; + Manager *m; + + assert(u); + assert(os < _UNIT_ACTIVE_STATE_MAX); + assert(ns < _UNIT_ACTIVE_STATE_MAX); + + /* Note that this is called for all low-level state changes, even if they might map to the same high-level + * UnitActiveState! That means that ns == os is an expected behavior here. For example: if a mount point is + * remounted this function will be called too! */ + + m = u->manager; + + /* Let's enqueue the change signal early. 
In case this unit has a job associated we want that this unit is in + * the bus queue, so that any job change signal queued will force out the unit change signal first. */ + unit_add_to_dbus_queue(u); + + /* Update systemd-oomd on the property/state change */ + if (os != ns) { + /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop + * monitoring. + * Also send an update whenever the unit goes active; this is to handle a case where an override file + * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to + * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't + * have the information on the property. Thus, indiscriminately send an update. */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns) || UNIT_IS_ACTIVE_OR_RELOADING(ns)) + (void) manager_varlink_send_managed_oom_update(u); + } + + /* Update timestamps for state changes */ + if (!MANAGER_IS_RELOADING(m)) { + dual_timestamp_get(&u->state_change_timestamp); + + if (UNIT_IS_INACTIVE_OR_FAILED(os) && !UNIT_IS_INACTIVE_OR_FAILED(ns)) + u->inactive_exit_timestamp = u->state_change_timestamp; + else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_INACTIVE_OR_FAILED(ns)) + u->inactive_enter_timestamp = u->state_change_timestamp; + + if (!UNIT_IS_ACTIVE_OR_RELOADING(os) && UNIT_IS_ACTIVE_OR_RELOADING(ns)) + u->active_enter_timestamp = u->state_change_timestamp; + else if (UNIT_IS_ACTIVE_OR_RELOADING(os) && !UNIT_IS_ACTIVE_OR_RELOADING(ns)) + u->active_exit_timestamp = u->state_change_timestamp; + } + + /* Keep track of failed units */ + (void) manager_update_failed_units(m, u, ns == UNIT_FAILED); + + /* Make sure the cgroup and state files are always removed when we become inactive */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) { + unit_prune_cgroup(u); + unit_unlink_state_files(u); + } + + unit_update_on_console(u); + + if (!MANAGER_IS_RELOADING(m)) { + bool unexpected; + + /* Let's propagate state changes 
to the job */ + if (u->job) + unexpected = unit_process_job(u->job, ns, flags); + else + unexpected = true; + + /* If this state change happened without being requested by a job, then let's retroactively start or + * stop dependencies. We skip that step when deserializing, since we don't want to create any + * additional jobs just because something is already activated. */ + + if (unexpected) { + if (UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_ACTIVE_OR_ACTIVATING(ns)) + retroactively_start_dependencies(u); + else if (UNIT_IS_ACTIVE_OR_ACTIVATING(os) && UNIT_IS_INACTIVE_OR_DEACTIVATING(ns)) + retroactively_stop_dependencies(u); + } + + /* stop unneeded units regardless if going down was expected or not */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns)) + check_unneeded_dependencies(u); + + if (ns != os && ns == UNIT_FAILED) { + log_unit_debug(u, "Unit entered failed state."); + + if (!(flags & UNIT_NOTIFY_WILL_AUTO_RESTART)) + unit_start_on_failure(u); + } + + if (UNIT_IS_ACTIVE_OR_RELOADING(ns) && !UNIT_IS_ACTIVE_OR_RELOADING(os)) { + /* This unit just finished starting up */ + + unit_emit_audit_start(u); + manager_send_unit_plymouth(m, u); + } + + if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) { + /* This unit just stopped/failed. */ + + unit_emit_audit_stop(u, ns); + unit_log_resources(u); + } + } + + manager_recheck_journal(m); + manager_recheck_dbus(m); + + unit_trigger_notify(u); + + if (!MANAGER_IS_RELOADING(m)) { + /* Maybe we finished startup and are now ready for being stopped because unneeded? */ + unit_submit_to_stop_when_unneeded_queue(u); + + /* Maybe we finished startup, but something we needed has vanished? Let's die then. (This happens when + * something BindsTo= to a Type=oneshot unit, as these units go directly from starting to inactive, + * without ever entering started.) 
*/ + unit_check_binds_to(u); + + if (os != UNIT_FAILED && ns == UNIT_FAILED) { + reason = strjoina("unit ", u->id, " failed"); + emergency_action(m, u->failure_action, 0, u->reboot_arg, unit_failure_action_exit_status(u), reason); + } else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && ns == UNIT_INACTIVE) { + reason = strjoina("unit ", u->id, " succeeded"); + emergency_action(m, u->success_action, 0, u->reboot_arg, unit_success_action_exit_status(u), reason); + } + } + + unit_add_to_gc_queue(u); +} + +int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) { + int r; + + assert(u); + assert(pid_is_valid(pid)); + + /* Watch a specific PID */ + + /* Caller might be sure that this PID belongs to this unit only. Let's take this + * opportunity to remove any stalled references to this PID as they can be created + * easily (when watching a process which is not our direct child). */ + if (exclusive) + manager_unwatch_pid(u->manager, pid); + + r = set_ensure_allocated(&u->pids, NULL); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&u->manager->watch_pids, NULL); + if (r < 0) + return r; + + /* First try, let's add the unit keyed by "pid". */ + r = hashmap_put(u->manager->watch_pids, PID_TO_PTR(pid), u); + if (r == -EEXIST) { + Unit **array; + bool found = false; + size_t n = 0; + + /* OK, the "pid" key is already assigned to a different unit. Let's see if the "-pid" key (which points + * to an array of Units rather than just a Unit), lists us already. */ + + array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid)); + if (array) + for (; array[n]; n++) + if (array[n] == u) + found = true; + + if (found) /* Found it already? 
if so, do nothing */ + r = 0; + else { + Unit **new_array; + + /* Allocate a new array */ + new_array = new(Unit*, n + 2); + if (!new_array) + return -ENOMEM; + + memcpy_safe(new_array, array, sizeof(Unit*) * n); + new_array[n] = u; + new_array[n+1] = NULL; + + /* Add or replace the old array */ + r = hashmap_replace(u->manager->watch_pids, PID_TO_PTR(-pid), new_array); + if (r < 0) { + free(new_array); + return r; + } + + free(array); + } + } else if (r < 0) + return r; + + r = set_put(u->pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + + return 0; +} + +void unit_unwatch_pid(Unit *u, pid_t pid) { + Unit **array; + + assert(u); + assert(pid_is_valid(pid)); + + /* First let's drop the unit in case it's keyed as "pid". */ + (void) hashmap_remove_value(u->manager->watch_pids, PID_TO_PTR(pid), u); + + /* Then, let's also drop the unit, in case it's in the array keyed by -pid */ + array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid)); + if (array) { + /* Let's iterate through the array, dropping our own entry */ + + size_t m = 0; + for (size_t n = 0; array[n]; n++) + if (array[n] != u) + array[m++] = array[n]; + array[m] = NULL; + + if (m == 0) { + /* The array is now empty, remove the entire entry */ + assert_se(hashmap_remove(u->manager->watch_pids, PID_TO_PTR(-pid)) == array); + free(array); + } + } + + (void) set_remove(u->pids, PID_TO_PTR(pid)); +} + +void unit_unwatch_all_pids(Unit *u) { + assert(u); + + while (!set_isempty(u->pids)) + unit_unwatch_pid(u, PTR_TO_PID(set_first(u->pids))); + + u->pids = set_free(u->pids); +} + +static void unit_tidy_watch_pids(Unit *u) { + pid_t except1, except2; + void *e; + + assert(u); + + /* Cleans dead PIDs from our list */ + + except1 = unit_main_pid(u); + except2 = unit_control_pid(u); + + SET_FOREACH(e, u->pids) { + pid_t pid = PTR_TO_PID(e); + + if (pid == except1 || pid == except2) + continue; + + if (!pid_is_unwaited(pid)) + unit_unwatch_pid(u, pid); + } +} + +static int on_rewatch_pids_event(sd_event_source 
*s, void *userdata) { + Unit *u = userdata; + + assert(s); + assert(u); + + unit_tidy_watch_pids(u); + unit_watch_all_pids(u); + + /* If the PID set is empty now, then let's finish this off. */ + unit_synthesize_cgroup_empty_event(u); + + return 0; +} + +int unit_enqueue_rewatch_pids(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new + * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it + * involves issuing kill(pid, 0) on all processes we watch. */ + + if (!u->rewatch_pids_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m"); + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: %m"); + + (void) sd_event_source_set_description(s, "tidy-watch-pids"); + + u->rewatch_pids_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m"); + + return 0; +} + +void unit_dequeue_rewatch_pids(Unit *u) { + int r; + assert(u); + + if (!u->rewatch_pids_event_source) + return; + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF); + if (r < 0) + log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m"); + + u->rewatch_pids_event_source = sd_event_source_unref(u->rewatch_pids_event_source); +} + 
+bool unit_job_is_applicable(Unit *u, JobType j) { + assert(u); + assert(j >= 0 && j < _JOB_TYPE_MAX); + + switch (j) { + + case JOB_VERIFY_ACTIVE: + case JOB_START: + case JOB_NOP: + /* Note that we don't check unit_can_start() here. That's because .device units and suchlike are not + * startable by us but may appear due to external events, and it thus makes sense to permit enqueuing + * jobs for it. */ + return true; + + case JOB_STOP: + /* Similar as above. However, perpetual units can never be stopped (neither explicitly nor due to + * external events), hence it makes no sense to permit enqueuing such a request either. */ + return !u->perpetual; + + case JOB_RESTART: + case JOB_TRY_RESTART: + return unit_can_stop(u) && unit_can_start(u); + + case JOB_RELOAD: + case JOB_TRY_RELOAD: + return unit_can_reload(u); + + case JOB_RELOAD_OR_START: + return unit_can_reload(u) && unit_can_start(u); + + default: + assert_not_reached("Invalid job type"); + } +} + +static void maybe_warn_about_dependency(Unit *u, const char *other, UnitDependency dependency) { + assert(u); + + /* Only warn about some unit types */ + if (!IN_SET(dependency, UNIT_CONFLICTS, UNIT_CONFLICTED_BY, UNIT_BEFORE, UNIT_AFTER, UNIT_ON_FAILURE, UNIT_TRIGGERS, UNIT_TRIGGERED_BY)) + return; + + if (streq_ptr(u->id, other)) + log_unit_warning(u, "Dependency %s=%s dropped", unit_dependency_to_string(dependency), u->id); + else + log_unit_warning(u, "Dependency %s=%s dropped, merged into %s", unit_dependency_to_string(dependency), strna(other), u->id); +} + +static int unit_add_dependency_hashmap( + Hashmap **h, + Unit *other, + UnitDependencyMask origin_mask, + UnitDependencyMask destination_mask) { + + UnitDependencyInfo info; + int r; + + assert(h); + assert(other); + assert(origin_mask < _UNIT_DEPENDENCY_MASK_FULL); + assert(destination_mask < _UNIT_DEPENDENCY_MASK_FULL); + assert(origin_mask > 0 || destination_mask > 0); + + r = hashmap_ensure_allocated(h, NULL); + if (r < 0) + return r; + + 
assert_cc(sizeof(void*) == sizeof(info)); + + info.data = hashmap_get(*h, other); + if (info.data) { + /* Entry already exists. Add in our mask. */ + + if (FLAGS_SET(origin_mask, info.origin_mask) && + FLAGS_SET(destination_mask, info.destination_mask)) + return 0; /* NOP */ + + info.origin_mask |= origin_mask; + info.destination_mask |= destination_mask; + + r = hashmap_update(*h, other, info.data); + } else { + info = (UnitDependencyInfo) { + .origin_mask = origin_mask, + .destination_mask = destination_mask, + }; + + r = hashmap_put(*h, other, info.data); + } + if (r < 0) + return r; + + return 1; +} + +int unit_add_dependency( + Unit *u, + UnitDependency d, + Unit *other, + bool add_reference, + UnitDependencyMask mask) { + + static const UnitDependency inverse_table[_UNIT_DEPENDENCY_MAX] = { + [UNIT_REQUIRES] = UNIT_REQUIRED_BY, + [UNIT_WANTS] = UNIT_WANTED_BY, + [UNIT_REQUISITE] = UNIT_REQUISITE_OF, + [UNIT_BINDS_TO] = UNIT_BOUND_BY, + [UNIT_PART_OF] = UNIT_CONSISTS_OF, + [UNIT_REQUIRED_BY] = UNIT_REQUIRES, + [UNIT_REQUISITE_OF] = UNIT_REQUISITE, + [UNIT_WANTED_BY] = UNIT_WANTS, + [UNIT_BOUND_BY] = UNIT_BINDS_TO, + [UNIT_CONSISTS_OF] = UNIT_PART_OF, + [UNIT_CONFLICTS] = UNIT_CONFLICTED_BY, + [UNIT_CONFLICTED_BY] = UNIT_CONFLICTS, + [UNIT_BEFORE] = UNIT_AFTER, + [UNIT_AFTER] = UNIT_BEFORE, + [UNIT_ON_FAILURE] = _UNIT_DEPENDENCY_INVALID, + [UNIT_REFERENCES] = UNIT_REFERENCED_BY, + [UNIT_REFERENCED_BY] = UNIT_REFERENCES, + [UNIT_TRIGGERS] = UNIT_TRIGGERED_BY, + [UNIT_TRIGGERED_BY] = UNIT_TRIGGERS, + [UNIT_PROPAGATES_RELOAD_TO] = UNIT_RELOAD_PROPAGATED_FROM, + [UNIT_RELOAD_PROPAGATED_FROM] = UNIT_PROPAGATES_RELOAD_TO, + [UNIT_JOINS_NAMESPACE_OF] = UNIT_JOINS_NAMESPACE_OF, + }; + Unit *original_u = u, *original_other = other; + int r; + /* Helper to know whether sending a notification is necessary or not: + * if the dependency is already there, no need to notify! 
*/ + bool noop = true; + + assert(u); + assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX); + assert(other); + + u = unit_follow_merge(u); + other = unit_follow_merge(other); + + /* We won't allow dependencies on ourselves. We will not + * consider them an error however. */ + if (u == other) { + maybe_warn_about_dependency(original_u, original_other->id, d); + return 0; + } + + /* Note that ordering a device unit after a unit is permitted since it + * allows to start its job running timeout at a specific time. */ + if (d == UNIT_BEFORE && other->type == UNIT_DEVICE) { + log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id); + return 0; + } + + if (d == UNIT_ON_FAILURE && !UNIT_VTABLE(u)->can_fail) { + log_unit_warning(u, "Requested dependency OnFailure=%s ignored (%s units cannot fail).", other->id, unit_type_to_string(u->type)); + return 0; + } + + if (d == UNIT_TRIGGERS && !UNIT_VTABLE(u)->can_trigger) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency Triggers=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(u->type)); + if (d == UNIT_TRIGGERED_BY && !UNIT_VTABLE(other)->can_trigger) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Requested dependency TriggeredBy=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(other->type)); + + r = unit_add_dependency_hashmap(u->dependencies + d, other, mask, 0); + if (r < 0) + return r; + else if (r > 0) + noop = false; + + if (inverse_table[d] != _UNIT_DEPENDENCY_INVALID && inverse_table[d] != d) { + r = unit_add_dependency_hashmap(other->dependencies + inverse_table[d], u, 0, mask); + if (r < 0) + return r; + else if (r > 0) + noop = false; + } + + if (add_reference) { + r = unit_add_dependency_hashmap(u->dependencies + UNIT_REFERENCES, other, mask, 0); + if (r < 0) + return r; + else if (r > 0) + noop = false; + + r = unit_add_dependency_hashmap(other->dependencies + 
UNIT_REFERENCED_BY, u, 0, mask); + if (r < 0) + return r; + else if (r > 0) + noop = false; + } + + if (!noop) + unit_add_to_dbus_queue(u); + return 0; +} + +int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask) { + int r; + + assert(u); + + r = unit_add_dependency(u, d, other, add_reference, mask); + if (r < 0) + return r; + + return unit_add_dependency(u, e, other, add_reference, mask); +} + +static int resolve_template(Unit *u, const char *name, char **buf, const char **ret) { + int r; + + assert(u); + assert(name); + assert(buf); + assert(ret); + + if (!unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + *buf = NULL; + *ret = name; + return 0; + } + + if (u->instance) + r = unit_name_replace_instance(name, u->instance, buf); + else { + _cleanup_free_ char *i = NULL; + + r = unit_name_to_prefix(u->id, &i); + if (r < 0) + return r; + + r = unit_name_replace_instance(name, i, buf); + } + if (r < 0) + return r; + + *ret = *buf; + return 0; +} + +int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask) { + _cleanup_free_ char *buf = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + r = resolve_template(u, name, &buf, &name); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, name, NULL, NULL, &other); + if (r < 0) + return r; + + return unit_add_dependency(u, d, other, add_reference, mask); +} + +int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask) { + _cleanup_free_ char *buf = NULL; + Unit *other; + int r; + + assert(u); + assert(name); + + r = resolve_template(u, name, &buf, &name); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, name, NULL, NULL, &other); + if (r < 0) + return r; + + return unit_add_two_dependencies(u, d, e, other, add_reference, mask); +} + +int set_unit_path(const char 
*p) { + /* This is mostly for debug purposes */ + if (setenv("SYSTEMD_UNIT_PATH", p, 1) < 0) + return -errno; + + return 0; +} + +char *unit_dbus_path(Unit *u) { + assert(u); + + if (!u->id) + return NULL; + + return unit_dbus_path_from_name(u->id); +} + +char *unit_dbus_path_invocation_id(Unit *u) { + assert(u); + + if (sd_id128_is_null(u->invocation_id)) + return NULL; + + return unit_dbus_path_from_name(u->invocation_id_string); +} + +static int unit_set_invocation_id(Unit *u, sd_id128_t id) { + int r; + + assert(u); + + /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */ + + if (sd_id128_equal(u->invocation_id, id)) + return 0; + + if (!sd_id128_is_null(u->invocation_id)) + (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (sd_id128_is_null(id)) { + r = 0; + goto reset; + } + + r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops); + if (r < 0) + goto reset; + + u->invocation_id = id; + sd_id128_to_string(id, u->invocation_id_string); + + r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u); + if (r < 0) + goto reset; + + return 0; + +reset: + u->invocation_id = SD_ID128_NULL; + u->invocation_id_string[0] = 0; + return r; +} + +int unit_set_slice(Unit *u, Unit *slice) { + assert(u); + assert(slice); + + /* Sets the unit slice if it has not been set before. Is extra + * careful, to only allow this for units that actually have a + * cgroup context. Also, we don't allow to set this for slices + * (since the parent slice is derived from the name). Make + * sure the unit we set is actually a slice. 
*/ + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EOPNOTSUPP; + + if (u->type == UNIT_SLICE) + return -EINVAL; + + if (unit_active_state(u) != UNIT_INACTIVE) + return -EBUSY; + + if (slice->type != UNIT_SLICE) + return -EINVAL; + + if (unit_has_name(u, SPECIAL_INIT_SCOPE) && + !unit_has_name(slice, SPECIAL_ROOT_SLICE)) + return -EPERM; + + if (UNIT_DEREF(u->slice) == slice) + return 0; + + /* Disallow slice changes if @u is already bound to cgroups */ + if (UNIT_ISSET(u->slice) && u->cgroup_realized) + return -EBUSY; + + unit_ref_set(&u->slice, u, slice); + return 1; +} + +int unit_set_default_slice(Unit *u) { + const char *slice_name; + Unit *slice; + int r; + + assert(u); + + if (UNIT_ISSET(u->slice)) + return 0; + + if (u->instance) { + _cleanup_free_ char *prefix = NULL, *escaped = NULL; + + /* Implicitly place all instantiated units in their + * own per-template slice */ + + r = unit_name_to_prefix(u->id, &prefix); + if (r < 0) + return r; + + /* The prefix is already escaped, but it might include + * "-" which has a special meaning for slice units, + * hence escape it here extra. */ + escaped = unit_name_escape(prefix); + if (!escaped) + return -ENOMEM; + + if (MANAGER_IS_SYSTEM(u->manager)) + slice_name = strjoina("system-", escaped, ".slice"); + else + slice_name = strjoina("app-", escaped, ".slice"); + + } else if (unit_is_extrinsic(u)) + /* Keep all extrinsic units (e.g. perpetual units and swap and mount units in user mode) in + * the root slice. They don't really belong in one of the subslices. 
*/ + slice_name = SPECIAL_ROOT_SLICE; + + else if (MANAGER_IS_SYSTEM(u->manager)) + slice_name = SPECIAL_SYSTEM_SLICE; + else + slice_name = SPECIAL_APP_SLICE; + + r = manager_load_unit(u->manager, slice_name, NULL, NULL, &slice); + if (r < 0) + return r; + + return unit_set_slice(u, slice); +} + +const char *unit_slice_name(Unit *u) { + assert(u); + + if (!UNIT_ISSET(u->slice)) + return NULL; + + return UNIT_DEREF(u->slice)->id; +} + +int unit_load_related_unit(Unit *u, const char *type, Unit **_found) { + _cleanup_free_ char *t = NULL; + int r; + + assert(u); + assert(type); + assert(_found); + + r = unit_name_change_suffix(u->id, type, &t); + if (r < 0) + return r; + if (unit_has_name(u, t)) + return -EINVAL; + + r = manager_load_unit(u->manager, t, NULL, NULL, _found); + assert(r < 0 || *_found != u); + return r; +} + +static int signal_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *new_owner; + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + r = sd_bus_message_read(message, "sss", NULL, NULL, &new_owner); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (UNIT_VTABLE(u)->bus_name_owner_change) + UNIT_VTABLE(u)->bus_name_owner_change(u, empty_to_null(new_owner)); + + return 0; +} + +static int get_name_owner_handler(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const sd_bus_error *e; + const char *new_owner; + Unit *u = userdata; + int r; + + assert(message); + assert(u); + + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); + + e = sd_bus_message_get_error(message); + if (e) { + if (!sd_bus_error_has_name(e, "org.freedesktop.DBus.Error.NameHasNoOwner")) + log_unit_error(u, "Unexpected error response from GetNameOwner(): %s", e->message); + + new_owner = NULL; + } else { + r = sd_bus_message_read(message, "s", &new_owner); + if (r < 0) + return bus_log_parse_error(r); + + assert(!isempty(new_owner)); + } + + if 
(UNIT_VTABLE(u)->bus_name_owner_change) + UNIT_VTABLE(u)->bus_name_owner_change(u, new_owner); + + return 0; +} + +int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name) { + const char *match; + int r; + + assert(u); + assert(bus); + assert(name); + + if (u->match_bus_slot || u->get_name_owner_slot) + return -EBUSY; + + match = strjoina("type='signal'," + "sender='org.freedesktop.DBus'," + "path='/org/freedesktop/DBus'," + "interface='org.freedesktop.DBus'," + "member='NameOwnerChanged'," + "arg0='", name, "'"); + + r = sd_bus_add_match_async(bus, &u->match_bus_slot, match, signal_name_owner_changed, NULL, u); + if (r < 0) + return r; + + r = sd_bus_call_method_async( + bus, + &u->get_name_owner_slot, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetNameOwner", + get_name_owner_handler, + u, + "s", name); + if (r < 0) { + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + return r; + } + + log_unit_debug(u, "Watching D-Bus name '%s'.", name); + return 0; +} + +int unit_watch_bus_name(Unit *u, const char *name) { + int r; + + assert(u); + assert(name); + + /* Watch a specific name on the bus. We only support one unit + * watching each name for now. */ + + if (u->manager->api_bus) { + /* If the bus is already available, install the match directly. + * Otherwise, just put the name in the list. bus_setup_api() will take care later. 
*/ + r = unit_install_bus_match(u, u->manager->api_bus, name); + if (r < 0) + return log_warning_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name); + } + + r = hashmap_put(u->manager->watch_bus, name, u); + if (r < 0) { + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); + return log_warning_errno(r, "Failed to put bus name to hashmap: %m"); + } + + return 0; +} + +void unit_unwatch_bus_name(Unit *u, const char *name) { + assert(u); + assert(name); + + (void) hashmap_remove_value(u->manager->watch_bus, name, u); + u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot); + u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot); +} + +bool unit_can_serialize(Unit *u) { + assert(u); + + return UNIT_VTABLE(u)->serialize && UNIT_VTABLE(u)->deserialize_item; +} + +static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) { + _cleanup_free_ char *s = NULL; + int r; + + assert(f); + assert(key); + + if (mask == 0) + return 0; + + r = cg_mask_to_string(mask, &s); + if (r < 0) + return log_error_errno(r, "Failed to format cgroup mask: %m"); + + return serialize_item(f, key, s); +} + +static const char *const ip_accounting_metric_field[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes", + [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets", + [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes", + [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets", +}; + +static const char *const io_accounting_metric_field_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base", + [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base", + [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base", + [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base", +}; + +static const char *const 
io_accounting_metric_field_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last", + [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last", + [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last", + [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last", +}; + +int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { + int r; + + assert(u); + assert(f); + assert(fds); + + if (unit_can_serialize(u)) { + r = UNIT_VTABLE(u)->serialize(u, f, fds); + if (r < 0) + return r; + } + + (void) serialize_dual_timestamp(f, "state-change-timestamp", &u->state_change_timestamp); + + (void) serialize_dual_timestamp(f, "inactive-exit-timestamp", &u->inactive_exit_timestamp); + (void) serialize_dual_timestamp(f, "active-enter-timestamp", &u->active_enter_timestamp); + (void) serialize_dual_timestamp(f, "active-exit-timestamp", &u->active_exit_timestamp); + (void) serialize_dual_timestamp(f, "inactive-enter-timestamp", &u->inactive_enter_timestamp); + + (void) serialize_dual_timestamp(f, "condition-timestamp", &u->condition_timestamp); + (void) serialize_dual_timestamp(f, "assert-timestamp", &u->assert_timestamp); + + if (dual_timestamp_is_set(&u->condition_timestamp)) + (void) serialize_bool(f, "condition-result", u->condition_result); + + if (dual_timestamp_is_set(&u->assert_timestamp)) + (void) serialize_bool(f, "assert-result", u->assert_result); + + (void) serialize_bool(f, "transient", u->transient); + (void) serialize_bool(f, "in-audit", u->in_audit); + + (void) serialize_bool(f, "exported-invocation-id", u->exported_invocation_id); + (void) serialize_bool(f, "exported-log-level-max", u->exported_log_level_max); + (void) serialize_bool(f, "exported-log-extra-fields", u->exported_log_extra_fields); + (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval); + (void) serialize_bool(f, "exported-log-rate-limit-burst", 
u->exported_log_ratelimit_burst); + + (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base); + if (u->cpu_usage_last != NSEC_INFINITY) + (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last); + + if (u->managed_oom_kill_last > 0) + (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last); + + if (u->oom_kill_last > 0) + (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last); + + for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) { + (void) serialize_item_format(f, io_accounting_metric_field_base[im], "%" PRIu64, u->io_accounting_base[im]); + + if (u->io_accounting_last[im] != UINT64_MAX) + (void) serialize_item_format(f, io_accounting_metric_field_last[im], "%" PRIu64, u->io_accounting_last[im]); + } + + if (u->cgroup_path) + (void) serialize_item(f, "cgroup", u->cgroup_path); + + (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized); + (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask); + (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask); + (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask); + + if (uid_is_valid(u->ref_uid)) + (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid); + if (gid_is_valid(u->ref_gid)) + (void) serialize_item_format(f, "ref-gid", GID_FMT, u->ref_gid); + + if (!sd_id128_is_null(u->invocation_id)) + (void) serialize_item_format(f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)); + + (void) serialize_item_format(f, "freezer-state", "%s", freezer_state_to_string(unit_freezer_state(u))); + + bus_track_serialize(u->bus_track, f, "ref"); + + for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + uint64_t v; + + r = unit_get_ip_accounting(u, m, &v); + if (r >= 0) + (void) serialize_item_format(f, ip_accounting_metric_field[m], "%" 
PRIu64, v); + } + + if (serialize_jobs) { + if (u->job) { + fputs("job\n", f); + job_serialize(u->job, f); + } + + if (u->nop_job) { + fputs("job\n", f); + job_serialize(u->nop_job, f); + } + } + + /* End marker */ + fputc('\n', f); + return 0; +} + +static int unit_deserialize_job(Unit *u, FILE *f) { + _cleanup_(job_freep) Job *j = NULL; + int r; + + assert(u); + assert(f); + + j = job_new_raw(u); + if (!j) + return log_oom(); + + r = job_deserialize(j, f); + if (r < 0) + return r; + + r = job_install_deserialized(j); + if (r < 0) + return r; + + TAKE_PTR(j); + return 0; +} + +int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + assert(f); + assert(fds); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l, *v; + ssize_t m; + size_t k; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) /* eof */ + break; + + l = strstrip(line); + if (isempty(l)) /* End marker */ + break; + + k = strcspn(l, "="); + + if (l[k] == '=') { + l[k] = 0; + v = l+k+1; + } else + v = l+k; + + if (streq(l, "job")) { + if (v[0] == '\0') { + /* New-style serialized job */ + r = unit_deserialize_job(u, f); + if (r < 0) + return r; + } else /* Legacy for pre-44 */ + log_unit_warning(u, "Update from too old systemd versions are unsupported, cannot deserialize job: %s", v); + continue; + } else if (streq(l, "state-change-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->state_change_timestamp); + continue; + } else if (streq(l, "inactive-exit-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->inactive_exit_timestamp); + continue; + } else if (streq(l, "active-enter-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->active_enter_timestamp); + continue; + } else if (streq(l, "active-exit-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->active_exit_timestamp); + continue; + } else if (streq(l, "inactive-enter-timestamp")) { + (void) 
deserialize_dual_timestamp(v, &u->inactive_enter_timestamp); + continue; + } else if (streq(l, "condition-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->condition_timestamp); + continue; + } else if (streq(l, "assert-timestamp")) { + (void) deserialize_dual_timestamp(v, &u->assert_timestamp); + continue; + } else if (streq(l, "condition-result")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse condition result value %s, ignoring.", v); + else + u->condition_result = r; + + continue; + + } else if (streq(l, "assert-result")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse assert result value %s, ignoring.", v); + else + u->assert_result = r; + + continue; + + } else if (streq(l, "transient")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse transient bool %s, ignoring.", v); + else + u->transient = r; + + continue; + + } else if (streq(l, "in-audit")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse in-audit bool %s, ignoring.", v); + else + u->in_audit = r; + + continue; + + } else if (streq(l, "exported-invocation-id")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported invocation ID bool %s, ignoring.", v); + else + u->exported_invocation_id = r; + + continue; + + } else if (streq(l, "exported-log-level-max")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log level max bool %s, ignoring.", v); + else + u->exported_log_level_max = r; + + continue; + + } else if (streq(l, "exported-log-extra-fields")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log extra fields bool %s, ignoring.", v); + else + u->exported_log_extra_fields = r; + + continue; + + } else if (streq(l, "exported-log-rate-limit-interval")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log rate limit interval %s, 
ignoring.", v); + else + u->exported_log_ratelimit_interval = r; + + continue; + + } else if (streq(l, "exported-log-rate-limit-burst")) { + + r = parse_boolean(v); + if (r < 0) + log_unit_debug(u, "Failed to parse exported log rate limit burst %s, ignoring.", v); + else + u->exported_log_ratelimit_burst = r; + + continue; + + } else if (STR_IN_SET(l, "cpu-usage-base", "cpuacct-usage-base")) { + + r = safe_atou64(v, &u->cpu_usage_base); + if (r < 0) + log_unit_debug(u, "Failed to parse CPU usage base %s, ignoring.", v); + + continue; + + } else if (streq(l, "cpu-usage-last")) { + + r = safe_atou64(v, &u->cpu_usage_last); + if (r < 0) + log_unit_debug(u, "Failed to read CPU usage last %s, ignoring.", v); + + continue; + + } else if (streq(l, "managed-oom-kill-last")) { + + r = safe_atou64(v, &u->managed_oom_kill_last); + if (r < 0) + log_unit_debug(u, "Failed to read managed OOM kill last %s, ignoring.", v); + + continue; + + } else if (streq(l, "oom-kill-last")) { + + r = safe_atou64(v, &u->oom_kill_last); + if (r < 0) + log_unit_debug(u, "Failed to read OOM kill last %s, ignoring.", v); + + continue; + + } else if (streq(l, "cgroup")) { + + r = unit_set_cgroup_path(u, v); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v); + + (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); + + continue; + } else if (streq(l, "cgroup-realized")) { + int b; + + b = parse_boolean(v); + if (b < 0) + log_unit_debug(u, "Failed to parse cgroup-realized bool %s, ignoring.", v); + else + u->cgroup_realized = b; + + continue; + + } else if (streq(l, "cgroup-realized-mask")) { + + r = cg_mask_from_string(v, &u->cgroup_realized_mask); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup-realized-mask %s, ignoring.", v); + continue; + + } else if (streq(l, "cgroup-enabled-mask")) { + + r = cg_mask_from_string(v, &u->cgroup_enabled_mask); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup-enabled-mask %s, ignoring.", v); + 
continue; + + } else if (streq(l, "cgroup-invalidated-mask")) { + + r = cg_mask_from_string(v, &u->cgroup_invalidated_mask); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup-invalidated-mask %s, ignoring.", v); + continue; + + } else if (streq(l, "ref-uid")) { + uid_t uid; + + r = parse_uid(v, &uid); + if (r < 0) + log_unit_debug(u, "Failed to parse referenced UID %s, ignoring.", v); + else + unit_ref_uid_gid(u, uid, GID_INVALID); + + continue; + + } else if (streq(l, "ref-gid")) { + gid_t gid; + + r = parse_gid(v, &gid); + if (r < 0) + log_unit_debug(u, "Failed to parse referenced GID %s, ignoring.", v); + else + unit_ref_uid_gid(u, UID_INVALID, gid); + + continue; + + } else if (streq(l, "ref")) { + + r = strv_extend(&u->deserialized_refs, v); + if (r < 0) + return log_oom(); + + continue; + } else if (streq(l, "invocation-id")) { + sd_id128_t id; + + r = sd_id128_from_string(v, &id); + if (r < 0) + log_unit_debug(u, "Failed to parse invocation id %s, ignoring.", v); + else { + r = unit_set_invocation_id(u, id); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m"); + } + + continue; + } else if (streq(l, "freezer-state")) { + FreezerState s; + + s = freezer_state_from_string(v); + if (s < 0) + log_unit_debug(u, "Failed to deserialize freezer-state '%s', ignoring.", v); + else + u->freezer_state = s; + + continue; + } + + /* Check if this is an IP accounting metric serialization field */ + m = string_table_lookup(ip_accounting_metric_field, ELEMENTSOF(ip_accounting_metric_field), l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v); + else + u->ip_accounting_extra[m] = c; + continue; + } + + m = string_table_lookup(io_accounting_metric_field_base, ELEMENTSOF(io_accounting_metric_field_base), l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IO accounting base value 
%s, ignoring.", v); + else + u->io_accounting_base[m] = c; + continue; + } + + m = string_table_lookup(io_accounting_metric_field_last, ELEMENTSOF(io_accounting_metric_field_last), l); + if (m >= 0) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v); + else + u->io_accounting_last[m] = c; + continue; + } + + if (unit_can_serialize(u)) { + r = exec_runtime_deserialize_compat(u, l, v, fds); + if (r < 0) { + log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l); + continue; + } + + /* Returns positive if key was handled by the call */ + if (r > 0) + continue; + + r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds); + if (r < 0) + log_unit_warning(u, "Failed to deserialize unit parameter '%s', ignoring.", l); + } + } + + /* Versions before 228 did not carry a state change timestamp. In this case, take the current time. This is + * useful, so that timeouts based on this timestamp don't trigger too early, and is in-line with the logic from + * before 228 where the base for timeouts was not persistent across reboots. */ + + if (!dual_timestamp_is_set(&u->state_change_timestamp)) + dual_timestamp_get(&u->state_change_timestamp); + + /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings applied + * after we are done. For that we invalidate anything already realized, so that we can realize it again. */ + unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); + unit_invalidate_cgroup_bpf(u); + + return 0; +} + +int unit_deserialize_skip(FILE *f) { + int r; + assert(f); + + /* Skip serialized data for this unit. We don't know what it is. 
*/ + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) + return 0; + + l = strstrip(line); + + /* End marker */ + if (isempty(l)) + return 1; + } +} + +int unit_add_node_dependency(Unit *u, const char *what, UnitDependency dep, UnitDependencyMask mask) { + _cleanup_free_ char *e = NULL; + Unit *device; + int r; + + assert(u); + + /* Adds in links to the device node that this unit is based on */ + if (isempty(what)) + return 0; + + if (!is_device_path(what)) + return 0; + + /* When device units aren't supported (such as in a container), don't create dependencies on them. */ + if (!unit_type_supported(UNIT_DEVICE)) + return 0; + + r = unit_name_from_path(what, ".device", &e); + if (r < 0) + return r; + + r = manager_load_unit(u->manager, e, NULL, NULL, &device); + if (r < 0) + return r; + + if (dep == UNIT_REQUIRES && device_shall_be_bound_by(device, u)) + dep = UNIT_BINDS_TO; + + return unit_add_two_dependencies(u, UNIT_AFTER, + MANAGER_IS_SYSTEM(u->manager) ? dep : UNIT_WANTS, + device, true, mask); +} + +int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask) { + _cleanup_free_ char *escaped = NULL, *target = NULL; + int r; + + assert(u); + + if (isempty(what)) + return 0; + + if (!path_startswith(what, "/dev/")) + return 0; + + /* If we don't support devices, then also don't bother with blockdev@.target */ + if (!unit_type_supported(UNIT_DEVICE)) + return 0; + + r = unit_name_path_escape(what, &escaped); + if (r < 0) + return r; + + r = unit_name_build("blockdev", escaped, ".target", &target); + if (r < 0) + return r; + + return unit_add_dependency_by_name(u, UNIT_AFTER, target, true, mask); +} + +int unit_coldplug(Unit *u) { + int r = 0, q; + char **i; + Job *uj; + + assert(u); + + /* Make sure we don't enter a loop, when coldplugging recursively. 
*/ + if (u->coldplugged) + return 0; + + u->coldplugged = true; + + STRV_FOREACH(i, u->deserialized_refs) { + q = bus_unit_track_add_name(u, *i); + if (q < 0 && r >= 0) + r = q; + } + u->deserialized_refs = strv_free(u->deserialized_refs); + + if (UNIT_VTABLE(u)->coldplug) { + q = UNIT_VTABLE(u)->coldplug(u); + if (q < 0 && r >= 0) + r = q; + } + + uj = u->job ?: u->nop_job; + if (uj) { + q = job_coldplug(uj); + if (q < 0 && r >= 0) + r = q; + } + + return r; +} + +void unit_catchup(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->catchup) + UNIT_VTABLE(u)->catchup(u); +} + +static bool fragment_mtime_newer(const char *path, usec_t mtime, bool path_masked) { + struct stat st; + + if (!path) + return false; + + /* If the source is some virtual kernel file system, then we assume we watch it anyway, and hence pretend we + * are never out-of-date. */ + if (PATH_STARTSWITH_SET(path, "/proc", "/sys")) + return false; + + if (stat(path, &st) < 0) + /* What, cannot access this anymore? */ + return true; + + if (path_masked) + /* For masked files check if they are still so */ + return !null_or_empty(&st); + else + /* For non-empty files check the mtime */ + return timespec_load(&st.st_mtim) > mtime; + + return false; +} + +bool unit_need_daemon_reload(Unit *u) { + _cleanup_strv_free_ char **t = NULL; + char **path; + + assert(u); + + /* For unit files, we allow masking… */ + if (fragment_mtime_newer(u->fragment_path, u->fragment_mtime, + u->load_state == UNIT_MASKED)) + return true; + + /* Source paths should not be masked… */ + if (fragment_mtime_newer(u->source_path, u->source_mtime, false)) + return true; + + if (u->load_state == UNIT_LOADED) + (void) unit_find_dropin_paths(u, &t); + if (!strv_equal(u->dropin_paths, t)) + return true; + + /* … any drop-ins that are masked are simply omitted from the list. 
*/ + STRV_FOREACH(path, u->dropin_paths) + if (fragment_mtime_newer(*path, u->dropin_mtime, false)) + return true; + + return false; +} + +void unit_reset_failed(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->reset_failed) + UNIT_VTABLE(u)->reset_failed(u); + + ratelimit_reset(&u->start_ratelimit); + u->start_limit_hit = false; +} + +Unit *unit_following(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->following) + return UNIT_VTABLE(u)->following(u); + + return NULL; +} + +bool unit_stop_pending(Unit *u) { + assert(u); + + /* This call does check the current state of the unit. It's + * hence useful to be called from state change calls of the + * unit itself, where the state isn't updated yet. This is + * different from unit_inactive_or_pending() which checks both + * the current state and for a queued job. */ + + return unit_has_job_type(u, JOB_STOP); +} + +bool unit_inactive_or_pending(Unit *u) { + assert(u); + + /* Returns true if the unit is inactive or going down */ + + if (UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u))) + return true; + + if (unit_stop_pending(u)) + return true; + + return false; +} + +bool unit_active_or_pending(Unit *u) { + assert(u); + + /* Returns true if the unit is active or going up */ + + if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u))) + return true; + + if (u->job && + IN_SET(u->job->type, JOB_START, JOB_RELOAD_OR_START, JOB_RESTART)) + return true; + + return false; +} + +bool unit_will_restart_default(Unit *u) { + assert(u); + + return unit_has_job_type(u, JOB_START); +} + +bool unit_will_restart(Unit *u) { + assert(u); + + if (!UNIT_VTABLE(u)->will_restart) + return false; + + return UNIT_VTABLE(u)->will_restart(u); +} + +int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error) { + assert(u); + assert(w >= 0 && w < _KILL_WHO_MAX); + assert(SIGNAL_VALID(signo)); + + if (!UNIT_VTABLE(u)->kill) + return -EOPNOTSUPP; + + return UNIT_VTABLE(u)->kill(u, w, signo, error); +} + +static Set *unit_pid_set(pid_t 
/* Sends signal 'signo' to the processes of unit 'u' selected by 'who', on explicit client request.
 *
 * main_pid / control_pid: > 0 is a PID to signal; 0 means "no such process right now"; < 0 means the
 *                         unit type has no such process at all (see the error messages below).
 * error:                  bus error that receives a client-visible description of the first failure.
 *
 * Returns 0 on success. On failure returns the value produced by the sd_bus_error_set*() or
 * log_unit_warning_errno() call that reported it (presumably a negative errno-style code — confirm
 * against those helpers). */
int unit_kill_common(
                Unit *u,
                KillWho who,
                int signo,
                pid_t main_pid,
                pid_t control_pid,
                sd_bus_error *error) {

        int r = 0;
        bool killed = false;

        /* This is the common implementation for explicit user-requested killing of unit processes, shared by
         * various unit types. Do not confuse with unit_kill_context(), which is what we use when we want to
         * stop a service ourselves. */

        /* Requests that target specifically the main or control process fail early if there is none. */
        if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) {
                if (main_pid < 0)
                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type));
                if (main_pid == 0)
                        return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill");
        }

        if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) {
                if (control_pid < 0)
                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type));
                if (control_pid == 0)
                        return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill");
        }

        if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL))
                if (control_pid > 0) {
                        _cleanup_free_ char *comm = NULL;
                        (void) get_process_comm(control_pid, &comm);

                        if (kill(control_pid, signo) < 0) {
                                /* Report this failure both to the logs and to the client */
                                sd_bus_error_set_errnof(
                                                error, errno,
                                                "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m",
                                                signal_to_string(signo), control_pid, strna(comm));
                                r = log_unit_warning_errno(
                                                u, errno,
                                                "Failed to send signal SIG%s to control process " PID_FMT " (%s) on client request: %m",
                                                signal_to_string(signo), control_pid, strna(comm));
                        } else {
                                log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.",
                                              signal_to_string(signo), control_pid, strna(comm));
                                killed = true;
                        }
                }

        if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL))
                if (main_pid > 0) {
                        _cleanup_free_ char *comm = NULL;
                        (void) get_process_comm(main_pid, &comm);

                        if (kill(main_pid, signo) < 0) {
                                /* Only fill in the bus error if no earlier failure was recorded, so the
                                 * client sees the first problem; the log always gets every failure. */
                                if (r == 0)
                                        sd_bus_error_set_errnof(
                                                        error, errno,
                                                        "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m",
                                                        signal_to_string(signo), main_pid, strna(comm));

                                r = log_unit_warning_errno(
                                                u, errno,
                                                "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m",
                                                signal_to_string(signo), main_pid, strna(comm));
                        } else {
                                log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.",
                                              signal_to_string(signo), main_pid, strna(comm));
                                killed = true;
                        }
                }

        if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path) {
                _cleanup_set_free_ Set *pid_set = NULL;
                int q;

                /* Exclude the main/control pids from being killed via the cgroup */
                pid_set = unit_pid_set(main_pid, control_pid);
                if (!pid_set)
                        return log_oom();

                q = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, signo, 0, pid_set, kill_common_log, u);
                if (q < 0) {
                        /* -ESRCH and -ENOENT are not reported as failures here. */
                        if (!IN_SET(q, -ESRCH, -ENOENT)) {
                                if (r == 0)
                                        sd_bus_error_set_errnof(
                                                        error, q,
                                                        "Failed to send signal SIG%s to auxiliary processes: %m",
                                                        signal_to_string(signo));

                                r = log_unit_warning_errno(
                                                u, q,
                                                "Failed to send signal SIG%s to auxiliary processes on client request: %m",
                                                signal_to_string(signo));
                        }
                } else
                        killed = true;
        }

        /* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */
        if (r == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL))
                return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill");

        return r;
}
u->manager->unit_file_scope, + NULL, + basename(u->fragment_path), + NULL); + + return u->unit_file_preset; +} + +Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target) { + assert(ref); + assert(source); + assert(target); + + if (ref->target) + unit_ref_unset(ref); + + ref->source = source; + ref->target = target; + LIST_PREPEND(refs_by_target, target->refs_by_target, ref); + return target; +} + +void unit_ref_unset(UnitRef *ref) { + assert(ref); + + if (!ref->target) + return; + + /* We are about to drop a reference to the unit, make sure the garbage collection has a look at it as it might + * be unreferenced now. */ + unit_add_to_gc_queue(ref->target); + + LIST_REMOVE(refs_by_target, ref->target->refs_by_target, ref); + ref->source = ref->target = NULL; +} + +static int user_from_unit_name(Unit *u, char **ret) { + + static const uint8_t hash_key[] = { + 0x58, 0x1a, 0xaf, 0xe6, 0x28, 0x58, 0x4e, 0x96, + 0xb4, 0x4e, 0xf5, 0x3b, 0x8c, 0x92, 0x07, 0xec + }; + + _cleanup_free_ char *n = NULL; + int r; + + r = unit_name_to_prefix(u->id, &n); + if (r < 0) + return r; + + if (valid_user_group_name(n, 0)) { + *ret = TAKE_PTR(n); + return 0; + } + + /* If we can't use the unit name as a user name, then let's hash it and use that */ + if (asprintf(ret, "_du%016" PRIx64, siphash24(n, strlen(n), hash_key)) < 0) + return -ENOMEM; + + return 0; +} + +int unit_patch_contexts(Unit *u) { + CGroupContext *cc; + ExecContext *ec; + int r; + + assert(u); + + /* Patch in the manager defaults into the exec and cgroup + * contexts, _after_ the rest of the settings have been + * initialized */ + + ec = unit_get_exec_context(u); + if (ec) { + /* This only copies in the ones that need memory */ + for (unsigned i = 0; i < _RLIMIT_MAX; i++) + if (u->manager->rlimit[i] && !ec->rlimit[i]) { + ec->rlimit[i] = newdup(struct rlimit, u->manager->rlimit[i], 1); + if (!ec->rlimit[i]) + return -ENOMEM; + } + + if (MANAGER_IS_USER(u->manager) && + !ec->working_directory) { + + r = 
get_home_dir(&ec->working_directory); + if (r < 0) + return r; + + /* Allow user services to run, even if the + * home directory is missing */ + ec->working_directory_missing_ok = true; + } + + if (ec->private_devices) + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); + + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + + if (ec->protect_kernel_logs) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYSLOG); + + if (ec->protect_clock) + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_SYS_TIME) | (UINT64_C(1) << CAP_WAKE_ALARM)); + + if (ec->dynamic_user) { + if (!ec->user) { + r = user_from_unit_name(u, &ec->user); + if (r < 0) + return r; + } + + if (!ec->group) { + ec->group = strdup(ec->user); + if (!ec->group) + return -ENOMEM; + } + + /* If the dynamic user option is on, let's make sure that the unit can't leave its + * UID/GID around in the file system or on IPC objects. Hence enforce a strict + * sandbox. */ + + ec->private_tmp = true; + ec->remove_ipc = true; + ec->protect_system = PROTECT_SYSTEM_STRICT; + if (ec->protect_home == PROTECT_HOME_NO) + ec->protect_home = PROTECT_HOME_READ_ONLY; + + /* Make sure this service can neither benefit from SUID/SGID binaries nor create + * them. */ + ec->no_new_privileges = true; + ec->restrict_suid_sgid = true; + } + } + + cc = unit_get_cgroup_context(u); + if (cc && ec) { + + if (ec->private_devices && + cc->device_policy == CGROUP_DEVICE_POLICY_AUTO) + cc->device_policy = CGROUP_DEVICE_POLICY_CLOSED; + + if ((ec->root_image || !LIST_IS_EMPTY(ec->mount_images)) && + (cc->device_policy != CGROUP_DEVICE_POLICY_AUTO || cc->device_allow)) { + const char *p; + + /* When RootImage= or MountImages= is specified, the following devices are touched. 
*/ + FOREACH_STRING(p, "/dev/loop-control", "/dev/mapper/control") { + r = cgroup_add_device_allow(cc, p, "rw"); + if (r < 0) + return r; + } + FOREACH_STRING(p, "block-loop", "block-blkext", "block-device-mapper") { + r = cgroup_add_device_allow(cc, p, "rwm"); + if (r < 0) + return r; + } + + /* Make sure "block-loop" can be resolved, i.e. make sure "loop" shows up in /proc/devices. + * Same for mapper and verity. */ + FOREACH_STRING(p, "modprobe@loop.service", "modprobe@dm_mod.service", "modprobe@dm_verity.service") { + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, p, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } + } + + if (ec->protect_clock) { + r = cgroup_add_device_allow(cc, "char-rtc", "r"); + if (r < 0) + return r; + } + } + + return 0; +} + +ExecContext *unit_get_exec_context(Unit *u) { + size_t offset; + assert(u); + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->exec_context_offset; + if (offset <= 0) + return NULL; + + return (ExecContext*) ((uint8_t*) u + offset); +} + +KillContext *unit_get_kill_context(Unit *u) { + size_t offset; + assert(u); + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->kill_context_offset; + if (offset <= 0) + return NULL; + + return (KillContext*) ((uint8_t*) u + offset); +} + +CGroupContext *unit_get_cgroup_context(Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->cgroup_context_offset; + if (offset <= 0) + return NULL; + + return (CGroupContext*) ((uint8_t*) u + offset); +} + +ExecRuntime *unit_get_exec_runtime(Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->exec_runtime_offset; + if (offset <= 0) + return NULL; + + return *(ExecRuntime**) ((uint8_t*) u + offset); +} + +static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) { + assert(u); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return NULL; + + if (u->transient) /* Redirect drop-ins for transient units always into 
the transient directory. */ + return u->manager->lookup_paths.transient; + + if (flags & UNIT_PERSISTENT) + return u->manager->lookup_paths.persistent_control; + + if (flags & UNIT_RUNTIME) + return u->manager->lookup_paths.runtime_control; + + return NULL; +} + +char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf) { + char *ret = NULL; + + if (!s) + return NULL; + + /* Escapes the input string as requested. Returns the escaped string. If 'buf' is specified then the allocated + * return buffer pointer is also written to *buf, except if no escaping was necessary, in which case *buf is + * set to NULL, and the input pointer is returned as-is. This means the return value always contains a properly + * escaped version, but *buf when passed only contains a pointer if an allocation was necessary. If *buf is + * not specified, then the return value always needs to be freed. Callers can use this to optimize memory + * allocations. */ + + if (flags & UNIT_ESCAPE_SPECIFIERS) { + ret = specifier_escape(s); + if (!ret) + return NULL; + + s = ret; + } + + if (flags & UNIT_ESCAPE_C) { + char *a; + + a = cescape(s); + free(ret); + if (!a) + return NULL; + + ret = a; + } + + if (buf) { + *buf = ret; + return ret ?: (char*) s; + } + + return ret ?: strdup(s); +} + +char* unit_concat_strv(char **l, UnitWriteFlags flags) { + _cleanup_free_ char *result = NULL; + size_t n = 0, allocated = 0; + char **i; + + /* Takes a list of strings, escapes them, and concatenates them. 
This may be used to format command lines in a + * way suitable for ExecStart= stanzas */ + + STRV_FOREACH(i, l) { + _cleanup_free_ char *buf = NULL; + const char *p; + size_t a; + char *q; + + p = unit_escape_setting(*i, flags, &buf); + if (!p) + return NULL; + + a = (n > 0) + 1 + strlen(p) + 1; /* separating space + " + entry + " */ + if (!GREEDY_REALLOC(result, allocated, n + a + 1)) + return NULL; + + q = result + n; + if (n > 0) + *(q++) = ' '; + + *(q++) = '"'; + q = stpcpy(q, p); + *(q++) = '"'; + + n += a; + } + + if (!GREEDY_REALLOC(result, allocated, n + 1)) + return NULL; + + result[n] = 0; + + return TAKE_PTR(result); +} + +int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data) { + _cleanup_free_ char *p = NULL, *q = NULL, *escaped = NULL; + const char *dir, *wrapped; + int r; + + assert(u); + assert(name); + assert(data); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return 0; + + data = unit_escape_setting(data, flags, &escaped); + if (!data) + return -ENOMEM; + + /* Prefix the section header. If we are writing this out as transient file, then let's suppress this if the + * previous section header is the same */ + + if (flags & UNIT_PRIVATE) { + if (!UNIT_VTABLE(u)->private_section) + return -EINVAL; + + if (!u->transient_file || u->last_section_private < 0) + data = strjoina("[", UNIT_VTABLE(u)->private_section, "]\n", data); + else if (u->last_section_private == 0) + data = strjoina("\n[", UNIT_VTABLE(u)->private_section, "]\n", data); + } else { + if (!u->transient_file || u->last_section_private < 0) + data = strjoina("[Unit]\n", data); + else if (u->last_section_private > 0) + data = strjoina("\n[Unit]\n", data); + } + + if (u->transient_file) { + /* When this is a transient unit file in creation, then let's not create a new drop-in but instead + * write to the transient unit file. 
*/ + fputs(data, u->transient_file); + + if (!endswith(data, "\n")) + fputc('\n', u->transient_file); + + /* Remember which section we wrote this entry to */ + u->last_section_private = !!(flags & UNIT_PRIVATE); + return 0; + } + + dir = unit_drop_in_dir(u, flags); + if (!dir) + return -EINVAL; + + wrapped = strjoina("# This is a drop-in unit file extension, created via \"systemctl set-property\"\n" + "# or an equivalent operation. Do not edit.\n", + data, + "\n"); + + r = drop_in_file(dir, u->id, 50, name, &p, &q); + if (r < 0) + return r; + + (void) mkdir_p_label(p, 0755); + + /* Make sure the drop-in dir is registered in our path cache. This way we don't need to stupidly + * recreate the cache after every drop-in we write. */ + if (u->manager->unit_path_cache) { + r = set_put_strdup(&u->manager->unit_path_cache, p); + if (r < 0) + return r; + } + + r = write_string_file_atomic_label(q, wrapped); + if (r < 0) + return r; + + r = strv_push(&u->dropin_paths, q); + if (r < 0) + return r; + q = NULL; + + strv_uniq(u->dropin_paths); + + u->dropin_mtime = now(CLOCK_REALTIME); + + return 0; +} + +int unit_write_settingf(Unit *u, UnitWriteFlags flags, const char *name, const char *format, ...) { + _cleanup_free_ char *p = NULL; + va_list ap; + int r; + + assert(u); + assert(name); + assert(format); + + if (UNIT_WRITE_FLAGS_NOOP(flags)) + return 0; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return unit_write_setting(u, flags, name, p); +} + +int unit_make_transient(Unit *u) { + _cleanup_free_ char *path = NULL; + FILE *f; + + assert(u); + + if (!UNIT_VTABLE(u)->can_transient) + return -EOPNOTSUPP; + + (void) mkdir_p_label(u->manager->lookup_paths.transient, 0755); + + path = path_join(u->manager->lookup_paths.transient, u->id); + if (!path) + return -ENOMEM; + + /* Let's open the file we'll write the transient settings into. 
This file is kept open as long as we are + * creating the transient, and is closed in unit_load(), as soon as we start loading the file. */ + + RUN_WITH_UMASK(0022) { + f = fopen(path, "we"); + if (!f) + return -errno; + } + + safe_fclose(u->transient_file); + u->transient_file = f; + + free_and_replace(u->fragment_path, path); + + u->source_path = mfree(u->source_path); + u->dropin_paths = strv_free(u->dropin_paths); + u->fragment_mtime = u->source_mtime = u->dropin_mtime = 0; + + u->load_state = UNIT_STUB; + u->load_error = 0; + u->transient = true; + + unit_add_to_dbus_queue(u); + unit_add_to_gc_queue(u); + + fputs("# This is a transient unit file, created programmatically via the systemd API. Do not edit.\n", + u->transient_file); + + return 0; +} + +static int log_kill(pid_t pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(pid, &comm); + + /* Don't log about processes marked with brackets, under the assumption that these are temporary processes + only, like for example systemd's own PAM stub process. 
*/ + if (comm && comm[0] == '(') + return 0; + + log_unit_notice(userdata, + "Killing process " PID_FMT " (%s) with signal SIG%s.", + pid, + strna(comm), + signal_to_string(sig)); + + return 1; +} + +static int operation_to_signal(const KillContext *c, KillOperation k, bool *noteworthy) { + assert(c); + + switch (k) { + + case KILL_TERMINATE: + case KILL_TERMINATE_AND_LOG: + *noteworthy = false; + return c->kill_signal; + + case KILL_RESTART: + *noteworthy = false; + return restart_kill_signal(c); + + case KILL_KILL: + *noteworthy = true; + return c->final_kill_signal; + + case KILL_WATCHDOG: + *noteworthy = true; + return c->watchdog_signal; + + default: + assert_not_reached("KillOperation unknown"); + } +} + +int unit_kill_context( + Unit *u, + KillContext *c, + KillOperation k, + pid_t main_pid, + pid_t control_pid, + bool main_pid_alien) { + + bool wait_for_exit = false, send_sighup; + cg_kill_log_func_t log_func = NULL; + int sig, r; + + assert(u); + assert(c); + + /* Kill the processes belonging to this unit, in preparation for shutting the unit down. Returns > 0 + * if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common() + * which is used for user-requested killing of unit processes. 
*/ + + if (c->kill_mode == KILL_NONE) + return 0; + + bool noteworthy; + sig = operation_to_signal(c, k, ¬eworthy); + if (noteworthy) + log_func = log_kill; + + send_sighup = + c->send_sighup && + IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) && + sig != SIGHUP; + + if (main_pid > 0) { + if (log_func) + log_func(main_pid, sig, u); + + r = kill_and_sigcont(main_pid, sig); + if (r < 0 && r != -ESRCH) { + _cleanup_free_ char *comm = NULL; + (void) get_process_comm(main_pid, &comm); + + log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid, strna(comm)); + } else { + if (!main_pid_alien) + wait_for_exit = true; + + if (r != -ESRCH && send_sighup) + (void) kill(main_pid, SIGHUP); + } + } + + if (control_pid > 0) { + if (log_func) + log_func(control_pid, sig, u); + + r = kill_and_sigcont(control_pid, sig); + if (r < 0 && r != -ESRCH) { + _cleanup_free_ char *comm = NULL; + (void) get_process_comm(control_pid, &comm); + + log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid, strna(comm)); + } else { + wait_for_exit = true; + + if (r != -ESRCH && send_sighup) + (void) kill(control_pid, SIGHUP); + } + } + + if (u->cgroup_path && + (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) { + _cleanup_set_free_ Set *pid_set = NULL; + + /* Exclude the main/control pids from being killed via the cgroup */ + pid_set = unit_pid_set(main_pid, control_pid); + if (!pid_set) + return -ENOMEM; + + r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + sig, + CGROUP_SIGCONT|CGROUP_IGNORE_SELF, + pid_set, + log_func, u); + if (r < 0) { + if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT)) + log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", u->cgroup_path); + + } else if (r > 0) { + + /* FIXME: For now, on the legacy hierarchy, we will not wait for the cgroup members to die if + * we are running in a container or if this is a 
delegation unit, simply because cgroup + * notification is unreliable in these cases. It doesn't work at all in containers, and outside + * of containers it can be confused easily by left-over directories in the cgroup — which + * however should not exist in non-delegated units. On the unified hierarchy that's different, + * there we get proper events. Hence rely on them. */ + + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0 || + (detect_container() == 0 && !unit_cgroup_delegate(u))) + wait_for_exit = true; + + if (send_sighup) { + set_free(pid_set); + + pid_set = unit_pid_set(main_pid, control_pid); + if (!pid_set) + return -ENOMEM; + + (void) cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + SIGHUP, + CGROUP_IGNORE_SELF, + pid_set, + NULL, NULL); + } + } + } + + return wait_for_exit; +} + +int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) { + _cleanup_free_ char *p = NULL; + UnitDependencyInfo di; + int r; + + assert(u); + assert(path); + + /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these paths in + * the unit (from the path to the UnitDependencyInfo structure indicating how to the dependency came to + * be). However, we build a prefix table for all possible prefixes so that new appearing mount units can easily + * determine which units to make themselves a dependency of. 
*/ + + if (!path_is_absolute(path)) + return -EINVAL; + + r = hashmap_ensure_allocated(&u->requires_mounts_for, &path_hash_ops); + if (r < 0) + return r; + + p = strdup(path); + if (!p) + return -ENOMEM; + + path = path_simplify(p, true); + + if (!path_is_normalized(path)) + return -EPERM; + + if (hashmap_contains(u->requires_mounts_for, path)) + return 0; + + di = (UnitDependencyInfo) { + .origin_mask = mask + }; + + r = hashmap_put(u->requires_mounts_for, path, di.data); + if (r < 0) + return r; + p = NULL; + + char prefix[strlen(path) + 1]; + PATH_FOREACH_PREFIX_MORE(prefix, path) { + Set *x; + + x = hashmap_get(u->manager->units_requiring_mounts_for, prefix); + if (!x) { + _cleanup_free_ char *q = NULL; + + r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops); + if (r < 0) + return r; + + q = strdup(prefix); + if (!q) + return -ENOMEM; + + x = set_new(NULL); + if (!x) + return -ENOMEM; + + r = hashmap_put(u->manager->units_requiring_mounts_for, q, x); + if (r < 0) { + set_free(x); + return r; + } + q = NULL; + } + + r = set_put(x, u); + if (r < 0) + return r; + } + + return 0; +} + +int unit_setup_exec_runtime(Unit *u) { + ExecRuntime **rt; + size_t offset; + Unit *other; + void *v; + int r; + + offset = UNIT_VTABLE(u)->exec_runtime_offset; + assert(offset > 0); + + /* Check if there already is an ExecRuntime for this unit? 
*/ + rt = (ExecRuntime**) ((uint8_t*) u + offset); + if (*rt) + return 0; + + /* Try to get it from somebody else */ + HASHMAP_FOREACH_KEY(v, other, u->dependencies[UNIT_JOINS_NAMESPACE_OF]) { + r = exec_runtime_acquire(u->manager, NULL, other->id, false, rt); + if (r == 1) + return 1; + } + + return exec_runtime_acquire(u->manager, unit_get_exec_context(u), u->id, true, rt); +} + +int unit_setup_dynamic_creds(Unit *u) { + ExecContext *ec; + DynamicCreds *dcreds; + size_t offset; + + assert(u); + + offset = UNIT_VTABLE(u)->dynamic_creds_offset; + assert(offset > 0); + dcreds = (DynamicCreds*) ((uint8_t*) u + offset); + + ec = unit_get_exec_context(u); + assert(ec); + + if (!ec->dynamic_user) + return 0; + + return dynamic_creds_acquire(dcreds, u->manager, ec->user, ec->group); +} + +bool unit_type_supported(UnitType t) { + if (_unlikely_(t < 0)) + return false; + if (_unlikely_(t >= _UNIT_TYPE_MAX)) + return false; + + if (!unit_vtable[t]->supported) + return true; + + return unit_vtable[t]->supported(); +} + +void unit_warn_if_dir_nonempty(Unit *u, const char* where) { + int r; + + assert(u); + assert(where); + + r = dir_is_empty(where); + if (r > 0 || r == -ENOTDIR) + return; + if (r < 0) { + log_unit_warning_errno(u, r, "Failed to check directory %s: %m", where); + return; + } + + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where), + "WHERE=%s", where); +} + +int unit_fail_if_noncanonical(Unit *u, const char* where) { + _cleanup_free_ char *canonical_where = NULL; + int r; + + assert(u); + assert(where); + + r = chase_symlinks(where, NULL, CHASE_NONEXISTENT, &canonical_where, NULL); + if (r < 0) { + log_unit_debug_errno(u, r, "Failed to check %s for symlinks, ignoring: %m", where); + return 0; + } + + /* We will happily ignore a trailing slash (or any redundant slashes) */ + if (path_equal(where, 
/* Drops the UID (or GID) reference stored in *ref_uid, if there is one.
 *
 * ref_uid:            slot holding the referenced ID; reset to UID_INVALID afterwards.
 * destroy_now:        passed through unchanged to the manager callback.
 * _manager_unref_uid: manager-level function that releases the reference. */
static void unit_unref_uid_internal(
                Unit *u,
                uid_t *ref_uid,
                bool destroy_now,
                void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) {

        assert(u);
        assert(ref_uid);
        assert(_manager_unref_uid);

        /* Generic implementation of both unit_unref_uid() and unit_unref_gid(), under the assumption that uid_t and
         * gid_t are actually the same type, with the same validity rules.
         *
         * Drops a reference to UID/GID from a unit. */

        assert_cc(sizeof(uid_t) == sizeof(gid_t));
        assert_cc(UID_INVALID == (uid_t) GID_INVALID);

        /* Nothing referenced, nothing to drop */
        if (!uid_is_valid(*ref_uid))
                return;

        _manager_unref_uid(u->manager, *ref_uid, destroy_now);
        *ref_uid = UID_INVALID;
}
/* Records the UID/GID that a forked off process reported for this unit, and queues a D-Bus update for
 * the unit if unit_ref_uid_gid() returned > 0. */
void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) {
        int r;

        assert(u);

        /* This is invoked whenever one of the forked off processes lets us know the UID/GID its user name/group names
         * resolved to. We keep track of which UID/GID is currently assigned in order to be able to destroy its IPC
         * objects when no service references the UID/GID anymore. */

        r = unit_ref_uid_gid(u, uid, gid);
        if (r > 0)
                unit_add_to_dbus_queue(u);
}
/* Forks off a helper process named "(sd-rmrf)" that removes every path in 'paths', and registers the
 * helper's PID with the unit via unit_watch_pid().
 *
 * On success returns 0 in the parent and stores the helper's PID in *ret_pid. Returns a negative
 * value if forking or watching the PID fails. The child never returns from this function: it exits
 * with EXIT_SUCCESS if all removals worked and EXIT_FAILURE if any of them failed. */
int unit_fork_and_watch_rm_rf(Unit *u, char **paths, pid_t *ret_pid) {
        pid_t pid;
        int r;

        assert(u);
        assert(ret_pid);

        r = unit_fork_helper_process(u, "(sd-rmrf)", &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                /* We are in the child */
                int ret = EXIT_SUCCESS;
                char **i;

                /* Keep going after a failure so that as much as possible gets removed */
                STRV_FOREACH(i, paths) {
                        r = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK);
                        if (r < 0) {
                                log_error_errno(r, "Failed to remove '%s': %m", *i);
                                ret = EXIT_FAILURE;
                        }
                }

                _exit(ret);
        }

        /* We are in the parent. NOTE(review): meaning of the 'true' argument is not visible here —
         * confirm against unit_watch_pid(). */
        r = unit_watch_pid(u, pid, true);
        if (r < 0)
                return r;

        *ret_pid = pid;
        return 0;
}
*/ + + if (mask == 0) + return; + + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + bool done; + + do { + UnitDependencyInfo di; + Unit *other; + + done = true; + + HASHMAP_FOREACH_KEY(di.data, other, u->dependencies[d]) { + if (FLAGS_SET(~mask, di.origin_mask)) + continue; + di.origin_mask &= ~mask; + unit_update_dependency_mask(u, d, other, di); + + /* We updated the dependency from our unit to the other unit now. But most dependencies + * imply a reverse dependency. Hence, let's delete that one too. For that we go through + * all dependency types on the other unit and delete all those which point to us and + * have the right mask set. */ + + for (UnitDependency q = 0; q < _UNIT_DEPENDENCY_MAX; q++) { + UnitDependencyInfo dj; + + dj.data = hashmap_get(other->dependencies[q], u); + if (FLAGS_SET(~mask, dj.destination_mask)) + continue; + dj.destination_mask &= ~mask; + + unit_update_dependency_mask(other, q, u, dj); + } + + unit_add_to_gc_queue(other); + + done = false; + break; + } + + } while (!done); + } +} + +static int unit_get_invocation_path(Unit *u, char **ret) { + char *p; + int r; + + assert(u); + assert(ret); + + if (MANAGER_IS_SYSTEM(u->manager)) + p = strjoin("/run/systemd/units/invocation:", u->id); + else { + _cleanup_free_ char *user_path = NULL; + r = xdg_user_runtime_dir(&user_path, "/systemd/units/invocation:"); + if (r < 0) + return r; + p = strjoin(user_path, u->id); + } + + if (!p) + return -ENOMEM; + + *ret = p; + return 0; +} + +static int unit_export_invocation_id(Unit *u) { + _cleanup_free_ char *p = NULL; + int r; + + assert(u); + + if (u->exported_invocation_id) + return 0; + + if (sd_id128_is_null(u->invocation_id)) + return 0; + + r = unit_get_invocation_path(u, &p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to get invocation path: %m"); + + r = symlink_atomic_label(u->invocation_id_string, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create invocation ID symlink %s: %m", p); + + 
u->exported_invocation_id = true; + return 0; +} + +/* Exports c->log_level_max as a single digit, as the target of the symlink + * /run/systemd/units/log-level-max:<unit id>. Nothing is done if it was already exported or if the value is + * negative. */ +static int unit_export_log_level_max(Unit *u, const ExecContext *c) { + const char *p; + char buf[2]; + int r; + + assert(u); + assert(c); + + if (u->exported_log_level_max) + return 0; + + if (c->log_level_max < 0) + return 0; + + assert(c->log_level_max <= 7); + + buf[0] = '0' + c->log_level_max; + buf[1] = 0; + + p = strjoina("/run/systemd/units/log-level-max:", u->id); + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create maximum log level symlink %s: %m", p); + + u->exported_log_level_max = true; + return 0; +} + +/* Writes the unit's extra log fields to /run/systemd/units/log-extra-fields:<unit id>. Each field is stored + * as a little-endian 64bit length followed by the field data. The data is written to a temporary file + * first and then renamed into place; if writing or renaming fails the temporary file is removed again. */ +static int unit_export_log_extra_fields(Unit *u, const ExecContext *c) { + _cleanup_close_ int fd = -1; + struct iovec *iovec; + const char *p; + char *pattern; + le64_t *sizes; + ssize_t n; + int r; + + /* Like the other exporters, insist on valid arguments before dereferencing them */ + assert(u); + assert(c); + + if (u->exported_log_extra_fields) + return 0; + + if (c->n_log_extra_fields <= 0) + return 0; + + sizes = newa(le64_t, c->n_log_extra_fields); + iovec = newa(struct iovec, c->n_log_extra_fields * 2); + + /* Two iovec entries per field: first the length, then the payload */ + for (size_t i = 0; i < c->n_log_extra_fields; i++) { + sizes[i] = htole64(c->log_extra_fields[i].iov_len); + + iovec[i*2] = IOVEC_MAKE(sizes + i, sizeof(le64_t)); + iovec[i*2+1] = c->log_extra_fields[i]; + } + + p = strjoina("/run/systemd/units/log-extra-fields:", u->id); + pattern = strjoina(p, ".XXXXXX"); + + fd = mkostemp_safe(pattern); + if (fd < 0) + return log_unit_debug_errno(u, fd, "Failed to create extra fields file %s: %m", p); + + n = writev(fd, iovec, c->n_log_extra_fields*2); + if (n < 0) { + r = log_unit_debug_errno(u, errno, "Failed to write extra fields: %m"); + goto fail; + } + + (void) fchmod(fd, 0644); + + if (rename(pattern, p) < 0) { + r = log_unit_debug_errno(u, errno, "Failed to rename extra fields file: %m"); + goto fail; + } + + u->exported_log_extra_fields = true; + return 0; + +fail: + (void) unlink(pattern); + return r; +} + +static int unit_export_log_ratelimit_interval(Unit *u, const ExecContext *c) { + _cleanup_free_ char *buf 
= NULL; + const char *p; + int r; + + assert(u); + assert(c); + + if (u->exported_log_ratelimit_interval) + return 0; + + if (c->log_ratelimit_interval_usec == 0) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id); + + if (asprintf(&buf, "%" PRIu64, c->log_ratelimit_interval_usec) < 0) + return log_oom(); + + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create log rate limit interval symlink %s: %m", p); + + u->exported_log_ratelimit_interval = true; + return 0; +} + +/* Exports c->log_ratelimit_burst, formatted as a decimal string, as the target of the symlink + * /run/systemd/units/log-rate-limit-burst:<unit id>. Nothing is done if the value was already exported or + * is 0. */ +static int unit_export_log_ratelimit_burst(Unit *u, const ExecContext *c) { + _cleanup_free_ char *buf = NULL; + const char *p; + int r; + + assert(u); + assert(c); + + if (u->exported_log_ratelimit_burst) + return 0; + + if (c->log_ratelimit_burst == 0) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id); + + if (asprintf(&buf, "%u", c->log_ratelimit_burst) < 0) + return log_oom(); + + r = symlink_atomic(buf, p); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to create log rate limit burst symlink %s: %m", p); + + u->exported_log_ratelimit_burst = true; + return 0; +} + +void unit_export_state_files(Unit *u) { + const ExecContext *c; + + assert(u); + + if (!u->id) + return; + + if (MANAGER_IS_TEST_RUN(u->manager)) + return; + + /* Exports a couple of unit properties to /run/systemd/units/, so that journald can quickly query this data + * from there. Ideally, journald would use IPC to query this, like everybody else, but that's hard, as long as + * the IPC system itself and PID 1 also log to the journal. + * + * Note that these files really shouldn't be considered API for anyone else, as using a runtime file system as + * an IPC replacement is not compatible with today's world of file system namespaces. However, this doesn't really + * apply to communication between the journal and systemd, as we assume that these two daemons live in the same + * namespace at least. 
+ * + * Note that some of the "files" exported here are actually symlinks and not regular files. Symlinks work + * better for storing small bits of data, in particular as we can write them with two system calls, and read + * them with one. */ + + (void) unit_export_invocation_id(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + c = unit_get_exec_context(u); + if (c) { + (void) unit_export_log_level_max(u, c); + (void) unit_export_log_extra_fields(u, c); + (void) unit_export_log_ratelimit_interval(u, c); + (void) unit_export_log_ratelimit_burst(u, c); + } +} + +/* Removes the per-unit state files and symlinks that were exported for this unit. An entry is only unlinked + * if its exported_* flag is set, and the flag is reset once the unlink has been attempted; unlink() failures + * are ignored. Everything but the invocation ID entry is only handled by the system manager. */ +void unit_unlink_state_files(Unit *u) { + const char *p; + + assert(u); + + if (!u->id) + return; + + /* Undoes the effect of unit_export_state_files() */ + + if (u->exported_invocation_id) { + _cleanup_free_ char *invocation_path = NULL; + int r = unit_get_invocation_path(u, &invocation_path); + if (r >= 0) { + (void) unlink(invocation_path); + u->exported_invocation_id = false; + } + } + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (u->exported_log_level_max) { + p = strjoina("/run/systemd/units/log-level-max:", u->id); + (void) unlink(p); + + u->exported_log_level_max = false; + } + + if (u->exported_log_extra_fields) { + /* Must be the very path unit_export_log_extra_fields() renames its file to, otherwise the file is + * left behind. */ + p = strjoina("/run/systemd/units/log-extra-fields:", u->id); + (void) unlink(p); + + u->exported_log_extra_fields = false; + } + + if (u->exported_log_ratelimit_interval) { + p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id); + (void) unlink(p); + + u->exported_log_ratelimit_interval = false; + } + + if (u->exported_log_ratelimit_burst) { + p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id); + (void) unlink(p); + + u->exported_log_ratelimit_burst = false; + } +} + +int unit_prepare_exec(Unit *u) { + int r; + + assert(u); + + /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable. + * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. 
*/ + r = bpf_firewall_load_custom(u); + if (r < 0) + return r; + + /* Prepares everything so that we can fork off a process for this unit */ + + (void) unit_realize_cgroup(u); + + if (u->reset_accounting) { + (void) unit_reset_accounting(u); + u->reset_accounting = false; + } + + unit_export_state_files(u); + + r = unit_setup_exec_runtime(u); + if (r < 0) + return r; + + r = unit_setup_dynamic_creds(u); + if (r < 0) + return r; + + return 0; +} + +static bool ignore_leftover_process(const char *comm) { + return comm && comm[0] == '('; /* Most likely our own helper process (PAM?), ignore */ +} + +/* Logs a warning about process 'pid' being left over while the unit passed as 'userdata' is started. Returns 0 + * if the process was skipped because its comm name starts with '(', and 1 if it was logged. 'sig' is not + * used. Presumably meant to be passed as log_func to unit_warn_leftover_processes() (confirm at call sites). */ +int unit_log_leftover_process_start(pid_t pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + /* Best effort: if the name cannot be read, comm stays NULL and strna() substitutes a placeholder */ + (void) get_process_comm(pid, &comm); + + if (ignore_leftover_process(comm)) + return 0; + + /* During start we print a warning */ + + log_unit_warning(userdata, + "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n" + "This usually indicates unclean termination of a previous run, or service implementation deficiencies.", + pid, strna(comm)); + + return 1; +} + +/* Counterpart of unit_log_leftover_process_start() for the stop case: same skip rule and return values, but + * the message is logged at info level. */ +int unit_log_leftover_process_stop(pid_t pid, int sig, void *userdata) { + _cleanup_free_ char *comm = NULL; + + (void) get_process_comm(pid, &comm); + + if (ignore_leftover_process(comm)) + return 0; + + /* During stop we only print an informational message */ + + log_unit_info(userdata, + "Unit process " PID_FMT " (%s) remains running after unit stopped.", + pid, strna(comm)); + + return 1; +} + +/* Runs cg_kill_recursive() on the unit's cgroup with signal 0, passing 'log_func' as callback and the unit as + * its userdata. Returns 0 right away if the unit has no cgroup path, otherwise whatever cg_kill_recursive() + * returns. NOTE(review): presumably signal 0 means no process is actually killed and log_func is merely + * invoked per process found; confirm against cg_kill_recursive(). */ +int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) { + assert(u); + + (void) unit_pick_cgroup_path(u); + + if (!u->cgroup_path) + return 0; + + return cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, 0, 0, NULL, log_func, u); +} + +bool unit_needs_console(Unit *u) { + ExecContext *ec; + UnitActiveState state; + + assert(u); + + state = unit_active_state(u); + + if (UNIT_IS_INACTIVE_OR_FAILED(state)) + return false; + + if 
(UNIT_VTABLE(u)->needs_console) + return UNIT_VTABLE(u)->needs_console(u); + + /* If this unit type doesn't implement this call, let's use a generic fallback implementation: */ + ec = unit_get_exec_context(u); + if (!ec) + return false; + + return exec_context_may_touch_console(ec); +} + +const char *unit_label_path(const Unit *u) { + const char *p; + + assert(u); + + /* Returns the file system path to use for MAC access decisions, i.e. the file to read the SELinux label off + * when validating access checks. */ + + p = u->source_path ?: u->fragment_path; + if (!p) + return NULL; + + /* If a unit is masked, then don't read the SELinux label of /dev/null, as that really makes no sense */ + if (null_or_empty_path(p) > 0) + return NULL; + + return p; +} + +int unit_pid_attachable(Unit *u, pid_t pid, sd_bus_error *error) { + int r; + + assert(u); + + /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our manager itself, + * and not a kernel thread either */ + + /* First, a simple range check */ + if (!pid_is_valid(pid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier " PID_FMT " is not valid.", pid); + + /* Some extra safety check */ + if (pid == 1 || pid == getpid_cached()) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid); + + /* Don't even begin to bother with kernel threads */ + r = is_kernel_thread(pid); + if (r == -ESRCH) + return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid); + if (r > 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid); + + return 0; +} + +void unit_log_success(Unit *u) { + assert(u); + + log_struct(LOG_INFO, + "MESSAGE_ID=" 
SD_MESSAGE_UNIT_SUCCESS_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Succeeded.")); +} + +void unit_log_failure(Unit *u, const char *result) { + assert(u); + assert(result); + + log_struct(LOG_WARNING, + "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILURE_RESULT_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Failed with result '%s'.", result), + "UNIT_RESULT=%s", result); +} + +void unit_log_skip(Unit *u, const char *result) { + assert(u); + assert(result); + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_UNIT_SKIPPED_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "Skipped due to '%s'.", result), + "UNIT_RESULT=%s", result); +} + +void unit_log_process_exit( + Unit *u, + const char *kind, + const char *command, + bool success, + int code, + int status) { + + int level; + + assert(u); + assert(kind); + + /* If this is a successful exit, let's log about the exit code on DEBUG level. If this is a failure + * and the process exited on its own via exit(), then let's make this a NOTICE, under the assumption + * that the service already logged the reason at a higher log level on its own. Otherwise, make it a + * WARNING. */ + if (success) + level = LOG_DEBUG; + else if (code == CLD_EXITED) + level = LOG_NOTICE; + else + level = LOG_WARNING; + + log_struct(level, + "MESSAGE_ID=" SD_MESSAGE_UNIT_PROCESS_EXIT_STR, + LOG_UNIT_MESSAGE(u, "%s exited, code=%s, status=%i/%s", + kind, + sigchld_code_to_string(code), status, + strna(code == CLD_EXITED + ? exit_status_to_string(status, EXIT_STATUS_FULL) + : signal_to_string(status))), + "EXIT_CODE=%s", sigchld_code_to_string(code), + "EXIT_STATUS=%i", status, + "COMMAND=%s", strna(command), + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u)); +} + +int unit_exit_status(Unit *u) { + assert(u); + + /* Returns the exit status to propagate for the most recent cycle of this unit. Returns a value in the range + * 0…255 if there's something to propagate. 
EOPNOTSUPP if the concept does not apply to this unit type, ENODATA + * if no data is currently known (for example because the unit hasn't deactivated yet) and EBADE if the main + * service process has exited abnormally (signal/coredump). */ + + if (!UNIT_VTABLE(u)->exit_status) + return -EOPNOTSUPP; + + return UNIT_VTABLE(u)->exit_status(u); +} + +int unit_failure_action_exit_status(Unit *u) { + int r; + + assert(u); + + /* Returns the exit status to propagate on failure, or an error if there's nothing to propagate */ + + if (u->failure_action_exit_status >= 0) + return u->failure_action_exit_status; + + r = unit_exit_status(u); + if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */ + return 255; + + return r; +} + +int unit_success_action_exit_status(Unit *u) { + int r; + + assert(u); + + /* Returns the exit status to propagate on success, or an error if there's nothing to propagate */ + + if (u->success_action_exit_status >= 0) + return u->success_action_exit_status; + + r = unit_exit_status(u); + if (r == -EBADE) /* Exited, but not cleanly (i.e. 
by signal or such) */ + return 255; + + return r; +} + +int unit_test_trigger_loaded(Unit *u) { + Unit *trigger; + + /* Tests whether the unit to trigger is loaded */ + + trigger = UNIT_TRIGGER(u); + if (!trigger) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), + "Refusing to start, no unit to trigger."); + if (trigger->load_state != UNIT_LOADED) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), + "Refusing to start, unit %s to trigger not loaded.", trigger->id); + + return 0; +} + +void unit_destroy_runtime_data(Unit *u, const ExecContext *context) { + assert(u); + assert(context); + + if (context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO || + (context->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART && !unit_will_restart(u))) + exec_context_destroy_runtime_directory(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]); + + exec_context_destroy_credentials(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id); +} + +int unit_clean(Unit *u, ExecCleanMask mask) { + UnitActiveState state; + + assert(u); + + /* Special return values: + * + * -EOPNOTSUPP → cleaning not supported for this unit type + * -EUNATCH → cleaning not defined for this resource type + * -EBUSY → unit currently can't be cleaned since it's running or not properly loaded, or has + * a job queued or similar + */ + + if (!UNIT_VTABLE(u)->clean) + return -EOPNOTSUPP; + + if (mask == 0) + return -EUNATCH; + + if (u->load_state != UNIT_LOADED) + return -EBUSY; + + if (u->job) + return -EBUSY; + + state = unit_active_state(u); + if (!IN_SET(state, UNIT_INACTIVE)) + return -EBUSY; + + return UNIT_VTABLE(u)->clean(u, mask); +} + +int unit_can_clean(Unit *u, ExecCleanMask *ret) { + assert(u); + + if (!UNIT_VTABLE(u)->clean || + u->load_state != UNIT_LOADED) { + *ret = 0; + return 0; + } + + /* When the clean() method is set, can_clean() really should be set too */ + assert(UNIT_VTABLE(u)->can_clean); + + return UNIT_VTABLE(u)->can_clean(u, ret); +} + +bool 
unit_can_freeze(Unit *u) { + assert(u); + + if (UNIT_VTABLE(u)->can_freeze) + return UNIT_VTABLE(u)->can_freeze(u); + + return UNIT_VTABLE(u)->freeze; +} + +void unit_frozen(Unit *u) { + assert(u); + + u->freezer_state = FREEZER_FROZEN; + + bus_unit_send_pending_freezer_message(u); +} + +void unit_thawed(Unit *u) { + assert(u); + + u->freezer_state = FREEZER_RUNNING; + + bus_unit_send_pending_freezer_message(u); +} + +static int unit_freezer_action(Unit *u, FreezerAction action) { + UnitActiveState s; + int (*method)(Unit*); + int r; + + assert(u); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + + method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw; + if (!method || !cg_freezer_supported()) + return -EOPNOTSUPP; + + if (u->job) + return -EBUSY; + + if (u->load_state != UNIT_LOADED) + return -EHOSTDOWN; + + s = unit_active_state(u); + if (s != UNIT_ACTIVE) + return -EHOSTDOWN; + + if (IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) + return -EALREADY; + + r = method(u); + if (r <= 0) + return r; + + return 1; +} + +int unit_freeze(Unit *u) { + return unit_freezer_action(u, FREEZER_FREEZE); +} + +int unit_thaw(Unit *u) { + return unit_freezer_action(u, FREEZER_THAW); +} + +/* Wrappers around low-level cgroup freezer operations common for service and scope units */ +int unit_freeze_vtable_common(Unit *u) { + return unit_cgroup_freezer_action(u, FREEZER_FREEZE); +} + +int unit_thaw_vtable_common(Unit *u) { + return unit_cgroup_freezer_action(u, FREEZER_THAW); +} + +static const char* const collect_mode_table[_COLLECT_MODE_MAX] = { + [COLLECT_INACTIVE] = "inactive", + [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed", +}; + +DEFINE_STRING_TABLE_LOOKUP(collect_mode, CollectMode); diff --git a/src/core/unit.h b/src/core/unit.h new file mode 100644 index 0000000..02b2b24 --- /dev/null +++ b/src/core/unit.h @@ -0,0 +1,944 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> 
+#include <stdlib.h> +#include <unistd.h> + +#include "bpf-program.h" +#include "condition.h" +#include "emergency-action.h" +#include "list.h" +#include "show-status.h" +#include "set.h" +#include "unit-file.h" +#include "cgroup.h" + +typedef struct UnitRef UnitRef; + +typedef enum KillOperation { + KILL_TERMINATE, + KILL_TERMINATE_AND_LOG, + KILL_RESTART, + KILL_KILL, + KILL_WATCHDOG, + _KILL_OPERATION_MAX, + _KILL_OPERATION_INVALID = -1 +} KillOperation; + +typedef enum CollectMode { + COLLECT_INACTIVE, + COLLECT_INACTIVE_OR_FAILED, + _COLLECT_MODE_MAX, + _COLLECT_MODE_INVALID = -1, +} CollectMode; + +static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) { + return IN_SET(t, UNIT_ACTIVE, UNIT_RELOADING); +} + +static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) { + return IN_SET(t, UNIT_ACTIVE, UNIT_ACTIVATING, UNIT_RELOADING); +} + +static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) { + return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED, UNIT_DEACTIVATING); +} + +static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) { + return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED); +} + +static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) { + return t >= 0 && t < _UNIT_LOAD_STATE_MAX && t != UNIT_STUB && t != UNIT_MERGED; +} + +/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We + * use this so that we can selectively flush out parts of dependencies again. Note that the same dependency might be + * created as a result of multiple "reasons", hence the bitmask. */ +typedef enum UnitDependencyMask { + /* Configured directly by the unit file, .wants/.requires symlink or drop-in, or as an immediate result of a + * non-dependency option configured that way. 
*/ + UNIT_DEPENDENCY_FILE = 1 << 0, + + /* As unconditional implicit dependency (not affected by unit configuration — except by the unit name and + * type) */ + UNIT_DEPENDENCY_IMPLICIT = 1 << 1, + + /* A dependency effected by DefaultDependencies=yes. Note that dependencies marked this way are conceptually + * just a subset of UNIT_DEPENDENCY_FILE, as DefaultDependencies= is itself a unit file setting that can only + * be set in unit files. We make this two separate bits only to help debugging how dependencies came to be. */ + UNIT_DEPENDENCY_DEFAULT = 1 << 2, + + /* A dependency created from udev rules */ + UNIT_DEPENDENCY_UDEV = 1 << 3, + + /* A dependency created because of some unit's RequiresMountsFor= setting */ + UNIT_DEPENDENCY_PATH = 1 << 4, + + /* A dependency created because of data read from /proc/self/mountinfo and no other configuration source */ + UNIT_DEPENDENCY_MOUNTINFO_IMPLICIT = 1 << 5, + + /* A dependency created because of data read from /proc/self/mountinfo, but conditionalized by + * DefaultDependencies= and thus also involving configuration from UNIT_DEPENDENCY_FILE sources */ + UNIT_DEPENDENCY_MOUNTINFO_DEFAULT = 1 << 6, + + /* A dependency created because of data read from /proc/swaps and no other configuration source */ + UNIT_DEPENDENCY_PROC_SWAP = 1 << 7, + + _UNIT_DEPENDENCY_MASK_FULL = (1 << 8) - 1, +} UnitDependencyMask; + +/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can + * be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin + * and the destination of a dependency might have created it. */ +typedef union UnitDependencyInfo { + void *data; + struct { + UnitDependencyMask origin_mask:16; + UnitDependencyMask destination_mask:16; + } _packed_; +} UnitDependencyInfo; + +#include "job.h" + +struct UnitRef { + /* Keeps tracks of references to a unit. 
This is useful so + * that we can merge two units if necessary and correct all + * references to them */ + + Unit *source, *target; + LIST_FIELDS(UnitRef, refs_by_target); +}; + +typedef struct Unit { + Manager *manager; + + UnitType type; + UnitLoadState load_state; + Unit *merged_into; + + FreezerState freezer_state; + sd_bus_message *pending_freezer_message; + + char *id; /* The one special name that we use for identification */ + char *instance; + + Set *aliases; /* All the other names. */ + + /* For each dependency type we maintain a Hashmap whose key is the Unit* object, and the value encodes why the + * dependency exists, using the UnitDependencyInfo type */ + Hashmap *dependencies[_UNIT_DEPENDENCY_MAX]; + + /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the UnitDependencyInfo type */ + Hashmap *requires_mounts_for; + + char *description; + char **documentation; + + char *fragment_path; /* if loaded from a config file this is the primary path to it */ + char *source_path; /* if converted, the source file */ + char **dropin_paths; + + usec_t fragment_not_found_timestamp_hash; + usec_t fragment_mtime; + usec_t source_mtime; + usec_t dropin_mtime; + + /* If this is a transient unit we are currently writing, this is where we are writing it to */ + FILE *transient_file; + + /* If there is something to do with this unit, then this is the installed job for it */ + Job *job; + + /* JOB_NOP jobs are special and can be installed without disturbing the real job. 
*/ + Job *nop_job; + + /* The slot used for watching NameOwnerChanged signals */ + sd_bus_slot *match_bus_slot; + sd_bus_slot *get_name_owner_slot; + + /* References to this unit from clients */ + sd_bus_track *bus_track; + char **deserialized_refs; + + /* Job timeout and action to take */ + usec_t job_timeout; + usec_t job_running_timeout; + bool job_running_timeout_set:1; + EmergencyAction job_timeout_action; + char *job_timeout_reboot_arg; + + /* References to this unit (list of UnitRef objects) */ + LIST_HEAD(UnitRef, refs_by_target); + + /* Conditions to check */ + LIST_HEAD(Condition, conditions); + LIST_HEAD(Condition, asserts); + + dual_timestamp condition_timestamp; + dual_timestamp assert_timestamp; + + /* Updated whenever the low-level state changes */ + dual_timestamp state_change_timestamp; + + /* Updated whenever the (high-level) active state enters or leaves the active or inactive states */ + dual_timestamp inactive_exit_timestamp; + dual_timestamp active_enter_timestamp; + dual_timestamp active_exit_timestamp; + dual_timestamp inactive_enter_timestamp; + + UnitRef slice; + + /* Per type list */ + LIST_FIELDS(Unit, units_by_type); + + /* Load queue */ + LIST_FIELDS(Unit, load_queue); + + /* D-Bus queue */ + LIST_FIELDS(Unit, dbus_queue); + + /* Cleanup queue */ + LIST_FIELDS(Unit, cleanup_queue); + + /* GC queue */ + LIST_FIELDS(Unit, gc_queue); + + /* CGroup realize members queue */ + LIST_FIELDS(Unit, cgroup_realize_queue); + + /* cgroup empty queue */ + LIST_FIELDS(Unit, cgroup_empty_queue); + + /* cgroup OOM queue */ + LIST_FIELDS(Unit, cgroup_oom_queue); + + /* Target dependencies queue */ + LIST_FIELDS(Unit, target_deps_queue); + + /* Queue of units with StopWhenUnneeded set that shall be checked for clean-up. */ + LIST_FIELDS(Unit, stop_when_unneeded_queue); + + /* PIDs we keep an eye on. 
Note that a unit might have many + * more, but these are the ones we care enough about to + * process SIGCHLD for */ + Set *pids; + + /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; + + /* Used during GC sweeps */ + unsigned gc_marker; + + /* Error code when we didn't manage to load the unit (negative) */ + int load_error; + + /* Put a ratelimit on unit starting */ + RateLimit start_ratelimit; + EmergencyAction start_limit_action; + + /* What to do on failure or success */ + EmergencyAction success_action, failure_action; + int success_action_exit_status, failure_action_exit_status; + char *reboot_arg; + + /* Make sure we never enter endless loops with the check unneeded logic, or the BindsTo= logic */ + RateLimit auto_stop_ratelimit; + + /* Reference to a specific UID/GID */ + uid_t ref_uid; + gid_t ref_gid; + + /* Cached unit file state and preset */ + UnitFileState unit_file_state; + int unit_file_preset; + + /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */ + nsec_t cpu_usage_base; + nsec_t cpu_usage_last; /* the most recently read value */ + + /* The current counter of processes sent SIGKILL by systemd-oomd */ + uint64_t managed_oom_kill_last; + + /* The current counter of the oom_kill field in the memory.events cgroup attribute */ + uint64_t oom_kill_last; + + /* Where the io.stat data was at the time the unit was started */ + uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; + uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */ + + /* Counterparts in the cgroup filesystem */ + char *cgroup_path; + CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? 
(only relevant on cgroup v1) */ + CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */ + CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */ + CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ + + /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */ + int cgroup_control_inotify_wd; + int cgroup_memory_inotify_wd; + + /* Device Controller BPF program */ + BPFProgram *bpf_device_control_installed; + + /* IP BPF Firewalling/accounting */ + int ip_accounting_ingress_map_fd; + int ip_accounting_egress_map_fd; + + int ipv4_allow_map_fd; + int ipv6_allow_map_fd; + int ipv4_deny_map_fd; + int ipv6_deny_map_fd; + + BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed; + BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed; + Set *ip_bpf_custom_ingress; + Set *ip_bpf_custom_ingress_installed; + Set *ip_bpf_custom_egress; + Set *ip_bpf_custom_egress_installed; + + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + + /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new + * ones which might have appeared. */ + sd_event_source *rewatch_pids_event_source; + + /* How to start OnFailure units */ + JobMode on_failure_job_mode; + + /* Tweaking the GC logic */ + CollectMode collect_mode; + + /* The current invocation ID */ + sd_id128_t invocation_id; + char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */ + + /* Garbage collect us when nobody wants or requires us anymore */ + bool stop_when_unneeded; + + /* Create default dependencies */ + bool default_dependencies; + + /* Refuse manual starting, allow starting only indirectly via dependency. 
*/ + bool refuse_manual_start; + + /* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */ + bool refuse_manual_stop; + + /* Allow isolation requests */ + bool allow_isolate; + + /* Ignore this unit when isolating */ + bool ignore_on_isolate; + + /* Did the last condition check succeed? */ + bool condition_result; + bool assert_result; + + /* Is this a transient unit? */ + bool transient; + + /* Is this a unit that is always running and cannot be stopped? */ + bool perpetual; + + /* Booleans indicating membership of this unit in the various queues */ + bool in_load_queue:1; + bool in_dbus_queue:1; + bool in_cleanup_queue:1; + bool in_gc_queue:1; + bool in_cgroup_realize_queue:1; + bool in_cgroup_empty_queue:1; + bool in_cgroup_oom_queue:1; + bool in_target_deps_queue:1; + bool in_stop_when_unneeded_queue:1; + + bool sent_dbus_new_signal:1; + + bool in_audit:1; + bool on_console:1; + + bool cgroup_realized:1; + bool cgroup_members_mask_valid:1; + + /* Reset cgroup accounting next time we fork something off */ + bool reset_accounting:1; + + bool start_limit_hit:1; + + /* Did we already invoke unit_coldplug() for this unit? */ + bool coldplugged:1; + + /* For transient units: whether to add a bus track reference after creating the unit */ + bool bus_track_add:1; + + /* Remember which unit state files we created */ + bool exported_invocation_id:1; + bool exported_log_level_max:1; + bool exported_log_extra_fields:1; + bool exported_log_ratelimit_interval:1; + bool exported_log_ratelimit_burst:1; + + /* Whether we warned about clamping the CPU quota period */ + bool warned_clamping_cpu_quota_period:1; + + /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If + * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. 
*/ + signed int last_section_private:2; +} Unit; + +typedef struct UnitStatusMessageFormats { + const char *starting_stopping[2]; + const char *finished_start_job[_JOB_RESULT_MAX]; + const char *finished_stop_job[_JOB_RESULT_MAX]; + /* If this entry is present, it'll be called to provide a context-dependent format string, + * or NULL to fall back to finished_{start,stop}_job; if those are NULL too, fall back to generic. */ + const char *(*finished_job)(Unit *u, JobType t, JobResult result); +} UnitStatusMessageFormats; + +/* Flags used when writing drop-in files or transient unit files */ +typedef enum UnitWriteFlags { + /* Write a runtime unit file or drop-in (i.e. one below /run) */ + UNIT_RUNTIME = 1 << 0, + + /* Write a persistent drop-in (i.e. one below /etc) */ + UNIT_PERSISTENT = 1 << 1, + + /* Place this item in the per-unit-type private section, instead of [Unit] */ + UNIT_PRIVATE = 1 << 2, + + /* Apply specifier escaping before writing */ + UNIT_ESCAPE_SPECIFIERS = 1 << 3, + + /* Apply C escaping before writing */ + UNIT_ESCAPE_C = 1 << 4, +} UnitWriteFlags; + +/* Returns true if neither persistent, nor runtime storage is requested, i.e. 
this is a check invocation only */ +static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) { + return (flags & (UNIT_RUNTIME|UNIT_PERSISTENT)) == 0; +} + +#include "kill.h" + +typedef struct UnitVTable { + /* How much memory does an object of this unit type need */ + size_t object_size; + + /* If greater than 0, the offset into the object where + * ExecContext is found, if the unit type has that */ + size_t exec_context_offset; + + /* If greater than 0, the offset into the object where + * CGroupContext is found, if the unit type has that */ + size_t cgroup_context_offset; + + /* If greater than 0, the offset into the object where + * KillContext is found, if the unit type has that */ + size_t kill_context_offset; + + /* If greater than 0, the offset into the object where the + * pointer to ExecRuntime is found, if the unit type has + * that */ + size_t exec_runtime_offset; + + /* If greater than 0, the offset into the object where the pointer to DynamicCreds is found, if the unit type + * has that. */ + size_t dynamic_creds_offset; + + /* The name of the configuration file section with the private settings of this unit */ + const char *private_section; + + /* Config file sections this unit type understands, separated + * by NUL chars */ + const char *sections; + + /* This should reset all type-specific variables. This should + * not allocate memory, and is called with zero-initialized + * data. It should hence only initialize variables that need + * to be set != 0. */ + void (*init)(Unit *u); + + /* This should free all type-specific variables. It should be + * idempotent. */ + void (*done)(Unit *u); + + /* Actually load data from disk. This may fail, and should set + * load_state to UNIT_LOADED, UNIT_MERGED or leave it at + * UNIT_STUB if no configuration could be found. */ + int (*load)(Unit *u); + + /* During deserialization we only record the intended state to return to. With coldplug() we actually put the + * deserialized state in effect. 
This is where unit_notify() should be called to start things up. Note that + * this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the + * reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was + * in effect before the reload, i.e. units should not catch up with changes that happened during the reload. That's + * what catchup() below is for. */ + int (*coldplug)(Unit *u); + + /* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the + * reloading state. It's supposed to catch up with state changes due to external events we missed so far (for + * example because they took place while we were reloading/reexecing) */ + void (*catchup)(Unit *u); + + void (*dump)(Unit *u, FILE *f, const char *prefix); + + int (*start)(Unit *u); + int (*stop)(Unit *u); + int (*reload)(Unit *u); + + int (*kill)(Unit *u, KillWho w, int signo, sd_bus_error *error); + + /* Clear out the various runtime/state/cache/logs/configuration data */ + int (*clean)(Unit *u, ExecCleanMask m); + + /* Freeze the unit */ + int (*freeze)(Unit *u); + int (*thaw)(Unit *u); + bool (*can_freeze)(Unit *u); + + /* Return which kind of data can be cleaned */ + int (*can_clean)(Unit *u, ExecCleanMask *ret); + + bool (*can_reload)(Unit *u); + + /* Write all data that cannot be restored from other sources + * away using unit_serialize_item() */ + int (*serialize)(Unit *u, FILE *f, FDSet *fds); + + /* Restore one item from the serialization */ + int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds); + + /* Try to match up fds with what we need for this unit */ + void (*distribute_fds)(Unit *u, FDSet *fds); + + /* Boils down the more complex internal state of this unit to + * a simpler one that the engine can understand */ + UnitActiveState (*active_state)(Unit *u); + + /* Returns the substate specific to this unit type as + * string. 
This is purely information so that we can give the + * user a more fine grained explanation in which actual state a + * unit is in. */ + const char* (*sub_state_to_string)(Unit *u); + + /* Additionally to UnitActiveState determine whether unit is to be restarted. */ + bool (*will_restart)(Unit *u); + + /* Return false when there is a reason to prevent this unit from being gc'ed + * even though nothing references it and it isn't active in any way. */ + bool (*may_gc)(Unit *u); + + /* Return true when the unit is not controlled by the manager (e.g. extrinsic mounts). */ + bool (*is_extrinsic)(Unit *u); + + /* When the unit is not running and no job for it queued we shall release its runtime resources */ + void (*release_resources)(Unit *u); + + /* Invoked on every child that died */ + void (*sigchld_event)(Unit *u, pid_t pid, int code, int status); + + /* Reset failed state if we are in failed state */ + void (*reset_failed)(Unit *u); + + /* Called whenever any of the cgroups this unit watches for ran empty */ + void (*notify_cgroup_empty)(Unit *u); + + /* Called whenever an OOM kill event on this unit was seen */ + void (*notify_cgroup_oom)(Unit *u); + + /* Called whenever a process of this unit sends us a message */ + void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds); + + /* Called whenever a name this Unit registered for comes or goes away. 
*/ + void (*bus_name_owner_change)(Unit *u, const char *new_owner); + + /* Called for each property that is being set */ + int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + + /* Called after at least one property got changed to apply the necessary change */ + int (*bus_commit_properties)(Unit *u); + + /* Return the unit this unit is following */ + Unit *(*following)(Unit *u); + + /* Return the set of units that are following each other */ + int (*following_set)(Unit *u, Set **s); + + /* Invoked each time a unit this unit is triggering changes + * state or gains/loses a job */ + void (*trigger_notify)(Unit *u, Unit *trigger); + + /* Called whenever CLOCK_REALTIME made a jump */ + void (*time_change)(Unit *u); + + /* Called whenever /etc/localtime was modified */ + void (*timezone_change)(Unit *u); + + /* Returns the next timeout of a unit */ + int (*get_timeout)(Unit *u, usec_t *timeout); + + /* Returns the main PID if there is any defined, or 0. */ + pid_t (*main_pid)(Unit *u); + + /* Returns the control PID if there is any defined, or 0. */ + pid_t (*control_pid)(Unit *u); + + /* Returns true if the unit currently needs access to the console */ + bool (*needs_console)(Unit *u); + + /* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the + * exit code of the "main" process of the service or similar. */ + int (*exit_status)(Unit *u); + + /* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that + * unconditionally exist and are always active. The main reason to keep both enumeration functions separate is + * philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those + * discovered through regular enumeration should be put in place by catchup(), see below. 
*/ + void (*enumerate_perpetual)(Manager *m); + + /* This is called for each unit type and should be used to enumerate units already existing in the system + * internally and load them. However, everything that is loaded here should still stay in inactive state. It is + * the job of the catchup() call above to put the units into the discovered state. */ + void (*enumerate)(Manager *m); + + /* Type specific cleanups. */ + void (*shutdown)(Manager *m); + + /* If this function is set and returns false, all jobs for units + * of this type will immediately fail. */ + bool (*supported)(void); + + /* The strings to print in status messages */ + UnitStatusMessageFormats status_message_formats; + + /* True if transient units of this type are OK */ + bool can_transient:1; + + /* True if cgroup delegation is permissible */ + bool can_delegate:1; + + /* True if the unit type triggers other units, i.e. can have a UNIT_TRIGGERS dependency */ + bool can_trigger:1; + + /* True if the unit type knows a failure state, and thus can be source of an OnFailure= dependency */ + bool can_fail:1; + + /* True if units of this type shall be startable only once and then never again */ + bool once_only:1; + + /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */ + bool gc_jobs:1; + + /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroup(s) */ + bool can_set_managed_oom:1; +} UnitVTable; + +extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX]; + +static inline const UnitVTable* UNIT_VTABLE(Unit *u) { + return unit_vtable[u->type]; +} + +/* For casting a unit into the various unit types */ +#define DEFINE_CAST(UPPERCASE, MixedCase) \ + static inline MixedCase* UPPERCASE(Unit *u) { \ + if (_unlikely_(!u || u->type != UNIT_##UPPERCASE)) \ + return NULL; \ + \ + return (MixedCase*) u; \ + } + +/* For casting the various unit types into a unit */ +#define UNIT(u) \ + ({ \ + typeof(u) _u_ = (u); \ + Unit *_w_ = _u_ ? 
&(_u_)->meta : NULL; \ + _w_; \ + }) + +#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset > 0) +#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset > 0) +#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset > 0) + +static inline Unit* UNIT_TRIGGER(Unit *u) { + return hashmap_first_key(u->dependencies[UNIT_TRIGGERS]); +} + +Unit *unit_new(Manager *m, size_t size); +void unit_free(Unit *u); +DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free); + +int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret); +int unit_add_name(Unit *u, const char *name); + +int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask); +int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask); + +int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask); +int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask); + +int unit_add_exec_dependencies(Unit *u, ExecContext *c); + +int unit_choose_id(Unit *u, const char *name); +int unit_set_description(Unit *u, const char *description); + +bool unit_may_gc(Unit *u); + +static inline bool unit_is_extrinsic(Unit *u) { + return u->perpetual || + (UNIT_VTABLE(u)->is_extrinsic && UNIT_VTABLE(u)->is_extrinsic(u)); +} + +void unit_add_to_load_queue(Unit *u); +void unit_add_to_dbus_queue(Unit *u); +void unit_add_to_cleanup_queue(Unit *u); +void unit_add_to_gc_queue(Unit *u); +void unit_add_to_target_deps_queue(Unit *u); +void unit_submit_to_stop_when_unneeded_queue(Unit *u); + +int unit_merge(Unit *u, Unit *other); +int unit_merge_by_name(Unit *u, const char *other); + +Unit *unit_follow_merge(Unit *u) _pure_; + +int unit_load_fragment_and_dropin(Unit *u, bool fragment_required); +int unit_load(Unit *unit); + 
+int unit_set_slice(Unit *u, Unit *slice); +int unit_set_default_slice(Unit *u); + +const char *unit_description(Unit *u) _pure_; +const char *unit_status_string(Unit *u) _pure_; + +bool unit_has_name(const Unit *u, const char *name); + +UnitActiveState unit_active_state(Unit *u); +FreezerState unit_freezer_state(Unit *u); +int unit_freezer_state_kernel(Unit *u, FreezerState *ret); + +const char* unit_sub_state_to_string(Unit *u); + +void unit_dump(Unit *u, FILE *f, const char *prefix); + +bool unit_can_reload(Unit *u) _pure_; +bool unit_can_start(Unit *u) _pure_; +bool unit_can_stop(Unit *u) _pure_; +bool unit_can_isolate(Unit *u) _pure_; + +int unit_start(Unit *u); +int unit_stop(Unit *u); +int unit_reload(Unit *u); + +int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error); +int unit_kill_common(Unit *u, KillWho who, int signo, pid_t main_pid, pid_t control_pid, sd_bus_error *error); + +typedef enum UnitNotifyFlags { + UNIT_NOTIFY_RELOAD_FAILURE = 1 << 0, + UNIT_NOTIFY_WILL_AUTO_RESTART = 1 << 1, + UNIT_NOTIFY_SKIP_CONDITION = 1 << 2, +} UnitNotifyFlags; + +void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags); + +int unit_watch_pid(Unit *u, pid_t pid, bool exclusive); +void unit_unwatch_pid(Unit *u, pid_t pid); +void unit_unwatch_all_pids(Unit *u); + +int unit_enqueue_rewatch_pids(Unit *u); +void unit_dequeue_rewatch_pids(Unit *u); + +int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name); +int unit_watch_bus_name(Unit *u, const char *name); +void unit_unwatch_bus_name(Unit *u, const char *name); + +bool unit_job_is_applicable(Unit *u, JobType j); + +int set_unit_path(const char *p); + +char *unit_dbus_path(Unit *u); +char *unit_dbus_path_invocation_id(Unit *u); + +int unit_load_related_unit(Unit *u, const char *type, Unit **_found); + +bool unit_can_serialize(Unit *u) _pure_; + +int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs); +int unit_deserialize(Unit *u, FILE *f, FDSet *fds); 
+int unit_deserialize_skip(FILE *f); + +int unit_add_node_dependency(Unit *u, const char *what, UnitDependency d, UnitDependencyMask mask); +int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask); + +int unit_coldplug(Unit *u); +void unit_catchup(Unit *u); + +void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *unit_status_msg_format) _printf_(4, 0); + +bool unit_need_daemon_reload(Unit *u); + +void unit_reset_failed(Unit *u); + +Unit *unit_following(Unit *u); +int unit_following_set(Unit *u, Set **s); + +const char *unit_slice_name(Unit *u); + +bool unit_stop_pending(Unit *u) _pure_; +bool unit_inactive_or_pending(Unit *u) _pure_; +bool unit_active_or_pending(Unit *u); +bool unit_will_restart_default(Unit *u); +bool unit_will_restart(Unit *u); + +int unit_add_default_target_dependency(Unit *u, Unit *target); + +void unit_start_on_failure(Unit *u); +void unit_trigger_notify(Unit *u); + +UnitFileState unit_get_unit_file_state(Unit *u); +int unit_get_unit_file_preset(Unit *u); + +Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target); +void unit_ref_unset(UnitRef *ref); + +#define UNIT_DEREF(ref) ((ref).target) +#define UNIT_ISSET(ref) (!!(ref).target) + +int unit_patch_contexts(Unit *u); + +ExecContext *unit_get_exec_context(Unit *u) _pure_; +KillContext *unit_get_kill_context(Unit *u) _pure_; +CGroupContext *unit_get_cgroup_context(Unit *u) _pure_; + +ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_; + +int unit_setup_exec_runtime(Unit *u); +int unit_setup_dynamic_creds(Unit *u); + +char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf); +char* unit_concat_strv(char **l, UnitWriteFlags flags); + +int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data); +int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) 
_printf_(4,5); + +int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien); + +int unit_make_transient(Unit *u); + +int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask); + +bool unit_type_supported(UnitType t); + +bool unit_is_pristine(Unit *u); + +bool unit_is_unneeded(Unit *u); + +pid_t unit_control_pid(Unit *u); +pid_t unit_main_pid(Unit *u); + +void unit_warn_if_dir_nonempty(Unit *u, const char* where); +int unit_fail_if_noncanonical(Unit *u, const char* where); + +int unit_test_start_limit(Unit *u); + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid); +void unit_unref_uid_gid(Unit *u, bool destroy_now); + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid); + +int unit_acquire_invocation_id(Unit *u); + +bool unit_shall_confirm_spawn(Unit *u); + +int unit_set_exec_params(Unit *s, ExecParameters *p); + +int unit_fork_helper_process(Unit *u, const char *name, pid_t *ret); +int unit_fork_and_watch_rm_rf(Unit *u, char **paths, pid_t *ret_pid); + +void unit_remove_dependencies(Unit *u, UnitDependencyMask mask); + +void unit_export_state_files(Unit *u); +void unit_unlink_state_files(Unit *u); + +int unit_prepare_exec(Unit *u); + +int unit_log_leftover_process_start(pid_t pid, int sig, void *userdata); +int unit_log_leftover_process_stop(pid_t pid, int sig, void *userdata); +int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func); + +bool unit_needs_console(Unit *u); + +const char *unit_label_path(const Unit *u); + +int unit_pid_attachable(Unit *unit, pid_t pid, sd_bus_error *error); + +static inline bool unit_has_job_type(Unit *u, JobType type) { + return u && u->job && u->job->type == type; +} + +/* unit_log_skip is for cases like ExecCondition= where a unit is considered "done" + * after some execution, rather than succeeded or failed. 
*/ +void unit_log_skip(Unit *u, const char *result); +void unit_log_success(Unit *u); +void unit_log_failure(Unit *u, const char *result); +static inline void unit_log_result(Unit *u, bool success, const char *result) { + if (success) + unit_log_success(u); + else + unit_log_failure(u, result); +} + +void unit_log_process_exit(Unit *u, const char *kind, const char *command, bool success, int code, int status); + +int unit_exit_status(Unit *u); +int unit_success_action_exit_status(Unit *u); +int unit_failure_action_exit_status(Unit *u); + +int unit_test_trigger_loaded(Unit *u); + +void unit_destroy_runtime_data(Unit *u, const ExecContext *context); +int unit_clean(Unit *u, ExecCleanMask mask); +int unit_can_clean(Unit *u, ExecCleanMask *ret_mask); + +bool unit_can_freeze(Unit *u); +int unit_freeze(Unit *u); +void unit_frozen(Unit *u); + +int unit_thaw(Unit *u); +void unit_thawed(Unit *u); + +int unit_freeze_vtable_common(Unit *u); +int unit_thaw_vtable_common(Unit *u); + +/* Macros which append UNIT= or USER_UNIT= to the message */ + +#define log_unit_full_errno(unit, level, error, ...) \ + ({ \ + const Unit *_u = (unit); \ + (log_get_max_level() < LOG_PRI(level)) ? -ERRNO_VALUE(error) : \ + _u ? log_object_internal(level, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \ + log_internal(level, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \ + }) + +#define log_unit_full(unit, level, ...) (void) log_unit_full_errno(unit, level, 0, __VA_ARGS__) + +#define log_unit_debug(unit, ...) log_unit_full_errno(unit, LOG_DEBUG, 0, __VA_ARGS__) +#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, __VA_ARGS__) +#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, __VA_ARGS__) +#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, __VA_ARGS__) +#define log_unit_error(unit, ...) 
log_unit_full(unit, LOG_ERR, __VA_ARGS__) + +#define log_unit_debug_errno(unit, error, ...) log_unit_full_errno(unit, LOG_DEBUG, error, __VA_ARGS__) +#define log_unit_info_errno(unit, error, ...) log_unit_full_errno(unit, LOG_INFO, error, __VA_ARGS__) +#define log_unit_notice_errno(unit, error, ...) log_unit_full_errno(unit, LOG_NOTICE, error, __VA_ARGS__) +#define log_unit_warning_errno(unit, error, ...) log_unit_full_errno(unit, LOG_WARNING, error, __VA_ARGS__) +#define log_unit_error_errno(unit, error, ...) log_unit_full_errno(unit, LOG_ERR, error, __VA_ARGS__) + +#define LOG_UNIT_MESSAGE(unit, fmt, ...) "MESSAGE=%s: " fmt, (unit)->id, ##__VA_ARGS__ +#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id +#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string + +const char* collect_mode_to_string(CollectMode m) _const_; +CollectMode collect_mode_from_string(const char *s) _pure_; diff --git a/src/core/user.conf.in b/src/core/user.conf.in new file mode 100644 index 0000000..bbe0631 --- /dev/null +++ b/src/core/user.conf.in @@ -0,0 +1,47 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# You can override the directives in this file by creating files in +# /etc/systemd/user.conf.d/*.conf. 
+# +# See systemd-user.conf(5) for details + +[Manager] +#LogLevel=info +#LogTarget=console +#LogColor=yes +#LogLocation=no +#LogTime=no +#SystemCallArchitectures= +#TimerSlackNSec= +#StatusUnitFormat=@STATUS_UNIT_FORMAT_DEFAULT@ +#DefaultTimerAccuracySec=1min +#DefaultStandardOutput=inherit +#DefaultStandardError=inherit +#DefaultTimeoutStartSec=90s +#DefaultTimeoutStopSec=90s +#DefaultTimeoutAbortSec= +#DefaultRestartSec=100ms +#DefaultStartLimitIntervalSec=10s +#DefaultStartLimitBurst=5 +#DefaultEnvironment= +#DefaultLimitCPU= +#DefaultLimitFSIZE= +#DefaultLimitDATA= +#DefaultLimitSTACK= +#DefaultLimitCORE= +#DefaultLimitRSS= +#DefaultLimitNOFILE= +#DefaultLimitAS= +#DefaultLimitNPROC= +#DefaultLimitMEMLOCK= +#DefaultLimitLOCKS= +#DefaultLimitSIGPENDING= +#DefaultLimitMSGQUEUE= +#DefaultLimitNICE= +#DefaultLimitRTPRIO= +#DefaultLimitRTTIME= |